1 # -*- coding:utf-8 -*-
'''
Extraction starts from 11c.
'''
5 import re
6 import numpy as np
7 import os
8 year = '17A'
9 ss="./data/edmd/"
10 # filename=ss+"/EDMDI1.17A"
def get_tag(base_dir=None, release=None):
    """Return the message type tags listed in the directory index file.

    The index ``EDMDI1.<release>`` is renamed to ``EDMDI1.txt`` (unless a
    previous run already did so) and scanned line by line.  A tag is the
    six-capital-letter word that follows a three-character line prefix:
    three whitespace characters, ``X`` plus two whitespace characters, or
    any non-word character plus two whitespace characters.

    Args:
        base_dir: Directory that holds the index file.  Defaults to the
            module-level ``ss``.
        release: Extension of the index file, e.g. ``'17A'``.  Defaults to
            the module-level ``year``.

    Returns:
        list of str: the tags in file order.  The list is also printed.

    Raises:
        IOError / OSError: if ``EDMDI1.txt`` cannot be opened.
    """
    base_dir = ss if base_dir is None else base_dir
    release = year if release is None else release

    index_source = os.path.join(base_dir, "EDMDI1." + release)
    index_txt = os.path.join(base_dir, "EDMDI1.txt")
    try:
        os.rename(index_source, index_txt)
    except OSError:
        # Typically the file was renamed by an earlier run.  If it is really
        # missing, the open() below reports the problem.
        pass

    tag_line = re.compile(r"^(?:\s{3}|X\s{2}|\W\s{2})([A-Z]{6})\s.+\n")
    list_tag = []
    with open(index_txt) as index_file:
        for line in index_file:
            list_tag.extend(tag_line.findall(line))
    print(list_tag)
    return list_tag
# Segment row without hierarchy markers, e.g.
# "00010   UNH Message header                           M   1     "
# Groups: position, segment tag, status (C/M), repeat count.
_B1_PLAIN_ROW = re.compile(
    r"^(\d{4,5})\s{3}([A-Z]{3}).+([CM])\s{3}(\d*)\s{1,}\|{0,}\n")
# Segment row that ends one or more groups (trailing "+", "+|", "++", ...), e.g.
# "00280   RNG Range details                            C   1---------------+|"
_B1_CLOSING_ROW = re.compile(
    r"^(\d{4,5})\s{3}([A-Z]{3}).+([CM])\s{3}(\d*)\-+\+{1,10}\|{0,20}\n")
# Group header row, e.g.
# "00050       ---- Segment group 1  ------------------ C   9----------------+"
# Groups: position, group number, status (C/M), repeat count.
_B1_GROUP_ROW = re.compile(
    r"^(\d{4,5}).+Segment\sgroup\s(\d*).+([CM])\s{3}(\d*).+\n")

# Hierarchy markers.  A group header ends in one "+" followed by one "|" per
# enclosing group, so the number of pipes is the nesting level (0..6).
_B1_GROUP_OPEN = re.compile(
    r"^\d{4,5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+(\|{0,6})\n")
# A segment row ending in exactly one "+" closes the innermost group.
_B1_SINGLE_CLOSE = re.compile(
    r"^\d{4,5}\s{3}[A-Z]{3}.+[CM]\s{3}\d*\-+\+\|{0,10}\n")
# A row ending in 2..7 "+" closes that many groups at once.
_B1_MULTI_CLOSE = re.compile(
    r"^\d{4,5}.+[CM]\s{3}\d*\-+(\+{2,7})\|{0,20}\n")

# One slot per nesting level.  Only levels 0..6 are ever filled, so a free
# (0) slot always exists and ``index(0)`` cannot fail.
_B1_SLOTS = 10


def _b1_parent(open_groups):
    """Return the ``SG<n>`` label of the innermost open group, ``SG0`` if none."""
    depth = open_groups.index(0)
    if depth == 0:
        return "SG0"
    return "SG" + str(open_groups[depth - 1])


def _b1_track_groups(line, open_groups):
    """Update *open_groups* in place from the hierarchy marker ending *line*."""
    opened = _B1_GROUP_OPEN.match(line)
    if opened:
        level = len(opened.group(2))
        if open_groups[level] == 0:
            open_groups[level] = opened.group(1)
        else:
            # Level already occupied: the slot is cleared rather than
            # replaced (behaviour kept from the previous implementation).
            open_groups[level] = 0
        return

    if _B1_SINGLE_CLOSE.match(line):
        closing = 1
    else:
        multi = _B1_MULTI_CLOSE.match(line)
        if multi is None:
            return
        closing = len(multi.group(1))

    innermost = open_groups.index(0) - 1
    # Never close more groups than are open (no negative-index wraparound).
    for slot in range(max(0, innermost - closing + 1), innermost + 1):
        open_groups[slot] = 0


def _b1_rows(lines, tag, release):
    """Return the CSV rows (without newlines) for one message specification."""
    open_groups = [0] * _B1_SLOTS
    rows = []
    for line in lines:
        for pattern in (_B1_GROUP_ROW, _B1_CLOSING_ROW, _B1_PLAIN_ROW):
            found = pattern.match(line)
            if found is None:
                continue
            position, name, status, repeat = found.groups()
            if pattern is _B1_GROUP_ROW:
                # The whole group number forms one tag ("SG12").
                name = "SG" + name
            # The parent is taken from the state *before* this line's own
            # marker is applied: a group header belongs to the enclosing
            # group, a closing segment to the group it closes.
            rows.append(",".join((position, _b1_parent(open_groups), name,
                                  release, tag, status, repeat)))
        _b1_track_groups(line, open_groups)
    return rows


def trmd_b1_nonote(list_tag, base_dir=None, release=None):
    """Extract the segment table of every message into ``new/<TAG>_w.txt``.

    For each tag the specification ``<TAG>_D.<release>`` is renamed to
    ``<TAG>.txt`` (unless that already happened) and scanned.  Every segment
    row and every "Segment group" row yields one record

        position,parent,tag,release,message,status,repeat

    where ``parent`` is ``SG<n>`` for the innermost group open at that row
    (``SG0`` at top level) and a group row's own tag is ``SG<number>``.
    Each record is written preceded by a newline, so the output starts with
    an empty line and has no trailing newline.

    Args:
        list_tag: Message tags, e.g. the result of ``get_tag()``.
        base_dir: Directory holding the specifications; output goes to its
            ``new`` sub-directory, which is created if needed.  Defaults to
            the module-level ``ss``.
        release: Release written into each record and used as the source
            file extension.  Defaults to the module-level ``year``.

    A tag whose specification cannot be found is reported and skipped; the
    remaining tags are still processed.
    """
    base_dir = ss if base_dir is None else base_dir
    release = year if release is None else release

    out_dir = os.path.join(base_dir, 'new')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for tag in list_tag:
        source = os.path.join(base_dir, '%s_D.%s' % (tag, release))
        filename_r = os.path.join(base_dir, '%s.txt' % tag)
        try:
            os.rename(source, filename_r)
        except OSError:
            # Usually the file was already renamed by an earlier run.
            if not os.path.isfile(filename_r):
                print('trmd_b1_nonote: no specification for %s, skipped' % tag)
                continue

        with open(filename_r) as fr:
            rows = _b1_rows(fr, tag, release)

        filename_w = os.path.join(out_dir, '%s_w.txt' % tag)
        # Mode 'w' replaces any output left by a previous run.
        with open(filename_w, 'w') as w2:
            w2.write(''.join('\n' + row for row in rows))
# Clarification header: a 4-5 digit position, then 3, 6, ... or 30 whitespace
# characters, then the text.  The lazy quantifier tries the shortest gap
# first, like a loop over 3, 6, ..., 30 would.
_B1_NOTE_HEADER = re.compile(r"^(\d{4,5})((?:\s{3}){1,10}?)[^ ].+\n")
# Start of the message-structure section; nothing after it is a note.
_B1_NOTE_STOP = re.compile(
    r"(?:4\.3\s{4}Message\sstructure)|(?:Pos\s+Tag\sName\s+S\s+R)")


def _b1_note_text(lines):
    """Build the note file content from the lines of one specification.

    Every header line starts a new double-quoted record; the text lines
    indented to the column where the header text started are appended to
    the current record, each followed by one space.
    """
    pieces = []
    indent = 5  # used until the first header line is seen
    text_line = re.compile(r"^\s{%d}([^ ].+)\n" % indent)
    for line in lines:
        header = _B1_NOTE_HEADER.match(line)
        if header:
            # Note text is aligned with the header text, i.e. it is indented
            # by the width of the position number plus the gap after it.
            indent = header.end(2)
            text_line = re.compile(r"^\s{%d}([^ ].+)\n" % indent)
            pieces.append('"\n"')  # close the previous record, open the next
        body = text_line.match(line)
        if body:
            pieces.append(body.group(1) + " ")
        # Stop before the segment table so its rows are not taken for headers.
        if _B1_NOTE_STOP.search(line):
            break
    pieces.append('"')
    return "".join(pieces)


def trmd_b1_note(list_tag, base_dir=None):
    """Extract the per-position notes of every message into ``new/<TAG>_wnote.txt``.

    ``<TAG>.txt`` is read up to the message-structure section.  The output
    holds one double-quoted note per line, in the order the headers appear;
    its first line, which would otherwise hold only a quote character, has
    every ``"`` replaced by ``code_pos,note``.

    Args:
        list_tag: Message tags, e.g. the result of ``get_tag()``.
        base_dir: Directory holding the ``<TAG>.txt`` files; output goes to
            its ``new`` sub-directory, which is created if needed.  Defaults
            to the module-level ``ss``.

    A tag whose ``<TAG>.txt`` does not exist is reported and skipped.
    """
    base_dir = ss if base_dir is None else base_dir

    out_dir = os.path.join(base_dir, 'new')
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for tag in list_tag:
        filename_r = os.path.join(base_dir, '%s.txt' % tag)
        if not os.path.isfile(filename_r):
            print('trmd_b1_note: no specification for %s, skipped' % tag)
            continue

        with open(filename_r) as fr:
            content = _b1_note_text(fr)

        # Turn the first line into the column header.
        first_line, newline, rest = content.partition("\n")
        if '"' in first_line:
            first_line = first_line.replace('"', 'code_pos,note')

        filename_w = os.path.join(out_dir, '%s_wnote.txt' % tag)
        # Mode 'w' replaces any output left by a previous run.
        with open(filename_w, 'w') as w2:
            w2.write(first_line + newline + rest)
def join(list_tag, base_dir=None, release=None):
    """Append the combined rows of every message to ``new/b1<release>.csv``.

    For each tag, line *i* of ``new/<TAG>_w.txt`` and line *i* of
    ``new/<TAG>_wnote.txt`` are joined with a comma and written as one CSV
    line.  Pairing stops at the end of the shorter file, so a surplus of
    note lines no longer raises ``IndexError``.

    The CSV is opened in append mode: rows accumulate across tags and across
    calls, and an existing file is never truncated.

    Args:
        list_tag: Message tags, e.g. the result of ``get_tag()``.
        base_dir: Directory whose ``new`` sub-directory holds the
            intermediate files and receives the CSV.  Defaults to the
            module-level ``ss``.
        release: Release used in the CSV file name.  Defaults to the
            module-level ``year``.

    A tag for which either intermediate file is missing is reported and
    skipped.
    """
    base_dir = ss if base_dir is None else base_dir
    release = year if release is None else release

    out_dir = os.path.join(base_dir, 'new')
    csv_path = os.path.join(out_dir, 'b1%s.csv' % release)

    for tag in list_tag:
        rows_path = os.path.join(out_dir, '%s_w.txt' % tag)
        notes_path = os.path.join(out_dir, '%s_wnote.txt' % tag)
        if not (os.path.isfile(rows_path) and os.path.isfile(notes_path)):
            print('join: intermediate files for %s not found, skipped' % tag)
            continue

        with open(rows_path) as f1:
            rows = [line.strip('\n') for line in f1]
        with open(notes_path) as f2:
            notes = [line.strip('\n') for line in f2]

        with open(csv_path, 'a') as f2_w:
            for row, note in zip(rows, notes):
                f2_w.write("%s,%s\n" % (row, note))
420
421
422
if __name__ == '__main__':
    # Read the message tags from the index once, then run the three
    # processing stages over that same tag list, in order.
    list_tag = get_tag()
    for stage in (trmd_b1_nonote, trmd_b1_note, join):
        stage(list_tag)
428
"""
Special cases



"""