1 # -*- coding:utf-8 -*-
'''
Extraction starts from 11C.
'''
5 import re
6 import numpy as np
7 import os
# Release identifier: used as the extension of the input files and as the
# first column / file name of the merged output written later in the script.
year = '17A'
# Folder that holds the input files (the trailing slash is part of the value).
ss = "./data/edmd/"

# The index file is named EDMDI1.<release>; rename it to a .txt file.
try:
    os.rename(ss + "/EDMDI1.%s" % year, ss + "/EDMDI1.txt")
except OSError:
    # Best effort only: the rename fails when an earlier run has already
    # renamed the file.  If the .txt file is missing too, open() below raises.
    pass

# An index line counts as a message entry when it starts with three blanks,
# or "X" / a non-word character followed by two blanks, and then a
# six-letter upper-case tag; the tag is the captured group.
p1 = re.compile(r"^(?:\s{3}|X\s{2}|\W\s{2})([A-Z]{6})\s.+\n")

# Collect the message tags in file order.
list_tag = []
with open(ss + "/EDMDI1.txt") as f1:
    for line in f1:
        list_tag.extend(p1.findall(line))
print(list_tag)
def _innermost_open_level(levels, fallback):
    """Return the index of the innermost open segment group in *levels*.

    *levels* has one slot per nesting depth: ``0`` while no group is recorded
    at that depth, otherwise the captured group number (a string).  The slot
    just before the first ``0`` is treated as the innermost open level, so the
    result is ``-1`` when slot 0 itself is ``0``.  When the list contains no
    ``0`` at all, *fallback* (the previously known position) is returned.
    """
    if 0 in levels:
        return levels.index(0) - 1
    return fallback


def _parent_column(levels, position):
    """Return the ``SG<n>`` parent label for the level at *position*."""
    group_number = levels[position] if position != -1 else 0
    return "SG" + str(group_number)


def _format_row(line, parent, column_patterns):
    """Build one output row (prefixed with a newline) from *line*.

    Each pattern in *column_patterns* contributes its captured text as one
    comma-separated field; *parent* is inserted as the second field.
    """
    fields = ["".join(pattern.findall(line)) for pattern in column_patterns]
    fields.insert(1, parent)
    return "\n" + ",".join(fields)


# All patterns are loop-invariant, so they are compiled once up front.
#
# Plain segment row, e.g.
#   00010 UNH Message header M 1
# One pattern per captured column: sequence number, tag, description,
# status (C/M) and maximum repetition.
pattern1 = re.compile(r"(^\d{5})\s{3}[A-Z]{3}.+[CM]\s{3}\d*\s{1,}\|{0,}\n")
pattern1_2 = re.compile(r"^\d{5}\s{3}([A-Z]{3}).+[CM]\s{3}\d*\s{1,}\|{0,}\n")
pattern1_3 = re.compile(r"^\d{5}\s{3}[A-Z]{3}(.+)[CM]\s{3}\d*\s{1,}\|{0,}\n")
pattern1_4 = re.compile(r"^\d{5}\s{3}[A-Z]{3}.+([CM])\s{3}\d*\s{1,}\|{0,}\n")
pattern1_5 = re.compile(r"^\d{5}\s{3}[A-Z]{3}.+[CM]\s{3}(\d*)\s{1,}\|{0,}\n")
# Segment group header row, e.g.
#   00050 ---- Segment group 1 ------------------ C 9----------------+
# Captured columns: sequence number, group number, status, repetition.
pattern4_1 = re.compile(r"(^\d{5}).+Segment\sgroup\s\d*.+[CM]\s{3}\d*.+\n")
pattern4_2 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*.+\n")
pattern4_3 = re.compile(r"^\d{5}.+Segment\sgroup\s\d*.+([CM])\s{3}\d*.+\n")
pattern4_4 = re.compile(r"^\d{5}.+Segment\sgroup\s\d*.+[CM]\s{3}(\d*).+\n")
# Segment row that closes one or more groups (no "Segment group" text,
# ends in +, +|, +||, ...), e.g.
#   00280 RNG Range details C 1---------------+|
# Captured columns: sequence number, tag, status, repetition.
# NOTE(review): there is no description capture for these rows, so they
# have one field fewer than plain segment rows.
pattern5_1 = re.compile(r"(^\d{5})\s{3}[A-Z]{3}.+[CM]\s{3}\d*\-+\+{1,10}\|{0,20}\n" )
pattern5_2 = re.compile(r"^\d{5}\s{3}([A-Z]{3}).+[CM]\s{3}\d*\-+\+{1,10}\|{0,20}\n" )
pattern5_3 = re.compile(r"^\d{5}\s{3}[A-Z]{3}.+([CM])\s{3}\d*\-+\+{1,10}\|{0,20}\n" )
pattern5_4 = re.compile(r"^\d{5}\s{3}[A-Z]{3}.+[CM]\s{3}(\d*)\-+\+{1,10}\|{0,20}\n" )

# The remaining patterns only drive the nesting bookkeeping.
# Segment row that closes exactly one group (a single "+").
pattern5 = re.compile(r"^\d{5}\s{3}[A-Z]{3}.+[CM]\s{3}\d*\-+\+\|{0,10}\n" )
# Group header rows; the number of "|" after the "+" selects the level slot.
pattern2_1 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\n" )
pattern2_2 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\n" )
pattern2_3 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\|\n" )
pattern2_4 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\|\|\n" )
pattern2_5 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\|\|\|\n" )
pattern2_6 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\|\|\|\|\n" )
pattern2_7 = re.compile(r"^\d{5}.+Segment\sgroup\s(\d*).+[CM]\s{3}\d*\-+\+\|\|\|\|\|\|\n" )
# Rows on which several groups end at once (++, +++, ... optionally
# followed by "|").
pattern3_1 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{2}\|{0,20}\n")
pattern3_2 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{3}\|{0,20}\n")
pattern3_3 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{4}\|{0,20}\n")
pattern3_4 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{5}\|{0,20}\n")
pattern3_5 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{6}\|{0,20}\n")
pattern3_6 = re.compile(r"^\d{5}.+[CM]\s{3}\d*\-+\+{7}\|{0,20}\n")

_segment_columns = (pattern1, pattern1_2, pattern1_3, pattern1_4, pattern1_5)
_header_columns = (pattern4_1, pattern4_2, pattern4_3, pattern4_4)
_closing_columns = (pattern5_1, pattern5_2, pattern5_3, pattern5_4)
# Index in this tuple == level slot toggled by the header row.
_group_openers = (pattern2_1, pattern2_2, pattern2_3, pattern2_4,
                  pattern2_5, pattern2_6, pattern2_7)
# Position n (1-based) clears the innermost level plus the n levels above it.
_group_closers = (pattern3_1, pattern3_2, pattern3_3,
                  pattern3_4, pattern3_5, pattern3_6)

for tag in list_tag:
    filename_r = ss + '%s.txt' % tag
    try:
        os.rename(ss + '%s_D.%s' % (tag, year), filename_r)
    except OSError:
        # The rename fails on a re-run because the source was already
        # renamed; carry on with the existing .txt file in that case.  As
        # before, stop at the first message for which no input is available.
        if not os.path.exists(filename_r):
            break

    # Start from a clean per-message output file.
    filename_w = ss + '/new/%s_w.txt' % tag
    if os.path.exists(filename_w):
        os.remove(filename_w)

    # Index of the innermost open level in listgr (-1: none open).
    pos = -1
    # One slot per nesting depth: 0, or the number of the group open there.
    listgr = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    with open(filename_r) as fr, open(filename_w, 'a') as w2:
        for line in fr:
            # Group header row: seq, parent, group number, status, repetition.
            if pattern4_1.findall(line):
                # Refresh the position before writing the parent.  Reusing
                # the position left over from an earlier line labelled a
                # header that directly follows a closing row as SG0 instead
                # of its enclosing group.
                pos = _innermost_open_level(listgr, pos)
                w2.write(_format_row(line, _parent_column(listgr, pos),
                                     _header_columns))

            # Group-closing segment row: seq, parent, tag, status, repetition.
            if pattern5_1.findall(line):
                # Same refresh as above, so that the only segment of a
                # one-segment group is attributed to that group.
                pos = _innermost_open_level(listgr, pos)
                w2.write(_format_row(line, _parent_column(listgr, pos),
                                     _closing_columns))

            # --- nesting bookkeeping -------------------------------------
            # A single "+" on a segment row closes the innermost group.
            if pattern5.findall(line):
                pos = _innermost_open_level(listgr, pos)
                listgr[pos] = 0

            # A header row toggles the slot selected by its "|" count:
            # an empty slot receives the group number, a used one is reset.
            for level, opener in enumerate(_group_openers):
                for group_number in opener.findall(line):
                    if listgr[level] == 0:
                        listgr[level] = group_number
                    else:
                        listgr[level] = 0

            # "++", "+++", ... close the innermost group and its ancestors.
            for extra, closer in enumerate(_group_closers, 1):
                if closer.findall(line):
                    pos = _innermost_open_level(listgr, pos)
                    for k in range(pos - extra, pos + 1):
                        listgr[k] = 0
            # --- end of nesting bookkeeping ------------------------------

            # Plain segment row: seq, parent, tag, description, status,
            # repetition.  Written after the bookkeeping, as before.
            if pattern1.findall(line):
                pos = _innermost_open_level(listgr, pos)
                w2.write(_format_row(line, _parent_column(listgr, pos),
                                     _segment_columns))
309
# Merge every per-message "<tag>_w.txt" file into one "<year>.txt" file,
# prefixing each row with the release identifier.
# NOTE(review): the target is opened in append mode, so running the script
# twice appends the same rows again - confirm this is intended.
with open(ss + '/new/%s.txt' % year, 'a') as f2_w:
    for i, tag in enumerate(list_tag):
        with open(ss + '/new/%s_w.txt' % tag) as f2_r:
            for line in f2_r:
                # The per-message files are written with a newline *before*
                # each row, so they start with an empty line and their last
                # row has no trailing newline.  Skip the empty line and
                # terminate every row explicitly; otherwise the last row of
                # one message is glued to the prefix of the next file.
                row = line.rstrip('\n')
                if not row:
                    continue
                f2_w.write(year + ',' + row + '\n')
        print("--%i--is ok" % i)
320
321 # if __name__ == '__main__':
322
323
324 """
325 特殊情况
326
327
328
329 """