懵懂的菜鸟

Stay hungry, Stay foolish.

导航

trsd_extract_EDSD_new

  1 # -*- coding:utf-8 -*-
  2 import re
  3 
  4 
  5 '''
  6 适应新版本
  7 '''
  8 
  9 
 10 year='17A'#用户自定义
 11 ss='./data/'#根目录
 12 filename = ss+'EDSD%s.txt'%year#输入文件名
 13 
 14 
 15 
 16 
 17 def trsd_nonote():
 18 
 19 
 20     p1 = r"^\s{4}(?:X|\W)\s{2}([A-Z]{3})\s\s.+\n"#TCC
 21     p2 = r"\s{4}(?:X|\W)\s{2}[A-Z]{3}\s\s(.+)\n"
 22     """
 23            Function: To specify information regarding the transport
 24                      such as mode of transport, means of transport,
 25                      its conveyance reference number and the
 26                      identification of the means of transport.
 27     """
 28     p3 = r"^\s{7}Function:\s(.+\w\w\.)\n"
 29     p4 = r"^\s{7}Function:\s(.+\.g\.|.+[^\.])\n"
 30     # p4 = r"^\s{7}Function:\s(.+[\.g\.|[^\.]])\n"
 31     p5 = r"^\s{17}(\w.+[^\.])\n"
 32     p6 = r"^\s{17}(.+\.)\n"
 33 
 34     #Note
 35     # p7 = r"^\s{7}Note:\s\n"#Note
 36     # p8= r"^\s{12}([A-Z].+\.)\n"#Note内容只有1行
 37     # p9 = r"^\s{12}(.+[^\.]|)\n"#Note内容只多行的非最后行
 38     # p10 = r"^\s{12}(.+\.)\n"#Note内容只多行的最后行
 39 
 40     pattern1 = re.compile(p1)
 41     pattern2 = re.compile(p2)
 42     pattern3 = re.compile(p3)
 43     pattern4 = re.compile(p4)
 44     pattern5 = re.compile(p5)
 45     pattern6 = re.compile(p6)
 46     fr = open(filename)
 47     # temp = "";
 48     flag = 0
 49     for line in fr.readlines():
 50         matcher1 = re.findall(pattern1,line)
 51         matcher2 = re.findall(pattern2,line)
 52         matcher3 = re.findall(pattern3,line)
 53         matcher4 = re.findall(pattern4,line)
 54         matcher5 = re.findall(pattern5,line)
 55         matcher6 = re.findall(pattern6,line)
 56         #print matcher
 57         w2 = open(ss+'trsd_nonote%s.txt'%year,'a')#a代表追加 w代表重写
 58         if matcher1:
 59             flag = 1
 60             w2.write("\n")
 61             for j in matcher1:
 62                 # for k in j:
 63                     w2.write(j)
 64         if ((matcher2!=[])and(flag ==1)):
 65             flag = 2
 66             w2.write(",")
 67             for j in matcher2:
 68                 # for k in j:
 69                     w2.write(j)
 70         if ((matcher3!=[])and(flag ==2)):
 71             flag = 3
 72             #防止有逗号,用双引号括起
 73             w2.write(",\"")
 74             for j in matcher3:
 75                 # for k in j:
 76                     w2.write(j)
 77             w2.write("\"")
 78         if ((matcher4!=[])and(flag ==2)):
 79             flag = 4
 80             w2.write(",\"")
 81             for j in matcher4:
 82                 # for k in j:
 83                     w2.write(j)
 84         if ((matcher5!=[])and(flag ==4 or 5)):
 85             flag = 5
 86             w2.write(" ")
 87             for j in matcher5:
 88                 # for k in j:
 89                     w2.write(j)
 90             # w2.write("\"")
 91         if ((matcher6!=[])and(flag ==4 or flag==5)):
 92             flag = 6
 93             w2.write(" ")
 94             for j in matcher6:
 95                 # for k in j:
 96                     w2.write(j)
 97             w2.write("\"")
 98     w2.close( )
 99 
def trsd_note(in_path=None, out_path=None):
    """Collect the "Note:" text of every entry, one output line per entry.

    A newline is written for each header line (three-letter uppercase tag
    followed by a name), and one more at the end of the input.  After a
    ``Note:`` line, every line indented by 12 whitespace characters is
    appended to the current output line, each preceded by a single space.
    Collection stops at a separator line (a run of ``-`` or ``컴``
    characters) once at least one content line has been written.

    Args:
        in_path: Input text file.  Defaults to the module-level ``filename``.
        out_path: Output file, opened in append mode (re-running adds to an
            existing file).  Defaults to ``ss + 'trsd_note%s.txt' % year``.
    """
    if in_path is None:
        in_path = filename
    if out_path is None:
        out_path = ss + 'trsd_note%s.txt' % year

    # Header line of an entry.
    header_re = re.compile(r"^(?:\s{7}|X\s{6}|\W\s{6})([A-Z]{3})\s\s[A-Z].+$")
    # The "Note:" marker line.
    note_re = re.compile(r"^\s{7}Note:\s\n")
    # One line of note content.
    content_re = re.compile(r"^\s{12}([^ ].+)\n")
    # Separator line between entries.
    rule_re = re.compile(r"^(?:-|컴)+\n")

    # States: 0 idle, 1 header seen, 2 "Note:" seen, 3 note content written.
    state = 0
    # True from a "Note:" marker until a separator closes a note with content.
    collecting = False

    with open(in_path) as fr, open(out_path, 'a') as fw:
        for line in fr:
            if header_re.search(line):
                state = 1
                fw.write("\n")

            if state == 1 and note_re.search(line):
                state = 2
                collecting = True

            if collecting:
                found = content_re.search(line)
                # Only take content that follows a "Note:" marker.  (The
                # previous ``flag == 2 or 3`` test was always true, so text
                # under a later header could leak in before its own marker.)
                if found and state in (2, 3):
                    state = 3
                    fw.write(" ")
                    fw.write(found.group(1))

                if state == 3 and rule_re.search(line):
                    state = 0
                    collecting = False

        # Terminate the line of the last entry.
        fw.write("\n")
151 
def join(note_path=None, nonote_path=None, out_path=None):
    """Merge the two intermediate files line by line into the final CSV.

    Line *i* of the rows file is paired with line *i* of the notes file and
    written as ``<row>,"<note>"``.  A notes line that is only whitespace
    becomes an empty note.  If the notes file has fewer lines than the rows
    file, the missing notes are treated as empty instead of raising
    ``IndexError``.

    Args:
        note_path: File with one note per line.  Defaults to
            ``ss + 'trsd_note%s.txt' % year``.
        nonote_path: File with one row per line.  Defaults to
            ``ss + 'trsd_nonote%s.txt' % year``.
        out_path: CSV file, opened in append mode (re-running adds to an
            existing file).  Defaults to ``ss + 'trsd%s.csv' % year``.

    NOTE(review): double quotes inside a note are written as-is, they are
    not escaped.
    """
    if note_path is None:
        note_path = ss + 'trsd_note%s.txt' % year
    if nonote_path is None:
        nonote_path = ss + 'trsd_nonote%s.txt' % year
    if out_path is None:
        out_path = ss + 'trsd%s.csv' % year

    # Load all notes up front so they can be looked up by line index.
    with open(note_path) as f_notes:
        notes = ['' if raw.isspace() else raw.strip('\n') for raw in f_notes]

    with open(nonote_path) as f_rows, open(out_path, 'a') as f_csv:
        for index, row in enumerate(f_rows):
            # Pad with an empty note when the notes file is shorter.
            note = notes[index] if index < len(notes) else ''
            f_csv.write("%s,\"%s\"\n" % (row.strip('\n'), note))
if __name__ == '__main__':
    # Order matters: join() reads the two files written by the first two steps.
    trsd_nonote()
    trsd_note()
    join()

 

posted on 2017-08-25 13:47  懵懂的菜鸟  阅读(314)  评论(0)    收藏  举报