懵懂的菜鸟

Stay hungry, stay foolish.

导航

trcd_extract_EDCD_new

  1 # -*- coding:utf-8 -*-
  2 import re
  3 
  4 
  5 '''
  6 适应新版本
  7 '''
  8 
  9 
 10 year='17A'#用户自定义
 11 ss='./data/'#根目录
 12 filename = ss+'EDCD%s.txt'%year#输入文件名
 13 
 14 
 15 
 16 
 17 def trcd_nonote():
 18 
 19 
 20     p1 = r"^\s{4}(?:X|\W)\s{2}(C\d\d\d)\s.+\n"
 21     p2 = r"^\s{4}(?:X|\W)\s{2}C\d\d\d\s(.+)\n"
 22     p3 = r"^\s{7}Desc:\s(.+\.)\n"
 23     p4 = r"^\s{7}Desc:\s(.+[^\.])\n"
 24     p5 = r"^\s{13}(.+[^\.])\n"
 25     p6 = r"^\s{13}(.+\.)\n"
 26     pattern1 = re.compile(p1)
 27     pattern2 = re.compile(p2)
 28     pattern3 = re.compile(p3)
 29     pattern4 = re.compile(p4)
 30     pattern5 = re.compile(p5)
 31     pattern6 = re.compile(p6)
 32     fr = open(filename)
 33     temp = ();
 34     flag = 0
 35     for line in fr.readlines():
 36         matcher1 = re.findall(pattern1,line)
 37         matcher2 = re.findall(pattern2,line)
 38         matcher3 = re.findall(pattern3,line)
 39         matcher4 = re.findall(pattern4,line)
 40         matcher5 = re.findall(pattern5,line)
 41         matcher6 = re.findall(pattern6,line)
 42         #print matcher
 43         w2 = open(ss+'trcd_nonote%s.txt'%year,'a')#a代表追加 w代表重写
 44         if matcher1:
 45             flag = 1
 46             w2.write("\n")
 47             for j in matcher1:
 48                 for k in j:
 49                     w2.write(k)
 50                 #for k in g:
 51                     #w2.write(k)
 52             #continue;
 53         if ((matcher2!=[])and(flag ==1)):
 54             flag = 2
 55             #print type(tup1)
 56             #print tup1
 57             #flag = 2
 58             w2.write(",")
 59             for j in matcher2:
 60                 for k in j:
 61                     w2.write(k)
 62         if ((matcher3!=[])and(flag ==2)):
 63             flag = 3
 64             w2.write(",\"")
 65             for j in matcher3:
 66                 for k in j:
 67                     w2.write(k)
 68             w2.write("\"")
 69         if (matcher4!=[]):
 70             w2.write(",\"")
 71             for j in matcher4:
 72                 for k in j:
 73                     w2.write(k)
 74             flag = 4
 75         if ((matcher5!=[])and(flag ==4)):
 76             flag = 5
 77             w2.write(" ")
 78             for j in matcher5:
 79                 for k in j:
 80                     w2.write(k)
 81         if ((matcher6!=[])and(flag ==4 or 5)):
 82             flag = 6
 83             w2.write(" ")
 84             for j in matcher6:
 85                 for k in j:
 86                     w2.write(k)
 87             w2.write("\"")
 88     w2.close( )
 89 
 90 def trcd_note():
 91 
 92     p1 = r"^(?:\s{7}|X\s{6}|\W\s{6})([A-Z][0-9]{3})\s[A-Z].+$"#匹配1001
 93     p2 = r"^\s{7}Note:\s\n"#Note
 94     p3= r"^\s{13}([^ ].+)\n"#Note内容
 95     p4= r"^(?:-|컴)+\n"
 96     pattern1 = re.compile(p1)
 97     pattern2 = re.compile(p2)
 98     pattern3 = re.compile(p3)
 99     pattern4 = re.compile(p4)
100 
101 
102     fr = open(filename)
103     w2 = open(ss+'trcd_note%s.txt'%year,'a')#a代表追加 w代表重写
104     # temp = ();
105     flag = 0
106     flag1=0
107     for line in fr.readlines():
108         matcher1 = re.findall(pattern1,line)
109         matcher2 = re.findall(pattern2,line)
110         matcher3 = re.findall(pattern3,line)
111         matcher4 = re.findall(pattern4,line)
112 
113        
114         #print matcher
115 
116         if matcher1!=[]:
117             flag = 1
118             w2.write("\n")
119             # for j in matcher1:
120                 
121             #     w2.write(j)
122 
123         if ((matcher2!=[])and(flag == 1)):
124             flag = 2
125             flag1=1
126             # w2.write(",")
127         if flag1==1:
128             if ((matcher3!=[])and(flag ==2 or 3)):
129                 flag = 3
130                 w2.write(" ")
131                 for j in matcher3:
132                     
133                     w2.write(j)
134             # w2.write(")
135             if ((matcher4!=[])and(flag == 3)):
136                 flag=0
137                 flag1=0
138     w2.write("\n")
139     w2.close( )
140     fr.close()
141 
def join(note_path=None, nonote_path=None, out_path=None):
    """Merge the record file and the note file line by line into a CSV.

    Line *i* of the record file is combined with line *i* of the note
    file as ``<record>,"<note>"`` and appended to the output file.
    Whitespace-only note lines count as an empty note.  If the note file
    has fewer lines than the record file, the missing notes are treated
    as empty instead of raising ``IndexError``; surplus note lines are
    ignored.

    Args:
        note_path: File with one note per line.  Defaults to
            ``ss + 'trcd_note%s.txt' % year``.
        nonote_path: File with one record per line.  Defaults to
            ``ss + 'trcd_nonote%s.txt' % year``.
        out_path: CSV file to append to.  Defaults to
            ``ss + 'trcd%s.csv' % year``.
    """
    if note_path is None:
        note_path = ss + 'trcd_note%s.txt' % year
    if nonote_path is None:
        nonote_path = ss + 'trcd_nonote%s.txt' % year
    if out_path is None:
        out_path = ss + 'trcd%s.csv' % year

    with open(note_path) as note_file:
        notes = ['' if row.isspace() else row for row in note_file]

    # The record file is opened before the output so that a missing input
    # does not leave an empty CSV behind.
    with open(nonote_path) as records, open(out_path, 'a') as out:
        for index, record in enumerate(records):
            note = notes[index] if index < len(notes) else ''
            out.write("%s,\"%s\"\n" % (record.strip('\n'), note.strip('\n')))
if __name__ == '__main__':
    # Run the pipeline in order: records first, then notes, then the merge
    # that reads both generated files.
    for step in (trcd_nonote, trcd_note, join):
        step()

 

posted on 2017-08-25 13:45  懵懂的菜鸟  阅读(195)  评论(0)  编辑  收藏  举报