懵懂的菜鸟

Stay hungry,Stay foolish.

导航

b4和tncl_extract_UNCL_new

  1 # -*- coding:utf-8 -*-
  2 import re
  3 
  4 
  5 '''
  6 适应新版本
  7 
  8 注意:
  9 1)17A文件改完后缀后,需要转为UTF-8无BOM格式,才能正确处理。
 10 2)fr = open(filename,encoding='utf-8')
 11 
 12 '''
 13 
 14 
 15 year='17A'#用户自定义
 16 ss='./data/'#根目录
 17 filename = ss+'UNCL%s.txt'%year#输入文件名
 18 
 19 
 20 
 21 def tncl_note():
 22 
 23 
 24 
 25     p4= r"^(?:\s{5}|X\s{4}|\W\s{4})(\w+)\s+\w.+\n"
 26     p1 = r"^(?:\s{5}|X\s{4}|\W\s{4})(\d\d\d\d)\s\s[A-Z].+\]$"#匹配tncl_id
 27     p2 = r"^(?:\s{5}|X\s{4}|\W\s{4})(\w+)\s+\w.+\n"#匹配tncl_tag
 28     p3 = r"^(?:\s{5}|X\s{4}|\W\s{4})\w+\s+(\w.+)\n"#匹配tncl_name
 29     p4 = r"^\s{14}([^ ].+)\n"#匹配tncl_desc和#Note内容
 30 
 31     p5 = r"^\s{11}Note:\s\n"#Note
 32  
 33 
 34     pattern1 = re.compile(p1)
 35     pattern2 = re.compile(p2)
 36     pattern3 = re.compile(p3)
 37     pattern4 = re.compile(p4)
 38 
 39     pattern5 = re.compile(p5)
 40 
 41 
 42     fr = open(filename,encoding='utf-8')
 43     temp = str();
 44     flag = 0
 45     w2 = open(ss+'tncl_ori%s.txt'%year,'a')#a代表追加 w代表重写
 46     flag1=0
 47     for line in fr.readlines():
 48         matcher1 = re.findall(pattern1,line)
 49         matcher2 = re.findall(pattern2,line)
 50         matcher3 = re.findall(pattern3,line)
 51         matcher4 = re.findall(pattern4,line)
 52         matcher5 = re.findall(pattern5,line)
 53 
 54         #print matcher
 55 
 56         if matcher1:
 57             for g in matcher1:
 58                 flag = 1
 59                 temp = g
 60 
 61             continue;
 62         if matcher2 and(flag==1 or 4)and(temp!=''):
 63 
 64             flag = 2
 65             w2.write("\"\n"+temp+",")
 66             for j in matcher2:
 67                 for k in j:
 68                     w2.write(k)
 69 
 70         if matcher3 and flag==2:
 71             flag = 3
 72             w2.write(",")
 73             for j in matcher3:
 74                 for k in j:
 75                     w2.write(k)
 76             w2.write(",\"")
 77         if matcher4 and (flag==3 or flag==4):
 78             flag=4
 79             for j in matcher4:
 80                 for k in j:
 81                     w2.write(k)
 82         
 83         if ((matcher5!=[])and(flag == 4)):
 84             # flag = 5
 85             w2.write("\",\"")
 86             # flag1=1
 87   
 88     w2.write("\"")
 89     w2.close( )
 90 def join():
 91 
 92 
 93 
 94     f1= open(ss+'tncl_ori%s.txt'%year)
 95 
 96     list_note=[]
 97     for line1 in f1:
 98         # print(line1)
 99 
100         list_note.append(line1)
101          
102     f1.close()
103     # print(list_note[1].split(','))
104     # print("%s_%s,%s\n"%(list_note[1].split(',')[0],list_note[1].split(',')[1],list_note[1].strip('\n')))
105     # list_note[i].strip('\n')
106     # print(list_note)
107     f2_w1= open(ss+'tred%s.csv'%year,'a')  
108     f2_w2= open(ss+'b4_%s.csv'%year,'a')  
109     # for i in range(len(list_note)):
110     # i=0
111     
112     for i in range(1,len(list_note)):
113 
114         str11="%s_%s,%s\n"%(list_note[i].split(',')[0],list_note[i].split(',')[1],list_note[i].strip('\n'))
115 
116         str12="%s_%s,%s,%s\n"%(list_note[i].split(',')[0],list_note[i].split(',')[1],list_note[i].split(',')[1],year)
117         f2_w1.write(str11)
118         f2_w2.write(str12)
119 
120 
121     f2_w1.close() 
122     f2_w2.close()
123     # f2.close()
124 
if __name__ == '__main__':
    # Order matters: join() reads the file that tncl_note() appends to.
    for step in (tncl_note, join):
        step()

 

posted on 2017-08-25 13:43  懵懂的菜鸟  阅读(233)  评论(0)  编辑  收藏  举报