import os
import re
import urllib.request

from bs4 import BeautifulSoup as bs

# year = '97A'
# ss = "./data/%s/" % year
'''
Adapted to scrape the message pages for releases 95B-96B.
'''

'''
Working around failed page requests: keep retrying until urlopen succeeds.

    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen("http://baidu.com")
        except Exception:
            pass
'''
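

# A minimal sketch of the retry idea above, factored into a reusable helper.
# The functions below still inline this loop, so the name fetch_html and its
# encoding parameter are illustrative assumptions, not part of the pipeline.
def fetch_html(url, encoding='UTF-8'):
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass  # a real crawler would back off and cap the attempts
    return resp.read().decode(encoding)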

def b0_trmd(year, ss):
    if not os.path.exists(ss):
        os.makedirs(ss)
    p1 = r"^([A-Z]{6})"  # six-letter EDIFACT message codes, e.g. CODECO

    url = "http://www.stylusstudio.com/edifact/D%s/messages.htm" % year
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass  # retry until the request succeeds
    data = resp.read().decode('cp852')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')  # ResultSet of every table on the page
    # Cells of the first table, skipping the header cell; the result is a list.
    segment1 = segment11[0].find_all('td')[1:]
    f2 = open(ss + 'trmd1%s.txt' % year, 'a', encoding='utf-8')
    f3 = open(ss + 'b0%s.txt' % year, 'a', encoding='utf-8')
    f4 = open(ss + 'trmd%s.txt' % year, 'a', encoding='utf-8')
    pattern1 = re.compile(p1)
    tag_list = []
    for item in segment1:
        # .string returns the text when a tag has no child tags (or exactly one
        # child tag); it is None for mixed content, which marks the table's end.
        str1 = item.get_text()
        if item.string is None:
            break
        matcher1 = re.findall(pattern1, str1)
        if matcher1:
            f3.write(matcher1[0] + ',' + year + '\n')
            tag_list.append(matcher1[0])
            f4.write(matcher1[0] + ',')
        else:
            f4.write(str1 + '\n')
        f2.write(str1 + '\n')
    f2.close()
    f3.close()
    f4.close()
    return tag_list
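

# b0_trmd leaves three files per release (shapes inferred from the writes
# above; treat these as descriptive assumptions rather than a spec):
#   b0<year>.txt    one 'CODE,year' row per six-letter message code
#   trmd<year>.txt  message codes followed by their description lines
#   trmd1<year>.txt the raw text of every table cell, one per line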

def test1(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass  # retry until the request succeeds
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    segment11 = soup.find_all('table')
    # The 7th table on the page holds the message tree, one <tr> per segment.
    segment1 = segment11[6].find_all('tr')

    f2 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'a', encoding='cp852')
    for item in segment1:
        '''
        A raw tree row looks like:

        <tr class="FrameTreeFont"><td><span class="FrameDrawFont">│
        <span class="FrameHideFont">─</span>│<span class="FrameHideFont">─</span>├─</span>
        <a class="FrameItemFont" href="DAM_.htm" target="classFrame" title="Damage">DAM</a>
        Damage</td><td align="right"><span class="FrameDetailFont"> ×1
        </span></td><td><span class="FrameDetailFont">(M)</span></td></tr>

        item.get_text() renders it as plain text:

        │─│─├─DAM Damage ×1 (M)

        [text for text in item.stripped_strings] renders it as a list:

        ['│', '─', '│', '─', '├─', 'DAM', 'Damage', '×1', '(M)']
        '''
        str12 = item.get_text()
        f2.write(str12 + '\n')

    f2.close()
def test2(code_tag, year, ss):
    # One pattern per nesting depth: each level of the tree adds two non-word
    # drawing characters (│ ─ ├ └) in front of the segment name, so depth n
    # is anchored by exactly 2n leading non-word characters.
    depth_patterns = [re.compile(r"^\W{%d}(\w.+)\n" % (2 * n))
                      for n in range(1, 9)]
    # 'Segment Group n' opens group n; capture one or two digits.
    pattern9 = re.compile(r"Segment\sGroup\s([0-9]{1,2})")

    f1 = open(ss + 'text1%s%s.txt' % (year, code_tag), 'r', encoding='cp852')
    f2 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    # listp[n] records the parent segment group at depth n (nine slots,
    # because a match at the deepest level writes listp[8]).
    listp = [0] * 9
    for line in f1:
        matcher9 = re.findall(pattern9, line)
        for depth, pattern in enumerate(depth_patterns):
            matcher = re.findall(pattern, line)
            if matcher:
                # Prefix the segment with its parent group, e.g. 'SG3 DAM ...'.
                f2.write('SG' + str(listp[depth]) + ' ' + matcher[0] + '\n')
                # A 'Segment Group n' line becomes the parent one level deeper.
                if matcher9:
                    listp[depth + 1] = matcher9[0]
    f2.close()
    f1.close()
    # Second pass: normalize 'Segment Group n' to 'SGn'.
    f3 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'w', encoding='utf-8')
    f4 = open(ss + 'text2%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    for line1 in f4:
        f3.write(line1.replace("Segment Group ", "SG"))
    f4.close()
    f3.close()
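

# Illustrative check of the depth heuristic in test2; this helper is an
# assumption added for illustration and is never called by the pipeline.
# The sample line comes from the comment in test1 and sits at depth 3, so
# only the \W{6} pattern accepts it.
def _demo_depth():
    sample = '│─│─├─DAM Damage ×1 (M)\n'
    for depth in range(1, 9):
        if re.findall(r"^\W{%d}(\w.+)\n" % (2 * depth), sample):
            print('depth', depth)  # prints: depth 3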

def test3(code_tag, year, ss):
    f5 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f6 = open(ss + 'text3%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # Capture (group, segment, repeat count, M/C flag). The group token is
    # 'SG' plus one or two digits, hence \w{3,4}.
    p10 = r"(^\w{3,4})\s(\w{3}).+×([0-9]{1,5})\s\((\w)\)$"
    pattern10 = re.compile(p10)
    i = 0
    for line2 in f6:
        i = i + 1
        matcher10 = re.findall(pattern10, line2)
        if matcher10:
            f5.write(str(matcher10[0]) + '\n')
    f5.close()
    f6.close()
    return i  # total lines read; test5 uses this as its line budget
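
# Hedged example of what pattern10 captures from one normalized tree line
# (sample input and output reconstructed from the comments above):
#   re.findall(p10, 'SG3 DAM Damage ×1 (M)')  ->  [('SG3', 'DAM', '1', 'M')]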
def test4(code_tag, year, ss):
    url = "http://www.stylusstudio.com/edifact/D%s/%s.htm" % (year, code_tag)
    resp = None
    while resp is None:
        try:
            resp = urllib.request.urlopen(url)
        except Exception:
            pass  # retry until the request succeeds
    data = resp.read().decode('UTF-8')
    soup = bs(data, 'html.parser')
    # The <p> elements hold the prose description of each segment.
    segment11 = soup.find_all('p')
    f2 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    for item in segment11:
        f2.write(item.get_text() + '\n')
    f2.close()

def test5(code_tag, num, year, ss):
    f7 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f8 = open(ss + 'text5%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    # p1 matches the first description line ('A service segment ...'); p2
    # then accepts every later line except known boilerplate notes.
    p1 = r"(^A\sservice\ssegment.+\n)"
    p2 = r"(^(?!Information.+\:|Note|It\sis\srecommended\sthat\swhere|ID\sshould\sbe\sspecified|All\sother\ssegments|A\sgroup\sof\ssegments\sthat\scontains\sa\sline\sitem\sand\sits\srelated\sinformation.+should\sbe\sconsigned.).+\n)"
    pattern1 = re.compile(p1)
    pattern2 = re.compile(p2)
    # flag: 0 = still looking for the first description line, 1/2 = inside
    # the description block; i counts down the budget returned by test3.
    flag = 0
    i = num
    for line3 in f8:
        matcher1 = re.findall(pattern1, line3)
        matcher2 = re.findall(pattern2, line3)
        if matcher1 and flag == 0:
            f7.write(matcher1[0])
            flag = 1
            i = i - 1
            if i == 0:
                break
            continue
        if matcher2 and flag in (1, 2):
            f7.write(matcher2[0])
            flag = 2
            i = i - 1
    f7.close()
    f8.close()

def join(code_tag, year, ss):
    f1 = open(ss + 'text6%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')
    f2 = open(ss + 'text4%s%s.txt' % (year, code_tag), 'r', encoding='utf-8')

    list_note = []
    for line1 in f1:
        list_note.append(line1)
    f1.close()
    # text4 lines look like ('SG3', 'DAM', '1', 'M'); these patterns pull the
    # four fields back out. The group token is 'SG' plus one or two digits.
    p11 = r"^\W{2}(\w{3,4}).+\n"
    p12 = r"^\W{2}\w{3,4}\W{2}\s\W(\w{3}).+\n"
    p13 = r"^\W{2}\w{3,4}\W{2}\s\W\w{3}\W{2}\s\W([0-9]{1,5})\W.+\n"
    p14 = r"\W{2}\w{3,4}\W{2}\s\W\w{3}\W{2}\s\W.+(C|M)"
    f2_w = open(ss + 'b1%s%s.txt' % (year, code_tag), 'a', encoding='utf-8')
    f3_w = open(ss + 'b1%s.csv' % year, 'a', encoding='utf-8')
    i = 0
    pattern11 = re.compile(p11)
    pattern12 = re.compile(p12)
    pattern13 = re.compile(p13)
    pattern14 = re.compile(p14)
    # Row position numbers 0010, 0020, ..., 3850.
    pos = ['%04d' % (10 * k) for k in range(1, 386)]
    for line2 in f2:
        matcher11 = re.findall(pattern11, line2)
        matcher12 = re.findall(pattern12, line2)
        matcher13 = re.findall(pattern13, line2)
        matcher14 = re.findall(pattern14, line2)
        try:
            # pos,message,segment,group,release,M/C,repeat,"description"
            str11 = "%s,%s,%s,%s,%s,%s,%s,\"%s\"\n" % (
                pos[i], code_tag, matcher12[0], matcher11[0], year,
                matcher14[0], matcher13[0], list_note[i].strip('\n'))
            i = i + 1
            f2_w.write(str11)
            f3_w.write(str11)
        except IndexError:
            # A pattern failed to match or the notes ran out; stop pairing.
            print("---error---")
            break

    f2_w.close()
    f3_w.close()
    f2.close()

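
# A hedged example of one CSV row produced by join (values illustrative,
# assuming message CODECO from release 95B):
#   0010,CODECO,UNH,SG0,95B,M,1,"A service segment starting and uniquely identifying a message."
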
def test():  # standalone demo: scrape one page and save its text locally
    url = "http://www.stylusstudio.com/edifact/D95B/CODECO.htm"
    resp = urllib.request.urlopen(url)
    data = resp.read().decode('UTF-8')
    f2 = open('./text.txt', 'a')
    soup = bs(data, 'html.parser')
    segment1 = soup.find_all('h4')
    segment2 = soup.find_all('p')
    for item in segment1:
        f2.write(str([text for text in item.stripped_strings]) + '\n')
    for item in segment2:
        f2.write(str([text for text in item.stripped_strings]) + '\n')
    f2.close()

if __name__ == '__main__':
    # Earlier releases already processed: '97A','97B','98A','98B','99A','99B'
    year1 = ['00A', '00B', '01A', '01B', '01C', '02A', '02B', '03A', '03B',
             '04A', '04B']
    for year in year1:
        ss = "./data/%s/" % year
        tag = b0_trmd(year, ss)
        print(tag)
        for i in range(len(tag)):
            test1(tag[i], year, ss)
            test2(tag[i], year, ss)
            num = test3(tag[i], year, ss)
            test4(tag[i], year, ss)
            test5(tag[i], num, year, ss)
            join(tag[i], year, ss)
            print("------%s-----ok" % i)