1 __author__ = 'minmin'
2 #coding:utf-8
3 import re,urllib,sgmllib
4
# Fetch the raw HTML for the given URL.
def getHtml(url):
    """Download *url* and return the response body as a string.

    The connection object is always closed, even if read() raises
    (the original leaked the handle on error).
    """
    page = urllib.urlopen(url)
    try:
        html = page.read()
    finally:
        page.close()
    return html
11
# Extract the article body text from the page at the given URL.
def func(url):
    """Return the article paragraphs of *url* joined by newlines.

    BUG FIX: the parameter was named ``str`` (shadowing the builtin) and
    never used — the function silently read the module-level global
    ``url`` instead.  It now uses its own argument, and fetches the page
    only once instead of once per regex attempt.

    Returns '' when no paragraph matches.
    """
    html = getHtml(url)
    # First try the site's styled article paragraphs, then fall back to
    # plain <p> tags.
    result = re.findall(r"<p style=\"TEXT-INDENT: 30px; MARGIN: 0px 3px 15px\">([^<>]*)</p>", html, re.M) \
             or re.findall(r"<p>([^<>]*)</p>", html, re.M)
    artical = ''
    for j in result:
        if len(j) != 0:
            j = j.replace(" ", "")
            j = j.replace("<strong>", " ")   # strip <strong>, keep a space
            j = j.replace("</strong>", " ")  # strip </strong>, keep a space
            artical = artical + j + '\n'
    return artical
24
# Links live in <a> tags under the href attribute, so we collect every
# tag=a, attrs=href value seen in the document.
class URLPaser(sgmllib.SGMLParser):
    """SGML parser that accumulates the href target of every <a> tag."""

    def reset(self):
        # Re-initialise the base parser state and start a fresh URL list.
        sgmllib.SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # Invoked once per <a ...> tag; harvest each href attribute value.
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)
35
36 IParser = URLPaser()
37 socket = urllib.urlopen("http://travel.gmw.cn/node_39034.htm")#打开这个网页
38
39 #fout = file('qq_art_urls.txt','w')#要把这个链接写到这个文件中
40 IParser.feed(socket.read())#分析啦
41
42 reg = 'http://travel.gmw.cn/2015-.*' #这个是用来匹配符合条件的链接,使用正则表达式匹配
43 reg2= 'http://travel.gmw.cn/2014-.*'
44 pattern = re.compile(reg)
45 patter = re.compile(reg2)
46 i= 0
47 url2=[]
48 for url in IParser.urls:#链接都存在urls里
49 url = "http://travel.gmw.cn/" + url
50 if pattern.match(url):
51 if url not in url2:
52 url2.append(url)
53 print url
54 artical = func(url)
55 print artical
56 if len(artical)<>0:
57 i = i + 1
58 f = open("gmw/travel/"+str(i) + '.txt','a+')
59 f.write(artical)
60 f.close()
61
62 if patter.match(url):
63 if url not in url2:
64 url2.append(url)
65 print url
66 print artical
67 if len(artical)<>0:
68 i = i + 1
69 f = open("gmw/travel/"+str(i) + '.txt','a+')
70 f.write(artical)
71 f.close()