批量下载新浪博客文章(改进版)

嗯，算是对前段时间写的那个脚本的改进：重写了正则，同时支持翻页下载，还修改了一些 bug。但还不支持多线程，打算过几天加上。

 1 #!/usr/bin/python
 2 #-*- coding:utf-8 -*-
 3 #****************************
 4 
 5 #author:tmyyss
 6 #version:0.2
 7 
 8 #****************************
 9 
10 import urllib
11 import os
12 import re
13 
def article_format(usock, basedir):
    """Extract one blog article from *usock* (an iterable of HTML lines)
    and save it as a plain-text file under *basedir*.

    Runs as a small state machine over the lines:
      1. find the title (first line containing ``>text<``) and open an
         output file named after it under *basedir*;
      2. skip ahead to the body-start marker line (正文开始);
      3. copy body lines -- dropping markup lines and decoding a few
         HTML entities -- until the end marker (正文结束) is seen.

    On an IOError while opening the output file the error is printed and
    the search for a usable title continues.
    """
    # Fullwidth-punctuation HTML entities -> ASCII equivalents.
    entity_map = [
        ('&#65292;', ','), ('&#65306;', ':'), ('&#65281;', '!'),
        ('&#65288;', '('), ('&#65289;', ')'), ('&#8943;', '...'),
        ('&#65311;', '?'), ('&#65307;', ';'),
    ]
    fobj = None
    title_flag = True          # still looking for the article title
    context_start_flag = True  # still looking for the body-start marker
    for line in usock:
        if title_flag:
            title = re.findall(r'(?<=>).+(?=<)', line)
            if title:
                filename = basedir + title[0]
                print(filename)
                try:
                    fobj = open(filename, 'w+')
                    fobj.write(title[0] + '\n')
                    title_flag = False
                except IOError as e:
                    print("Open %s error:%s" % (filename, e))
        elif context_start_flag:
            if re.findall(r'(<.+?正文开始.+?>)', line):
                context_start_flag = False
        else:
            # Inside the article body: stop at the end marker.
            if re.findall(r'(<.+?正文结束.+?)', line):
                fobj.write('\nEND')
                fobj.close()
                fobj = None
                break
            # Lines still carrying markup are skipped entirely.
            if 'div' in line or 'span' in line or '<p>' in line:
                continue
            for entity, char in entity_map:
                line = line.replace(entity, char)
            line = re.sub(r'<wbr>|&nbsp;', '', line)
            line = re.sub(r'<br\s+?/>', '', line)
            fobj.write(line)
    if fobj is not None:
        # End marker never seen -- do not leak the file handle.
        fobj.close()
62 
def parser_page(pageurl):
    """Collect article URLs from *pageurl* and from every other page of
    the article list that it links to.

    Returns a list of article URL strings (duplicates are possible if
    pagination links repeat).
    """
    total_url = []
    # Articles on the page we were given.
    total_url.extend(get_url(pageurl))
    usock = urllib.urlopen(pageurl)
    context = usock.read()
    usock.close()
    # Pagination anchors are the ones whose text contains 跳转 ("jump").
    for page in re.findall(r'href.+?跳转', context):
        page_match = re.findall(r'http.+?html', page)
        if page_match:  # guard: skip anchors without a usable URL
            total_url.extend(get_url(page_match[0]))
    return total_url
76         
77         
def get_url(pageurl):
    """Return the article URLs found on a single article-list page.

    Articles are identified by anchors of the form
    ``<a title...href="http...html``; only the ``http...html`` part of
    each match is kept.
    """
    usock = urllib.urlopen(pageurl)
    context = usock.read()
    usock.close()  # was leaked in the original
    raw_url_list = re.findall(r'(<a\s+title.+?href="http.+?html)', context)
    return [re.findall('(http.+?html)', url)[0] for url in raw_url_list]
87 
88 
if __name__ == '__main__':
    # Destination directory for the downloaded articles.
    basedir = '/home/tmyyss/article/'
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    # First page of the blog's article list; parser_page follows the
    # pagination links from there.
    list_url = "http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html"
    for article_url in parser_page(list_url):
        article_format(urllib.urlopen(article_url), basedir)

 

posted @ 2014-12-29 20:29  tmyyss  阅读(1755)  评论(0编辑  收藏  举报