1 # -*- coding:utf-8 -*-
2 # *************************************
3 # 程序:学习蜘蛛协议的第一个例子
4 # 版本:1.0
5 # 作者:Silence
6 # 语言:Python 2.7
7 # 日期:2014-03-15
8 # 操作:就是下载贴吧里面某个贴吧的所有网页,并且存储为html文件
9 # *************************************
10
11 import string,urllib2,re
12 from urllib2 import HTTPError
13
14 def baidu_tieba(url,begin_page,end_page):
15 for i in range(begin_page,end_page+1):
16 sName = string.zfill(i,5) + '.html'
17 print '正在下载第',str(i),'个网页,并将其存储为',sName,'.....'
18 try:
19 m = urllib2.urlopen(url + str(i)).read()
20 except HTTPError, e:
21 print '亲,你给的地址出问题了。'
22 if hasattr(e,'reason'):
23 print 'Code:',e.code,';Reason',e.reason
24 pass
25
26 f = open(sName,'w')
27 try:
28 f.write(m)
29 except Exception, e:
30 print '存储网页',sName,'出错!'
31 pass
32 finally:
33 f.close()
34
def ensure_pn_suffix(url):
    """Return ``url`` extended so that a page number can be appended to it.

    A URL that already ends with ``pn=`` is returned unchanged.  Otherwise a
    ``pn=`` query parameter is added: joined with ``&`` when the URL already
    contains a query string, with ``?`` when it does not, and with nothing
    when the URL already ends in ``?`` or ``&``.
    """
    if url.endswith('pn='):
        return url
    if url.endswith(('?', '&')):
        separator = ''
    elif '?' in url:
        separator = '&'
    else:
        separator = '?'
    return url + separator + 'pn='


if __name__ == '__main__':
    # Surrounding whitespace from a pasted address would end up inside the
    # request URL, so drop it.
    bdurl = raw_input('请输入贴吧的地址,去掉pn=后面的数字:\n').strip()
    # Tieba now requires logging in and clicking a page number before "pn="
    # shows up in the address, so check for it here and complete it ourselves.
    completed_url = ensure_pn_suffix(bdurl)
    if completed_url != bdurl:
        bdurl = completed_url
        # Show the completed prefix that will actually be requested.
        print(bdurl)
    begin_page = int(raw_input('请输入开始的页数:\n'))
    end_page = int(raw_input('请输入终点的页数:\n'))

    baidu_tieba(bdurl, begin_page, end_page)