1 #coding:utf-8
2 import urllib2
3 import re
4 import threading
5
def loadImg(addr, x, y, artName):
    """Download one image and save it in the current working directory.

    The file is written as ``<artName><y>.jpg`` in binary mode.

    Args:
        addr: URL of the image to fetch.
        x: Index of the article the image belongs to. Not used here; kept so
            existing callers keep working.
        y: Index of the image within its article; becomes part of the
            file name.
        artName: Article title used as the file-name prefix. Byte strings
            are decoded as UTF-8; text that is already decoded is used as is.

    Raises:
        urllib2.URLError: If the image cannot be fetched.
        UnicodeDecodeError: If a byte-string ``artName`` is not valid UTF-8.
        IOError: If the output file cannot be created or written.
    """
    response = urllib2.urlopen(addr)
    try:
        data = response.read()
    finally:
        # Release the connection even when the read fails part-way through.
        response.close()

    # Only byte strings need decoding; calling .decode() on text that is
    # already unicode would trigger an implicit ASCII encode and fail on
    # non-ASCII titles.
    if isinstance(artName, bytes):
        artName = artName.decode("utf-8")

    # NOTE(review): artName comes from scraped HTML and is used verbatim, so
    # a title containing a path separator will make open() fail -- confirm
    # whether titles should be sanitised by the caller.
    with open(artName + str(y) + '.jpg', 'wb') as f:
        f.write(data)
12
def getImgLink(html, x, artName):
    """Parse one article page and download every image it references.

    Each ``<img ... file="...">`` tag matched by the pattern below yields a
    site-relative path, which is turned into an absolute URL and handed to
    ``loadImg`` on its own thread.

    Args:
        html: HTML source of the article page.
        x: Index of the article; printed in the progress line and forwarded
            to ``loadImg``.
        artName: Article title, forwarded to ``loadImg`` as the file-name
            prefix.
    """
    # NOTE(review): every ``.*`` here is greedy, so several <img> tags on a
    # single line collapse into one match -- confirm against real page markup
    # before tightening the pattern.
    relink = '<img src=".*" file="(.*)" width=".*" id=".*" alt=".*.jpg" />'

    for y, lin in enumerate(re.findall(relink, html)):
        imgAddr = 'http://www.xxx.com/' + lin
        # A single parenthesised argument prints the same bytes as the old
        # print statement ("LoadImg:<x> <url>" followed by a blank line).
        print("LoadImg:%s %s\n" % (x, imgAddr))
        # Pass the callable and its arguments separately.  Writing
        # ``target=loadImg(...)`` runs the download right here in the calling
        # thread and gives Thread a target of None, so nothing is concurrent.
        # The threads are not joined here; a failure inside loadImg now
        # surfaces in that worker thread instead of in this function.
        t = threading.Thread(target=loadImg, args=(imgAddr, x, y, artName))
        t.start()
24
def getArticleLink(html, page):
    """Parse a forum board page and process every article it links to.

    Each ``viewthread.php`` link matched by the pattern below is fetched and
    its HTML is passed to ``getImgLink`` together with a 1-based article
    index and the link text (used as the article title).

    Args:
        html: HTML source of the board listing page.
        page: Value supplied by the caller for the listing page. Not used
            here; kept so existing callers keep working.

    Raises:
        urllib2.URLError: If an article page cannot be fetched.
    """
    # Raw string so the regex escapes ``\.`` and ``\?`` are not treated as
    # (invalid) string escapes; the pattern value itself is unchanged.
    relink = r'<a href="(viewthread\.php\?tid=.*3D.*)">(.*)</a>'
    # The headers are the same for every request, so build them once.
    headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}

    for x, lin in enumerate(re.findall(relink, html), 1):
        # lin is (relative link, link text).
        url = "http://www.xxx.com/" + lin[0]
        req = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(req)
        try:
            article_html = response.read()
        finally:
            # Close the connection even if the read fails.
            response.close()
        getImgLink(article_html, x, lin[1])
39
# Script entry point: walk the board listing pages and process each one.
start = 1  # first listing page to fetch
end = 100  # last listing page to fetch (inclusive)

# The headers are the same for every request, so build them once.
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1"}

# Iterate over offsets from ``start`` so that ``end`` really is the last page.
# The previous ``range(end)`` treated ``end`` as a page count, which only
# matched the intended range when ``start`` was 1.
for page in range(end - start + 1):
    url = "http://www.xxx.com/forumdisplay.php?fid=19&page=" + str(start + page)
    req = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # Close the connection even if the read fails.
        response.close()
    print('Start')
    # ``page`` is the zero-based offset, the same value that was passed before.
    getArticleLink(html, page)