import os
import re
import threading
from queue import Empty, Queue
from urllib import request
from urllib.parse import urlsplit

import requests
from lxml import etree
8
9
'''
Note: this program has bugs.
'''
class Produce(threading.Thread):
    """Producer thread: parse listing pages and queue images for download.

    Each worker repeatedly takes a page URL from ``page_queue``, fetches and
    parses it, and puts one ``(image_url, local_path)`` tuple per image onto
    ``image_queue``. The worker exits as soon as ``page_queue`` is empty.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Directory prefix prepended to every generated file name.
    save_dir = r'G:\picktrue'
    # Seconds to wait for a listing page before giving up on it.
    request_timeout = 10
    # Characters stripped from the image title: the punctuation the original
    # pattern removed (? . 。 , !) plus the characters Windows forbids in file
    # names (\ / : * " < > |).
    _unsafe_chars = re.compile(r'[\\/:*?"<>|\.。,!]')

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs to parse.
            image_queue: queue receiving ``(image_url, local_path)`` tuples.
        """
        super(Produce, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Parse pages until ``page_queue`` is empty."""
        while True:
            # A separate empty() check followed by a blocking get() is racy:
            # another worker may take the last URL in between, leaving this
            # thread blocked forever. get_nowait() makes it a single step.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch ``url`` and enqueue every image found on the page.

        A page that cannot be fetched is reported and skipped so that one
        bad page does not kill the worker thread.
        """
        try:
            resp = requests.get(url, headers=self.headers,
                                timeout=self.request_timeout)
            resp.raise_for_status()
        except requests.RequestException as exc:
            print('failed to fetch %s: %s' % (url, exc))
            return
        html = etree.HTML(resp.text)
        for pic in html.xpath('//div[@class="col-xs-6 col-sm-3"]'):
            urls = pic.xpath('.//img//@data-original')
            if not urls:
                # No lazy-load URL under this element; nothing to download.
                continue
            names = pic.xpath('.//img//@alt')
            pic_url = urls[0]
            filmname = self._build_filename(pic_url, names[0] if names else '')
            self.image_queue.put((pic_url, filmname))

    @classmethod
    def _build_filename(cls, pic_url, pic_name):
        """Return the local path for an image given its URL and title.

        The extension comes from the URL *path* only, so a query string does
        not end up in the file name. If the sanitised title is empty, the
        base name from the URL is used instead.
        """
        url_path = urlsplit(pic_url).path
        stem, extension = os.path.splitext(os.path.basename(url_path))
        clean_name = cls._unsafe_chars.sub('', pic_name).strip()
        return cls.save_dir + '\\' + (clean_name or stem) + extension
40
41
class Consumer(threading.Thread):
    """Consumer thread: download the images queued on ``image_queue``.

    The worker exits once no image has arrived for ``idle_timeout`` seconds
    and ``page_queue`` is empty. This is a heuristic: if a page takes longer
    than ``idle_timeout`` to be parsed after the last URL has been taken off
    ``page_queue``, the consumer may still stop before those images arrive.
    """

    # Seconds to wait for a new image before checking whether to stop.
    idle_timeout = 10

    def __init__(self, page_queue, image_queue, *args, **kwargs):
        """Store the two queues; extra arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of listing-page URLs still waiting to be parsed.
            image_queue: queue of ``(image_url, local_path)`` tuples.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue

    def run(self):
        """Download queued images until the work appears to be finished."""
        while True:
            # A bounded wait replaces the empty()-then-get() pair: an
            # unbounded get() after a stale empty() check can block forever
            # once another consumer has taken the last item.
            try:
                pic_url, filmname = self.image_queue.get(
                    timeout=self.idle_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            # One failed download must not kill the worker. OSError covers
            # network and file-system errors; ValueError is raised by urllib
            # for URLs it cannot interpret.
            try:
                self._download(pic_url, filmname)
            except (OSError, ValueError) as exc:
                print('failed to download %s: %s' % (pic_url, exc))

    def _download(self, pic_url, filmname):
        """Save ``pic_url`` to ``filmname``, creating its directory if needed."""
        folder = os.path.dirname(filmname)
        if folder:
            os.makedirs(folder, exist_ok=True)
        request.urlretrieve(pic_url, filmname)
54
55
def main():
    """Queue listing pages 1 and 2, then start the worker threads.

    Three ``Produce`` threads are started first, followed by three
    ``Consumer`` threads; all of them share the same two queues.
    """
    page_queue = Queue(20)
    image_queue = Queue(1000)

    for page_no in (1, 2):
        page_queue.put('http://www.doutula.com/article/list/?page=%d' % page_no)

    # Producers are started before consumers, three of each.
    for worker_cls in (Produce, Consumer):
        for _ in range(3):
            worker_cls(page_queue, image_queue).start()
68
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()