1 # -*- coding:utf-8 -*-
2 # author:zxy
3 # Date:2018-10-21
4
5 import request
6 from lxml import etree
7 import threading
8 from queue import Queue
9 import csv
10 import requests
11
class Produce(threading.Thread):
    """Producer thread: fetch listing pages and queue the jokes found on them.

    Each worker repeatedly takes a page URL from ``page_queue``, downloads and
    parses it, and puts one ``(joke_text, link)`` tuple per entry onto
    ``joke_queue``. The worker stops once ``page_queue`` has no more URLs.
    """

    # Browser-like headers sent with every page request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
                  ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    }

    # Seconds to wait for the server before a page request is abandoned.
    request_timeout = 30

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """Create a producer.

        Args:
            page_queue: queue of page URLs still to be downloaded.
            joke_queue: queue that receives ``(joke_text, link)`` tuples.
            *args, **kwargs: forwarded unchanged to ``threading.Thread``.
        """
        super(Produce, self).__init__(*args, **kwargs)
        self.base_domain = "http://www.budejie.com"
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        """Process page URLs until ``page_queue`` is exhausted."""
        from queue import Empty

        while True:
            # A non-blocking get avoids the empty()/get() race: with several
            # producers, another thread may take the last URL between the
            # check and a blocking get(), which would then wait forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            # This loop is the thread's top-level boundary: report a failed
            # page and carry on so one bad page does not kill the worker.
            try:
                self.parse_url(url)
            except Exception as exc:
                print("Failed to process %s: %r" % (url, exc))

    def parse_url(self, url):
        """Download ``url`` and queue every joke entry found on the page.

        Args:
            url: address of the listing page to download.

        Raises:
            Whatever ``requests.get`` or the lxml parsing raises; ``run``
            reports such errors and moves on to the next page.
        """
        response = requests.get(url, headers=self.headers,
                                timeout=self.request_timeout)
        html = etree.HTML(response.text)
        for desc in html.xpath("//div[@class='j-r-list-c-desc']"):
            joke = "\n".join(desc.xpath(".//text()")).strip()
            hrefs = desc.xpath(".//a/@href")
            # An entry without an anchor keeps its text and gets an empty link
            # instead of raising IndexError.
            link = self.base_domain + hrefs[0] if hrefs else ""
            self.joke_queue.put((joke, link))
        print("=" * 30 + "第%s页下载完成!" % url.split('/')[-1] + "=" * 30)
42
43
class Consumer(threading.Thread):
    """Consumer thread: take jokes off a queue and write them as CSV rows.

    Each worker repeatedly takes a ``(joke_text, link)`` tuple from
    ``joke_queue`` and writes it through the shared writer while holding
    ``gLock``. The worker stops when no item arrives within ``timeout``
    seconds.
    """

    # Kept for interface compatibility; this class does not use the headers.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Cookie': '__cfduid=ddb28ef1934faef742f7fb8911d7b33bd1540080067; UM_distinctid=16693ece9945b2-0b031da4b19f32-333b5602-1fa400-16693ece9958e4;'
                  ' _ga=GA1.2.1950184368.1540080070; _gid=GA1.2.1249143498.1540080070; _gat=1'
    }

    def __init__(self, joke_queue, write, gLock, *args, timeout=40, **kwargs):
        """Create a consumer.

        Args:
            joke_queue: queue yielding ``(joke_text, link)`` tuples.
            write: object with a ``writerow`` method (e.g. a ``csv.writer``).
            gLock: lock held around every ``writerow`` call.
            timeout: seconds to wait for a new item before the thread stops
                (keyword-only; defaults to 40).
            *args, **kwargs: forwarded unchanged to ``threading.Thread``.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.write = write
        self.gLock = gLock
        self.timeout = timeout

    def run(self):
        """Write queued jokes until the queue stays empty for ``timeout`` s."""
        from queue import Empty

        while True:
            # Only an empty queue ends the loop; other errors are no longer
            # silently swallowed by a bare ``except``.
            try:
                joke, link = self.joke_queue.get(timeout=self.timeout)
            except Empty:
                break
            # ``with`` releases the lock even when writerow raises, so a
            # failed write cannot block the other consumers forever.
            with self.gLock:
                self.write.writerow((joke, link))
67
68
def main():
    """Scrape pages 1-10 of budejie.com into ``baisibudejie.csv``.

    Fills a queue with the ten page URLs, starts five ``Produce`` threads to
    download and parse them and three ``Consumer`` threads to write the
    results, then waits for every thread to finish before closing the file.
    """
    page_queue = Queue(100)
    joke_queue = Queue(1000)
    gLock = threading.Lock()

    # ``with`` guarantees the CSV file is flushed and closed once all workers
    # are done, even if starting or joining a thread fails.
    with open('baisibudejie.csv', 'a', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp)
        # The file is opened in append mode: write the header only when the
        # file is empty so repeated runs do not insert duplicate header rows.
        if fp.tell() == 0:
            writer.writerow(('content', 'link'))

        for x in range(1, 11):
            page_queue.put("http://www.budejie.com/%d" % x)

        producers = [Produce(page_queue, joke_queue) for _ in range(5)]
        consumers = [Consumer(joke_queue, writer, gLock) for _ in range(3)]
        workers = producers + consumers

        for t in workers:
            t.start()
        # Keep the file open until every writer thread has finished.
        for t in workers:
            t.join()
88
89
# Start the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()