'''
Scrape jokes from Qiushibaike (糗事百科), collecting each post's text and link,
and write them to a CSV file.
Techniques used: multithreading, locks, queues, XPath, CSV.
'''
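# Overall flow (producer-consumer): Creeper threads take page URLs from
# url_queue, parse each page with XPath, and push (content, link) tuples into
# content_queue; SaveFile threads drain content_queue and append rows to the
# CSV file, serializing writes with a shared lock.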
import requests
import csv
from queue import Queue, Empty
from lxml import etree
import threading
class Creeper(threading.Thread):
    """Producer thread: fetches listing pages and extracts posts."""
    def __init__(self, url_queue, content_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.content_queue = content_queue

    def run(self):
        while True:
            try:
                # get_nowait() avoids the race between a separate empty()
                # check and get() when several threads share the queue.
                url = self.url_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)
    def parse_page(self, url):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"}
        response = requests.get(url, headers=headers)
        html = etree.HTML(response.text)
        # Each post lives in a <div> whose class attribute contains "article block".
        div_list = html.xpath('//div[contains(@class,"article block")]')
        for div in div_list:
            # "contentHerf" is the class name as spelled in the site's own HTML.
            content = div.xpath('.//a[@class="contentHerf"]//span[1]//text()')
            # Strip embedded newlines from each text node, then join the nodes.
            new_content = "\n".join(x.replace('\n', '') for x in content)
            a_url = "https://www.qiushibaike.com" + div.xpath('.//a[@class="contentHerf"]/@href')[0]
            self.content_queue.put((new_content, a_url))
class SaveFile(threading.Thread):
    """Consumer thread: writes (content, link) rows to the shared CSV writer."""
    def __init__(self, content_queue, writer, lock, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.content_queue = content_queue
        self.writer = writer
        self.lock = lock

    def run(self):
        while True:
            try:
                # If nothing arrives within 30 seconds, assume the crawl is
                # finished and let the thread exit.
                content, link = self.content_queue.get(timeout=30)
            except Empty:  # catch only the queue timeout, not every exception
                break
            # The csv writer is shared by several threads, so writes must be
            # serialized with the lock.
            with self.lock:
                self.writer.writerow((content, link))
            print('Saved one row')
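# Design note: with more than one SaveFile thread, the lock is needed because
# concurrent writerow() calls on a shared csv.writer are not guaranteed to be
# safe. A simpler variant would run a single SaveFile thread, making the lock
# unnecessary at the cost of writes being serial anyway.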
def main():
    url_queue = Queue(100)
    content_queue = Queue(300)
    base_url = "https://www.qiushibaike.com/text/page/{}/"
    gLock = threading.Lock()
    # utf-8-sig writes a BOM so the Chinese text displays correctly in Excel.
    f = open('糗事百科.csv', 'a', encoding='utf-8-sig', newline="")
    header = ['content', 'link']
    writer = csv.writer(f)
    writer.writerow(header)
    for i in range(1, 13):
        url = base_url.format(i)
        url_queue.put(url)
    threads = []
    for i in range(2):
        c = Creeper(url_queue, content_queue)
        c.start()
        threads.append(c)
    for i in range(2):
        s = SaveFile(content_queue, writer, gLock)
        s.start()
        threads.append(s)
    # Wait for all threads to finish, then close the file so any buffered
    # rows are flushed to disk.
    for t in threads:
        t.join()
    f.close()
if __name__ == '__main__':
    main()
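# Possible refinement (a sketch, not used above): replace the 30-second
# timeout with sentinel-based shutdown. After joining the Creeper threads,
# main() could put one None per SaveFile thread into content_queue:
#
#     for c in creeper_threads:   # hypothetical list holding the Creeper threads
#         c.join()
#     for _ in range(2):          # one sentinel per SaveFile thread
#         content_queue.put(None)
#
# and SaveFile.run() would break as soon as it gets None, making shutdown
# deterministic instead of timeout-driven.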