import threading
import time
import requests
import json
from queue import Queue, Empty
from lxml import etree
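
# Producer-consumer pipeline: CrawlThread workers pull page numbers from
# page_queue, download each listing page, and push the raw HTML into
# data_queue; ParserThread workers pull from data_queue, extract titles
# and image URLs, and append them to a shared output file.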


class CrawlThread(threading.Thread):
    """Producer thread: downloads one listing page at a time from page_queue."""

    def __init__(self, name, page_queue, data_queue):
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.ifanjian.net/latest-{}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }

    def run(self):
        print('%s ------- thread started' % self.name)
        while True:
            # Stop once every page number has been taken; get_nowait()
            # avoids the empty()-then-get() race between crawl threads.
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                break
            print("===crawl=== starting page %s" % page)
            # Build the page URL and fetch it (pause 1s to be polite).
            url = self.url.format(page)
            time.sleep(1)
            r = requests.get(url, headers=self.headers)
            # Hand the raw HTML to the parser threads via data_queue.
            self.data_queue.put({
                "index": str(page),
                "text": r.text
            })
            print("===crawl=== finished page %s" % page)
        print('%s ------- thread finished' % self.name)


class ParserThread(threading.Thread):
    """Consumer thread: parses downloaded HTML and appends items to the file."""

    def __init__(self, name, data_queue, fp, lock):
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.fp = fp
        self.lock = lock

    def run(self):
        print('%s ------- thread started' % self.name)
        while True:
            try:
                # Block up to 10s for a page; if nothing arrives by then,
                # the crawl threads are done and this thread can exit.
                data = self.data_queue.get(timeout=10)
            except Empty:
                break
            print("===parse=== starting page %s" % data["index"])
            self.parse_content(data['text'])
            print("===parse=== finished page %s" % data["index"])
        print('%s ------- thread finished' % self.name)

    def parse_content(self, data):
        tree = etree.HTML(data)
        # Locate every list entry first, then pull the title and image
        # URLs out of each one.
        li_list = tree.xpath("//ul[@class='cont-list']/li")
        items = []
        for li in li_list:
            # Post title.
            title = li.xpath(".//h2[@class='cont-list-title']/a/text()")[0]
            # Image URLs (lazy-loaded, so the real URL lives in data-src).
            img_url = li.xpath(
                ".//div[contains(@class,'cont-list-main')]//img/@data-src")
            item = {
                'title': title,
                'image_urls': img_url
            }
            items.append(item)
        # Write under the lock so lines from different parser threads
        # don't interleave in the output file.
        with self.lock:
            self.fp.write(json.dumps(items, ensure_ascii=False) + '\n')


# Holds the crawl (producer) threads.
g_crawl_list = []
# Holds the parser (consumer) threads.
g_parser_list = []


def create_crawl_thread(page_queue, data_queue):
    crawl_names = ['CrawlThread-1', 'CrawlThread-2', 'CrawlThread-3']
    for name in crawl_names:
        tcrawl = CrawlThread(name, page_queue, data_queue)
        g_crawl_list.append(tcrawl)


def create_parser_thread(data_queue, fp, lock):
    parse_names = ['ParserThread-1', 'ParserThread-2', 'ParserThread-3']
    for name in parse_names:
        tparse = ParserThread(name, data_queue, fp, lock)
        g_parser_list.append(tparse)


def create_queue():
    # Pages 1 through 9 of the listing.
    page_queue = Queue()
    for page in range(1, 10):
        page_queue.put(page)
    data_queue = Queue()
    return page_queue, data_queue
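

# Each parsed page is appended to jian.json as one JSON array, e.g.
# [{"title": "...", "image_urls": ["https://..."]}, ...]
# (keys shown are the English labels used in parse_content above).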


def main():
    # Create the work queues.
    page_queue, data_queue = create_queue()
    # Open the output file.
    fp = open('jian.json', 'a', encoding='utf-8')
    # Lock serializing writes to the shared file.
    lock = threading.Lock()
    # Create the crawl (producer) threads.
    create_crawl_thread(page_queue, data_queue)
    # Create the parser (consumer) threads.
    create_parser_thread(data_queue, fp, lock)
    # Start all crawl threads.
    for tcrawl in g_crawl_list:
        tcrawl.start()
    # Start all parser threads.
    for tparser in g_parser_list:
        tparser.start()
    # Main thread waits for every worker to finish.
    for tcrawl in g_crawl_list:
        tcrawl.join()
    for tparser in g_parser_list:
        tparser.join()
    # Close the output file.
    fp.close()
    print("Main thread and all worker threads finished.")


if __name__ == "__main__":
    main()