python面试题使用多线程、生产者消费者模式爬取数据模板
1.基础爬虫题:
目标网址:https://www.3839.com/top/hot.html
项目需求:爬取人气榜单所有游戏信息,游戏评价
题目要求:使用多线程、生产者消费者模式爬取,不得使用任何爬虫框架
游戏信息字段:
游戏id | game_id | 83294
游戏名 | game_name | 我的世界
游戏logo | logo | https://fs.img4399.com/sykb~sykb/20200116/09145170713
游戏介绍 | introduce | 原生游戏全面更新,冒险主题全面开启!
评分 | score | 8.3
评价数 | comment_count | 13567
评论信息字段:
评论id | comment_id | 244841561
用户id | user_id | 19328128
用户名 | username | Ieo
用户头像 | portrait | https://imga.3839.com/19328128
评论时间 | creat_time | 2020-03-01 09:32
评论内容 | content | 这四星都是给mc的,并不是给网易..........
点赞数 | like_count | 61
回复数 | reply_count | 13
答案:
import requests, queue, threading
from lxml import etree
from queue import Queue
import re,json
# Default HTTP request headers: a desktop Chrome User-Agent and a captured
# cookie string (individual cookies joined with "; ").
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    ),
    "Cookie": "; ".join((
        "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055",
        "UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d",
        "CNZZDATA1000292083=215119917-1589957264-%7C1589957264",
        "CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153",
        "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969",
    )),
}
class Product(threading.Thread):
    """Producer thread: fetch game detail pages and queue follow-up URLs.

    Each worker takes detail-page URLs from ``page_url``, prints the game's
    name, logo, introduction, score and review count, and puts the URL derived
    from the page's ``var iframeUrl`` script assignment on ``img_url``.
    """

    # Request headers used for every detail-page request made by this class.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055; UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d;"
                  " CNZZDATA1000292083=215119917-1589957264-%7C1589957264; CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153; "
                  "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969"}

    # Seconds before a detail-page request is abandoned, so a stalled
    # connection cannot hang the worker indefinitely.
    request_timeout = 10

    def __init__(self, page_url, img_url):
        """Store the two shared queues.

        Args:
            page_url: queue of detail-page URLs still to be fetched.
            img_url: queue that receives the URL extracted from each page.
        """
        super(Product, self).__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Process detail pages until ``page_url`` is drained."""
        while True:
            # get_nowait() rather than empty() followed by get(): with several
            # workers, two can see the same last item as available and the
            # loser would then block forever in get().
            try:
                html = self.page_url.get_nowait()
            except queue.Empty:
                break
            # One broken page must not kill the worker and strand the
            # remaining URLs; report it and move on to the next page.
            try:
                self.parse_html(html)
            except Exception as exc:
                print("failed to process {}: {!r}".format(html, exc))

    def parse_html(self, html):
        """Fetch one detail page, print its game info and queue its iframe URL.

        Args:
            html: URL of the game detail page (despite the name, not markup).

        Raises:
            IndexError: if the name, logo, score or review-count node is
                missing from the page.
        """
        response_t = requests.get(url=html, headers=self.headers,
                                  timeout=self.request_timeout)
        response_t.encoding = "utf-8"
        tree = etree.HTML(response_t.text)

        # Every field lives under the same container; share the prefix.
        base = '/html/body/div[1]/div[3]/div[1]/div[1]'
        intro = tree.xpath(base + '/div[3]/div[5]/div/div/text()')

        print('-----------------------------------------------------------------------------')
        print("游戏名game_name:", tree.xpath(base + '/div[1]/div/div[1]/h1/text()')[0])
        print("游戏logo:", 'https:' + tree.xpath(base + '/div[1]/div/img/@src')[0])
        print("游戏介绍:", intro[0] if intro else '暂无介绍')
        print("评分:", tree.xpath(base + '/div[1]/div/div[2]/div[1]/p[2]/text()')[0])
        print("评价数", tree.xpath(base + '/div[1]/div/div[2]/div[1]/p[4]/text()')[0])

        href = re.findall("var iframeUrl =(.*)", response_t.text)
        if not href:
            # Nothing to hand to the consumers for this page.
            print("no iframeUrl found in {}".format(html))
            return
        # Turn the JavaScript expression into a plain URL: drop the
        # "?dm=' + window.location.host;" tail, swap in the fixed suffix and
        # remove the quote characters.
        # NOTE(review): replace("htm", "json", 1) rewrites the first "htm"
        # occurrence anywhere in the string - confirm it hits the intended part.
        z = 'https:' + href[0].replace(".htm?dm=' + window.location.host;",
                                       '-htmlsafe-1-urltype-1-audit-1.htm').strip().replace("htm", "json", 1)
        self.img_url.put(z.replace("'", ""))
class Consume(threading.Thread):
    """Consumer thread: fetch and print the comments behind each queued URL.

    Workers take URLs from ``img_url`` and stop once both ``img_url`` and
    ``page_url`` are empty.
    """

    # Request headers used for every comment request made by this class.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055; UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d;"
                  " CNZZDATA1000292083=215119917-1589957264-%7C1589957264; CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153; "
                  "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969"}

    # Seconds to wait for a new URL before checking whether any work remains.
    poll_timeout = 3
    # Seconds before a comment request is abandoned.
    request_timeout = 10
    # Comment pages fetched per game; 1 matches the original single-page limit.
    max_comment_pages = 1

    def __init__(self, page_url, img_url=None):
        """Store the two shared queues.

        Args:
            page_url: queue of detail-page URLs; only checked for emptiness.
            img_url: queue of comment URLs to consume, or None for a worker
                that has nothing to do.
        """
        super(Consume, self).__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Consume comment URLs until both queues are empty."""
        if self.img_url is None:
            return
        while True:
            # A Queue object is always truthy and a bare get() blocks forever,
            # so poll with a timeout and leave once no page is left to parse.
            # A producer still working on the last page for longer than
            # poll_timeout can be missed; raise poll_timeout if that happens.
            try:
                comment_url = self.img_url.get(timeout=self.poll_timeout)
            except queue.Empty:
                if self.page_url.empty():
                    break
                continue
            self._crawl_comments(comment_url)

    def _crawl_comments(self, comment_url):
        """Fetch up to ``max_comment_pages`` pages of comments and print them."""
        for page in range(1, self.max_comment_pages + 1):
            page_link = comment_url.replace("p-1", "p-{}".format(page))
            try:
                response_comment = requests.get(url=page_link, headers=self.headers,
                                                timeout=self.request_timeout)
                comments = json.loads(response_comment.text)['content']
            except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
                # Report the failure instead of silently dropping the game.
                print("failed to load comments from {}: {!r}".format(page_link, exc))
                break
            if not comments:
                break
            for w in comments:
                try:
                    print(self._format_comment(w))
                except (KeyError, TypeError) as exc:
                    print("skipping malformed comment: {!r}".format(exc))

    @staticmethod
    def _format_comment(w):
        """Return the printable "label value" lines for one comment dict."""
        # format() instead of "+" so numeric fields do not raise TypeError.
        fields = (
            ("评论id", w["id"]),
            ("用户id", w["uid"]),
            ("用户名", w["username"]),
            ("用户头像", "https:{}".format(w["avatar"])),
            ("评论时间", w["time"]),
            ("评论内容", w["comment"][0:100]),
            ("点赞数", w["good_num"]),
            ("回复数", len(w["reply"])),
        )
        return "\n".join("{} {}".format(label, value) for label, value in fields)
def main():
    """Seed the page queue from the popularity ranking and start the workers.

    Fetches the ranking page, queues the detail-page URL of up to the first
    50 entries, then starts five ``Product`` threads and five ``Consume``
    threads that share the two queues.
    """
    # Both queues are unbounded.
    page_url = Queue()
    img_url = Queue()

    base_url = "https://www.3839.com/top/hot.html"
    # The timeout keeps a stalled connection from hanging the script.
    response = requests.get(base_url, headers=headers, timeout=10)
    response.encoding = "utf-8"
    tree = etree.HTML(response.text)

    for entry in tree.xpath("/html/body/div[1]/div[4]/ul/li")[0:50]:
        links = entry.xpath("./div[1]/div[2]/a/@href")
        if not links:
            # An entry without a detail link must not abort the whole run.
            continue
        page_url.put("https:" + links[0])

    # Producers: parse each queued detail page and queue its follow-up URL.
    for _ in range(5):
        pro = Product(page_url, img_url)
        pro.start()
    # Consumers: process the URLs queued by the producers.
    for _ in range(5):
        con = Consume(page_url, img_url)
        con.start()
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()

浙公网安备 33010602011771号