python面试题使用多线程、生产者消费者模式爬取数据模板

1.基础爬虫题:
目标网址:https://www.3839.com/top/hot.html
项目需求:爬取人气榜单所有游戏信息,游戏评价
题目要求:使用多线程、生产者消费者模式爬取,不得使用任何爬虫框架
游戏信息字段:
游戏idgame_id83294
游戏名game_name我的世界
游戏logologohttps://fs.img4399.com/sykb~sykb/20200116/09145170713
游戏介绍introduce原生游戏全面更新,冒险主题全面开启!
评分score8.3
评价数comment_count13567
评论信息字段:
评论idcomment_id244841561
用户iduser_id19328128
用户名usernameIeo
用户头像portraithttps://imga.3839.com/19328128
评论时间creat_time2020-03-0109:32
评论内容content这四星都是给mc的,并不是给网易..........
点赞数like_count61
回复数reply_count13

答案:

import requests, queue, threading
from lxml import etree
from queue import Queue
import re,json
# Module-level HTTP request headers (UA + session cookies) used for the
# ranking-page request in main(); the thread classes carry their own copy
# as a class attribute.
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055; UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d;"
                  " CNZZDATA1000292083=215119917-1589957264-%7C1589957264; CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153; "
                  "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969"}


class Product(threading.Thread):
    """Producer thread.

    Takes game detail-page URLs from ``page_url``, scrapes each game's basic
    info (printed to stdout), and pushes the URL of the game's comment JSON
    endpoint onto ``img_url`` for the consumer threads.
    """

    # Request headers shared by all producer threads (UA + session cookies).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055; UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d;"
                  " CNZZDATA1000292083=215119917-1589957264-%7C1589957264; CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153; "
                  "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969"}

    def __init__(self, page_url, img_url):
        """
        :param page_url: Queue of detail-page URLs to scrape (input).
        :param img_url: Queue receiving comment-JSON URLs (output).
        """
        super(Product, self).__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Drain the page queue and exit cleanly when it is empty."""
        while True:
            # BUGFIX: the original did `if empty(): break` then a blocking
            # `get()`. With several producer threads that is racy — another
            # thread can take the last item between the two calls, leaving
            # this thread blocked forever. get_nowait() makes check-and-take
            # a single atomic operation.
            try:
                html = self.page_url.get_nowait()
            except queue.Empty:
                break
            self.parse_html(html)

    def parse_html(self, html):
        """Scrape one game detail page: print its info fields and enqueue
        the URL of the game's comment JSON endpoint (page 1)."""
        # Use the class-level headers (consistent with Consume) instead of
        # the module-level copy.
        response_t = requests.get(url=html, headers=self.headers)
        response_t.encoding = "utf-8"
        tree = etree.HTML(response_t.text)
        print('-----------------------------------------------------------------------------')
        print("游戏名game_name:", tree.xpath('/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[1]/h1/text()')[0])
        print("游戏logo:", 'https:' + tree.xpath('/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/img/@src')[0])
        # Some games have no introduction element; evaluate the XPath once
        # (the original evaluated it twice) and fall back to a placeholder.
        intro = tree.xpath('/html/body/div[1]/div[3]/div[1]/div[1]/div[3]/div[5]/div/div/text()')
        print("游戏介绍:", intro[0] if intro else '暂无介绍')
        print("评分:", tree.xpath('/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div[1]/p[2]/text()')[0])
        print("评价数", tree.xpath('/html/body/div[1]/div[3]/div[1]/div[1]/div[1]/div/div[2]/div[1]/p[4]/text()')[0])
        # The comment iframe URL is embedded in an inline script; rewrite it
        # into the paginated JSON endpoint and strip the stray quotes.
        href = re.findall("var iframeUrl =(.*)", response_t.text)
        z = 'https:' + href[0].replace(".htm?dm=' + window.location.host;",
                                      '-htmlsafe-1-urltype-1-audit-1.htm').strip().replace("htm", "json", 1)
        href = z.replace("'", "")
        self.img_url.put(href)

class Consume(threading.Thread):
    """Consumer thread.

    Takes comment-JSON URLs produced by :class:`Product` from ``img_url``,
    fetches every page of comments for each game, and prints the comment
    fields to stdout.
    """

    # Request headers shared by all consumer threads (UA + session cookies).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
        "Cookie": "Hm_lvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958055; UM_distinctid=17230e2979964b-0e3af17765eaa-396a4507-100200-17230e2979a69d;"
                  " CNZZDATA1000292083=215119917-1589957264-%7C1589957264; CNZZDATA30039538=cnzz_eid%3D928846934-1589957153-%26ntime%3D1589957153; "
                  "Hm_lpvt_f1fb60d2559a83c8fa1ee6125a352bd7=1589958969"}

    def __init__(self, page_url, img_url=None):
        """
        :param page_url: Queue of detail-page URLs (shared with producers).
        :param img_url: Queue of comment-JSON URLs to consume (input).
        """
        super(Consume, self).__init__()
        self.page_url = page_url
        self.img_url = img_url

    def run(self):
        """Consume comment URLs until the queue stays empty, then exit."""
        if self.img_url is None:
            return
        while True:
            # BUGFIX: the original looped on `while self.img_url:` (a Queue
            # object is always truthy) with a blocking get(), so consumer
            # threads never terminated and the program hung forever. A
            # timeout-bounded get() lets the thread exit once the producers
            # are done and the queue has drained.
            try:
                comment_url = self.img_url.get(timeout=5)
            except queue.Empty:
                break
            self._crawl_comments(comment_url)

    def _crawl_comments(self, comment_url):
        """Fetch successive comment pages for one game until the endpoint
        fails or returns no content, printing each comment's fields."""
        page = 1
        # BUGFIX: the original used `while tag == 1:` with `tag += 1`, which
        # exits after the first iteration and defeats the `p-{}` pagination
        # it clearly intends; loop until the endpoint runs out of pages.
        while True:
            try:
                response_comment = requests.get(
                    url=comment_url.replace("p-1", "p-{}".format(page)),
                    headers=self.headers)
                content = json.loads(response_comment.text)['content']
                if not content:
                    break
                for w in content:
                    # BUGFIX: the original relied on implicit string-literal
                    # concatenation (`w["id"]+'\n' "用户id"`), which merged
                    # '\n用户id' into one operand and raised TypeError for
                    # integer fields — silently swallowed by the except.
                    # Pass each label/value as a separate print argument.
                    print(
                        "评论id", w["id"], '\n',
                        "用户id", w["uid"], '\n',
                        "用户名", w["username"], '\n',
                        "用户头像", "https:" + w["avatar"], '\n',
                        "评论时间", w["time"], '\n',
                        "评论内容", w["comment"][0:100], '\n',
                        "点赞数", w["good_num"], '\n',
                        "回复数", len(w["reply"]))
                page += 1
            except Exception:
                # Best-effort: any request/parse failure is treated as the
                # end of this game's comment pages.
                break
def main():
    """Entry point: seed the page queue with the detail-page URLs of the
    top 50 games on the hot ranking, then launch five producer and five
    consumer threads."""
    # Detail-page URLs consumed by the producers, and comment-JSON URLs
    # produced by them for the consumers.
    page_url = Queue()
    img_url = Queue()

    # Fetch the ranking page and pull out each game's detail-page link.
    base_url = "https://www.3839.com/top/hot.html"
    response = requests.get(base_url, headers=headers)
    response.encoding = "utf-8"
    tree = etree.HTML(response.text)
    for item in tree.xpath("/html/body/div[1]/div[4]/ul/li")[0:50]:
        detail_url = "https:" + item.xpath("./div[1]/div[2]/a/@href")[0]
        page_url.put(detail_url)

    # Producers: scrape each detail page and emit its comment-JSON URL.
    for _ in range(5):
        Product(page_url, img_url).start()

    # Consumers: fetch and print the comments behind each emitted URL.
    for _ in range(5):
        Consume(page_url, img_url).start()


if __name__ == '__main__':
    main()
posted @ 2020-06-05 21:16  python爬虫工程师  阅读(421)  评论(0)    收藏  举报