爬取梨视频


import re
import redis
import requests,time

from setting import PAGE,CATEGORY_ID,START,MAIN_URL,DETAIL_URL

from concurrent.futures import ThreadPoolExecutor

from myredis import POOL
class CrawlVideo():
    """Crawl video listing pages and download the videos they link to.

    The work happens in three stages: collect video ids from the listing
    pages (``MAIN_URL``), fetch each detail page (``DETAIL_URL``) to extract
    the title and the media URL, then download every media file.  Links that
    were downloaded successfully are recorded in Redis so later runs skip
    them.
    """

    # Worker pool shared by all instances; used for detail requests and downloads.
    pools = ThreadPoolExecutor(100)
    # Seconds to wait on a connection / on data before a request is abandoned.
    request_timeout = 30
    # Number of bytes written to disk per chunk while streaming a video.
    chunk_size = 1024 * 1024

    def __init__(self, page=PAGE):
        """Create a crawler.

        Args:
            page: Amount added to ``start`` between two listing requests; also
                the divisor used to work out how many listing pages to fetch.
        """
        self.page = page
        # Filled by get_detail() with {"title": ..., "video_link": ...} dicts.
        self.video_info_dic_list = []
        self.conn = redis.Redis(connection_pool=POOL)

    def async_download(self, video_dic):
        """Download one video unless Redis already lists its link.

        The file is written to the current working directory and named after
        the first three characters of the title, so different videos sharing
        that prefix overwrite each other.

        Args:
            video_dic: Mapping with a ``"video_link"`` URL and a ``"title"``.
        """
        video_link = video_dic["video_link"]
        if self.conn.get(video_link):
            return
        video_name = video_dic["title"][:3]
        # Stream the body so a large video is never held in memory as a whole.
        with requests.get(video_link, stream=True,
                          timeout=self.request_timeout) as response:
            if response.status_code != 200:
                return
            with open("%s.mp4" % video_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=self.chunk_size):
                    f.write(chunk)
        # Mark the link as done only after the file was written completely.
        self.conn.set(video_link, video_link)

    def download_video(self, category_id=CATEGORY_ID, start=START, num=PAGE):
        """Crawl up to ``num`` videos of a category and schedule their downloads.

        Args:
            category_id: Category identifier substituted into ``MAIN_URL``.
            start: Offset of the first listing request.
            num: Maximum number of videos to download.

        Returns:
            The list of futures of the scheduled downloads, so that a caller
            may wait for them; the downloads run on the shared worker pool.
        """
        crawl_ids_list = self.crawl_videolist(category_id, start, num)
        print(len(crawl_ids_list))
        # Blocks until every detail page has been fetched and parsed, so the
        # info list is complete here and no polling is needed.
        self.get_video_info(crawl_ids_list)
        futures = []
        while self.video_info_dic_list:
            video_dic = self.video_info_dic_list.pop()
            future = self.pools.submit(self.async_download, video_dic)
            future.add_done_callback(self._report_failure)
            futures.append(future)
        return futures

    def _report_failure(self, future):
        """Print the exception of a finished pool task, if it raised one."""
        if future.cancelled():
            return
        error = future.exception()
        if error is not None:
            print(error)

    def get_video_ids(self, category_id, start):
        """Return the video ids found on one listing page.

        Args:
            category_id: Category identifier substituted into ``MAIN_URL``.
            start: Offset substituted into ``MAIN_URL``.

        Returns:
            A list of ``video_<digits>`` strings; empty when the request fails.
        """
        main_url = MAIN_URL.format(category_id, start)
        try:
            response = requests.get(main_url, timeout=self.request_timeout)
        except requests.RequestException as e:
            print(e)
            return []
        return re.findall(r'<a href="(video_\d+)"', response.text)

    # Collect the ids of individual videos; each id can be used to request
    # that video's detail page.
    def crawl_videolist(self, category_id, start, num):
        """Gather at most ``num`` video ids, walking as many pages as needed.

        Args:
            category_id: Category identifier passed to get_video_ids().
            start: Offset of the first listing request; advanced by
                ``self.page`` after every request.
            num: Maximum number of ids to return.

        Returns:
            A list with at most ``num`` video ids (empty for ``num <= 0``).
        """
        crawl_ids_list = []
        for _ in range(self.get_page_num(num)):
            crawl_ids_list.extend(self.get_video_ids(category_id, start))
            start += self.page
        return crawl_ids_list[:max(num, 0)]

    def get_detail(self, obj):
        """Extract title and media link from a finished detail request.

        On success a ``{"title", "video_link"}`` dict is appended to
        ``self.video_info_dic_list``; a failed request or a page without the
        expected markup is reported and skipped.

        Args:
            obj: Future whose result is the response of a detail-page request.
        """
        try:
            response = obj.result()
        except requests.RequestException as e:
            print(e)
            return
        title = re.search('<title>(.*?)</title>', response.text)
        video_link = re.search('srcUrl="(.*?)"', response.text)
        if title is None or video_link is None:
            print("no video info found in %s" % response.url)
            return
        self.video_info_dic_list.append({
            "title": title.group(1),
            "video_link": video_link.group(1),
        })

    def async_request(self, url, video_addr):
        """Request a detail page; meant to run as a task on the worker pool.

        Args:
            url: URL template with one positional placeholder.
            video_addr: Video id substituted into the template.

        Returns:
            The response of the GET request.
        """
        return requests.get(url.format(video_addr), timeout=self.request_timeout)

    def get_video_info(self, video_id_list):
        """Fetch all detail pages concurrently and collect their video info.

        Returns only after every request has finished and been handed to
        get_detail(), so ``self.video_info_dic_list`` is complete afterwards.

        Args:
            video_id_list: Video ids to look up.
        """
        futures = []
        try:
            for video_addr in video_id_list:
                futures.append(
                    self.pools.submit(self.async_request, DETAIL_URL, video_addr)
                )
        except RuntimeError as e:
            # submit() raises RuntimeError once the pool has been shut down.
            print(e)
        # Parse in the calling thread: result() blocks until the request is
        # done, which avoids racing with callbacks run on worker threads.
        for future in futures:
            self.get_detail(future)

    def get_page_num(self, num):
        """Return how many listing pages are needed to collect ``num`` videos.

        Args:
            num: Number of videos wanted.

        Returns:
            ``num / self.page`` rounded up, or 0 when ``num`` is not positive.
        """
        if num <= 0:
            return 0
        return int((num + self.page - 1) // self.page)


# Script entry: download two videos of the default category, starting at offset 1.
crawl = CrawlVideo()
crawl.download_video(num=2, start=1)

posted @ 2019-04-10 20:07  robertzhou  阅读(207)  评论(0)  编辑  收藏  举报