Crawler Examples

Multiprocessing and Multithreading Crawler Example
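
The example below crawls the first nine listing pages of https://pic.netbian.com/4kdongman/, extracts each image's title and URL with lxml XPath, and then downloads the images three ways: serially, with one process per image, and with one thread per image. A small timer decorator times each strategy so the results printed at the bottom can be compared.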

import os
import time
from multiprocessing import Process
from threading import Thread
import requests
from lxml import etree
from fake_useragent import UserAgent


class BaseSpider(object):
    def __init__(self):
        self.url_list = self.create_url_list()
        # self.url_list = ['https://pic.netbian.com/4kdongman/']
        self.headers = {
            'User-Agent': UserAgent().random
        }
        self.BASE_DIR = os.path.dirname(__file__)
        self.file_name_path = self.create_file_name()

    # Build the list of listing-page URLs (page 1 has no index suffix)
    def create_url_list(self):
        url_list = []
        for i in range(1, 10):
            if i == 1:
                index_url = 'https://pic.netbian.com/4kdongman/'
                url_list.append(index_url)
            else:
                index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
                url_list.append(index_url)
        return url_list

    def get_tree(self, page_text):
        tree = etree.HTML(page_text)
        return tree

    def get_page_text(self, url, encoding='gbk'):
        # The site serves GBK-encoded pages, so decode explicitly
        response = requests.get(url, headers=self.headers)
        response.encoding = encoding
        return response.text

    def create_file_name(self, path='img'):
        file_name_path = os.path.join(self.BASE_DIR, path)
        os.makedirs(file_name_path, exist_ok=True)
        return file_name_path


class SpiderImg(BaseSpider):

    def __init__(self):
        super().__init__()

    @staticmethod
    def timer(func):
        # Timing decorator. Note: using a staticmethod as a decorator inside
        # the class body requires Python 3.10+, where staticmethod objects
        # became directly callable.
        def inner(*args, **kwargs):
            start_time = time.time()
            res = func(*args, **kwargs)
            print(f" {func.__name__} | total time :>>>> {time.time() - start_time} s")
            return res

        return inner

    def spider_index_tree(self):
        tree_list = []
        for url in self.url_list:
            # Fetch the HTML source of each listing page
            page_text = self.get_page_text(url=url)
            tree = self.get_tree(page_text=page_text)
            tree_list.append(tree)
        return tree_list

    def __get_tree_data(self, tree):
        img_data_list = []
        li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
        for li in li_list:
            # Each <li> wraps an <a><img>; @alt holds the title, @src the thumbnail path
            img_title = li.xpath('./a/img/@alt')[0]
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            img_data_list.append({'img_title': img_title, 'img_src': img_src})
        return img_data_list

    def spider_index_img_data(self):
        img_data_list = []
        tree_list = self.spider_index_tree()
        for tree in tree_list:
            img_list = self.__get_tree_data(tree=tree)
            # Flatten the per-page lists into one list of {'img_title', 'img_src'} dicts
            img_data_list.extend(img_list)
        return img_data_list

    def download(self, img_src, img_title):
        response = requests.get(url=img_src, headers=self.headers)
        # Keep the real extension from the URL (the site serves JPEGs, not PNGs)
        ext = os.path.splitext(img_src)[1] or '.jpg'
        file_path = os.path.join(self.file_name_path, f'{img_title}{ext}')
        with open(file_path, mode='wb') as fp:
            for data in response.iter_content(chunk_size=1024):
                fp.write(data)
        print(f"Image :>>>> {img_title} saved!")

    @timer
    def download_normal(self):
        img_data_list = self.spider_index_img_data()
        for img_data in img_data_list:
            img_title = img_data.get('img_title')
            img_src = img_data.get('img_src')
            self.download(img_src=img_src, img_title=img_title)

    @timer
    def download_process(self):
        # One process per image; process startup and pickling overhead make
        # this slower than threads for an I/O-bound job like downloading
        img_data_list = self.spider_index_img_data()
        task_list = [Process(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
                     for img_data in img_data_list]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()

    @timer
    def download_thread(self):
        # One thread per image; threads overlap the network waits
        img_data_list = self.spider_index_img_data()
        task_list = [Thread(target=self.download, args=(img_data.get('img_src'), img_data.get('img_title')))
                     for img_data in img_data_list]
        for task in task_list:
            task.start()
        for task in task_list:
            task.join()


if __name__ == '__main__':
    spider = SpiderImg()
    # spider.download_normal()   # download_normal | total time :>>>> 31.3393292427063 s
    # spider.download_process()  # download_process | total time :>>>> 34.51722550392151 s
    spider.download_thread()  # download_thread | total time :>>>> 15.272460699081421 s
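
The timings show threads winning for this I/O-bound workload, while one process per image is even slower than the serial run. Spawning one thread per image also does not scale to thousands of files; a bounded pool is the usual fix. Below is a minimal sketch (not from the original post) that reuses the SpiderImg class above with concurrent.futures.ThreadPoolExecutor; download_pool is a hypothetical helper and max_workers=16 is an arbitrary choice.

from concurrent.futures import ThreadPoolExecutor


def download_pool(spider, max_workers=16):
    # Bounded pool: at most max_workers downloads run concurrently,
    # instead of one thread per image
    img_data_list = spider.spider_index_img_data()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for img_data in img_data_list:
            pool.submit(spider.download, img_data.get('img_src'), img_data.get('img_title'))
    # Leaving the with-block waits for all submitted downloads to finish

# download_pool(SpiderImg())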


Coroutine Crawler Example
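
The same crawl rewritten on top of asyncio and aiohttp: page fetches and image downloads become coroutines, and main() schedules one download task per image so all downloads run concurrently on a single thread.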

import asyncio
import os
import time

from fake_useragent import UserAgent
import aiohttp
from lxml import etree

headers = {
    'User-Agent': UserAgent().random
}
BASE_DIR = os.path.dirname(__file__)


def create_file_name(path='img'):
    file_name_path = os.path.join(BASE_DIR, path)
    os.makedirs(file_name_path, exist_ok=True)
    return file_name_path


file_name_path = create_file_name()


async def create_url_list():
    url_list = []
    for i in range(1, 10):
        if i == 1:
            index_url = 'https://pic.netbian.com/4kdongman/'
            url_list.append(index_url)
        else:
            index_url = f'https://pic.netbian.com/4kdongman/index_{i}.html'
            url_list.append(index_url)
    return url_list


async def get_tree(page_text):
    tree = etree.HTML(page_text)
    return tree


async def get_page_text(tag_url, encoding='gbk'):
    async with aiohttp.ClientSession() as session:
        # ssl=False disables certificate verification, a common workaround for SSL errors
        async with session.get(url=tag_url, headers=headers, ssl=False) as response:
            page_text = await response.text(encoding=encoding)
    return page_text


async def spider_index_tree():
    tree_list = []
    url_list = await create_url_list()
    # url_list = ['https://pic.netbian.com/4kdongman/']
    for url in url_list:
        # Fetch the HTML source of each listing page
        page_text = await get_page_text(tag_url=url)
        tree = await get_tree(page_text=page_text)
        tree_list.append(tree)
    return tree_list


async def get_tree_data(tree):
    img_data_list = []
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        # Each <li> wraps an <a><img>; @alt holds the title, @src the thumbnail path
        img_title = li.xpath('./a/img/@alt')[0]
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data_list.append({'img_title': img_title, 'img_src': img_src})
    return img_data_list


async def spider_index_img_data():
    img_data_list = []
    tree_list = await spider_index_tree()
    for tree in tree_list:
        img_list = await get_tree_data(tree=tree)
        # Flatten the per-page lists into one list of {'img_title', 'img_src'} dicts
        img_data_list.extend(img_list)
    return img_data_list


async def download(img_src, img_title):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            data_all = await response.read()
            # Keep the real extension from the URL (the site serves JPEGs, not PNGs)
            ext = os.path.splitext(img_src)[1] or '.jpg'
            file_path = os.path.join(file_name_path, f'{img_title}{ext}')
            with open(file_path, mode='wb') as fp:
                fp.write(data_all)
            print(f"Image :>>>> {img_title} saved!")


async def main():
    img_data_list = await spider_index_img_data()
    # Schedule one download Task per image
    task_list = [asyncio.create_task(download(img_src=img_data.get('img_src'), img_title=img_data.get('img_title'))) for
                 img_data in img_data_list]
    # Wait for all tasks to finish
    await asyncio.wait(task_list)


if __name__ == '__main__':
    start_time = time.time()
    # Run the coroutines on the event loop
    asyncio.run(main())
    print(f"Total time :>>>> {time.time() - start_time} s")

    # Total time :>>>> 6.5860209465026855 s
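
At roughly 6.6 s, against 15.3 s for threads and 31.3 s serial, the coroutine version is the fastest here. It does open a fresh ClientSession per request, though, and puts no ceiling on how many downloads run at once. The sketch below (not from the original post; download_limited and main_limited are hypothetical names) shares one session across all downloads and caps concurrency with asyncio.Semaphore; the limit of 10 is an arbitrary choice.

async def download_limited(session, sem, img_src, img_title):
    async with sem:  # blocks here while 10 downloads are already in flight
        async with session.get(url=img_src, headers=headers, ssl=False) as response:
            data_all = await response.read()
    ext = os.path.splitext(img_src)[1] or '.jpg'
    file_path = os.path.join(file_name_path, f'{img_title}{ext}')
    with open(file_path, mode='wb') as fp:
        fp.write(data_all)
    print(f"Image :>>>> {img_title} saved!")


async def main_limited():
    img_data_list = await spider_index_img_data()
    sem = asyncio.Semaphore(10)  # cap: at most 10 concurrent downloads
    # One shared ClientSession for every download instead of one per request
    async with aiohttp.ClientSession() as session:
        task_list = [asyncio.create_task(download_limited(session, sem, d.get('img_src'), d.get('img_title')))
                     for d in img_data_list]
        await asyncio.wait(task_list)

# asyncio.run(main_limited())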