
Scraping all coser images from the "Cosplay_中国" (Cosplay China) section of cosplay8.com with Python

import os
import time
import re
import requests
from lxml import etree
from requests.adapters import HTTPAdapter


# Fetch a page and return its decoded HTML source (None on failure).
def request_text(url, encode):
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    try:
        resp = session.get(url=url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"
                          "/89.0.4389.82 Safari/537.36 Edg/89.0.774.50"
        }, timeout=5)
        resp.encoding = encode
        page_code = resp.text
        resp.close()
        return page_code
    except requests.exceptions.RequestException as e:
        print(e)
        return None


# Fetch a resource and return its raw bytes (None on failure).
def request_content(url):
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))   # retry failed connections up to 3 times
    try:
        resp = session.get(url=url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome"
                          "/89.0.4389.82 Safari/537.36 Edg/89.0.774.50"
        }, timeout=5)
        image_data = resp.content
        resp.close()
        return image_data
    except requests.exceptions.RequestException as e:
        print(e)
        return None

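# Note: request_text and request_content build an identical retry-enabled
# session. A shared factory (a sketch; not part of the original script)
# would remove the duplication:
def make_session():
    s = requests.Session()
    adapter = HTTPAdapter(max_retries=3)  # retry failed connections up to 3 times
    s.mount('http://', adapter)
    s.mount('https://', adapter)
    return s
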

# Write image bytes to disk; skip the file if the download failed.
def write_image(file_path, data):
    if data is None:  # request_content returned None, nothing to write
        return "continue"
    try:
        with open(file_path, mode="wb") as f:
            f.write(data)
    except OSError as e:
        print("write failed:", e)
        return "continue"

# Parse a detail page: extract the title, image src and sub-page count,
# sanitizing the title for use as a file/directory name.
def re_info(obj, page_code):
    info_dict = dict()
    # Collapse characters that are illegal in Windows file names (plus the
    # literal characters 中/国) into single spaces.
    r = '[/\\\\.*?<>|:"中国"]+'
    info = obj.search(page_code)
    name = info.group("name")
    name = re.sub(r, " ", name)
    src = info.group("src")
    sub_page_num = info.group("page_num")
    info_dict["name"] = name
    info_dict["src"] = src
    info_dict["sub_page_num"] = int(sub_page_num)
    return info_dict
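
# A quick standalone check of the sanitizer above (made-up title, a sketch):
# runs of illegal filename characters and 中/国 each become one space.
assert re.sub('[/\\\\.*?<>|:"中国"]+', " ", "Cosplay中国:fox?tail") == "Cosplay fox tail"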


def main():
    start = time.perf_counter()
    page_num = 1
    count = 0
    domain = "http://www.cosplay8.com"
    # Detail-page regex: title, first image src and total sub-page count.
    obj = re.compile(r"<title>(?P<name>.*?)</title>"
                     r".*?<img src='(?P<src>.*?)'"
                     r".*?<span>共(?P<page_num>\d+)页", re.S
                     )
    # Strips the ".html" suffix so "_2.html", "_3.html", ... can be appended.
    obj1 = re.compile(r"(?P<sub_to_url>.*?)\.html", re.S)
    while page_num <= 90:
        url = f"http://www.cosplay8.com/pic/chinacos/list_22_{page_num}.html"
        main_page = request_text(url, "utf-8")
        html = etree.HTML(main_page)
        lis = html.xpath("/html/body/div[3]/ul/li")  # all <li> nodes on the current list page
        sub_url_list = list()

        # Collect each <li>'s href into a list
        for li in lis:
            sub_url = li.xpath("./a/@href")[0]
            sub_url_list.append(sub_url)

        # For each detail page: fetch it, regex out the image src and
        # sub-page count, and download the first page's image.
        for sub in sub_url_list:
            sub_urls = domain + sub
            sub_page = request_text(sub_urls, "utf-8")
            info_dict = re_info(obj, sub_page)
            img_src = domain + info_dict["src"]
            folder_path = "D:/img/" + info_dict["name"] + "_" + str(page_num)
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)  # makedirs also creates D:/img if it is missing
            file_name = "/" + info_dict["name"] + ".jpg"
            file_path = folder_path + file_name
            write_image(file_path, request_content(img_src))
            count += 1
            time.sleep(0.5)

            # Walk the remaining sub-pages (2..N) and download each image
            for i in range(2, info_dict["sub_page_num"] + 1):
                sub_to = obj1.search(sub_urls)
                sub_to_url = sub_to.group("sub_to_url")
                sub_to_urls = "%s_%d.html" % (sub_to_url, i)  # e.g. ".../123.html" -> ".../123_2.html"
                sub_to_page = request_text(sub_to_urls, "utf-8")
                info_sub_dict = re_info(obj, sub_to_page)
                print(info_sub_dict["name"])
                img_src = domain + info_sub_dict["src"]
                file_path = folder_path + "/" + info_sub_dict["name"] + ".jpg"
                if write_image(file_path, request_content(img_src)) == "continue":
                    continue
                count += 1
                time.sleep(0.5)
        page_num += 1
    end = time.perf_counter()
    print('Elapsed: %s seconds' % (end - start))
    print("Downloaded %d images in total" % count)


if __name__ == '__main__':
    main()
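
The detail-page regex is the fragile part of the script: if cosplay8.com changes its markup, obj.search returns None and re_info crashes on info.group. A quick standalone check against a hypothetical snippet (made up here, not captured from the real site) shows the shape of HTML the regex expects:

import re

obj = re.compile(r"<title>(?P<name>.*?)</title>"
                 r".*?<img src='(?P<src>.*?)'"
                 r".*?<span>共(?P<page_num>\d+)页", re.S)

sample = ("<title>Example coser set</title>\n"
          "<img src='/uploads/2021/0401/example.jpg' />\n"
          "<span>共12页</span>")

m = obj.search(sample)
print(m.group("name"))      # Example coser set
print(m.group("src"))       # /uploads/2021/0401/example.jpg
print(m.group("page_num"))  # 12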

I grabbed roughly 14,000+ images in total, though I didn't scrape everything: around page 80 the later ladies didn't look as good as the earlier ones, so I stopped there (hehe).

A few of the better-looking ones are attached below:
