Scraping Images

import os
import glob  # needed for counting already-downloaded images below
import random
import sys
import time
import logging
from typing import Union
from urllib.parse import urlparse

import requests
from lxml import etree
from faker import Faker


uas = Faker()

def downloadHtml(url):  # Send the request and return the parsed page; reusable
    time.sleep(1)
    headers = {
        'User-Agent': uas.user_agent()
    }
    response = requests.get(url=url, headers=headers, timeout=50)
    # print(response)
    response.encoding = 'utf-8'
    if response.status_code != 200:
        logging.error('Failed to fetch the page -- not published yet? Status code: {}'.format(response.status_code))
        # sys.exit(0)
        return None  # signal failure to the caller
    else:
        html = etree.HTML(response.text)
        return html


def get_pic(html):
    # Site-specific XPath: each page contains exactly one image
    img_url = html.xpath('//*[@id="main-wrapper"]/div[2]/p/a/img/@src')[0]
    print(img_url)
    return img_url


# Logging configuration
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(funcName)s -> %(message)s')


def Download_Image(
        downloadUrl: Union[str, list], saveImagePath: str, headers: dict = None, proxies: dict = None
) -> Union[bool, str]:
    """
    Download one image or a list of images.

    :param downloadUrl: image URL, or a list of image URLs
    :param saveImagePath: directory (or file path) to save into
    :param headers: custom request headers
    :param proxies: custom proxies
    :return: True on success, otherwise the URL that failed
    """

    if isinstance(downloadUrl, str):
        downloadUrlParse = urlparse(downloadUrl)
        if headers is None:
            headers = {
                'User-Agent': uas.user_agent(),
                'Referer': f'{downloadUrlParse.scheme}://{downloadUrlParse.netloc}',
                'Host': downloadUrlParse.netloc,
            }

        # Download (note: requests raises requests.exceptions.Timeout, not the built-in TimeoutError)
        try:
            response = requests.get(
                downloadUrl, headers=headers, timeout=20, proxies=proxies).content
        except requests.exceptions.Timeout:
            logging.info(f'Image download timed out: {downloadUrl}')
            return downloadUrl
        except Exception as e:
            logging.info(f'Image download failed: {downloadUrl} -> reason: {e}')
            return downloadUrl

        # Build the final save path
        if os.path.isdir(saveImagePath):
            newSaveImagePath = os.path.join(saveImagePath, '0.jpg')
        else:
            newSaveImagePath = os.path.splitext(saveImagePath)[0] + '.jpg'
        with open(newSaveImagePath, 'wb') as f:
            f.write(response)

    elif isinstance(downloadUrl, list):
        # Download each URL in the list
        for i in range(len(downloadUrl)):
            time.sleep(2)
            downloadUrlParse = urlparse(downloadUrl[i])
            if headers is None:
                headers = {
                    'User-Agent': uas.user_agent(),
                    'Referer': f'{downloadUrlParse.scheme}://{downloadUrlParse.netloc}',
                    'Host': downloadUrlParse.netloc,
                }
            if downloadUrl[i] == '':
                continue

            # Download
            try:
                time.sleep(2)
                response = requests.get(
                    downloadUrl[i], headers=headers, timeout=20, proxies=proxies).content
            except requests.exceptions.Timeout:
                logging.info(f'Image download timed out: {downloadUrl[i]}')
                return downloadUrl[i]
            except Exception as e:
                logging.info(f'Image download failed: {downloadUrl[i]} -> reason: {e}')
                continue
                continue

            # Build the final save path
            if os.path.isdir(saveImagePath):
                newSaveImagePath = os.path.join(saveImagePath, f'{i}.jpg')
            else:
                newSaveImagePath = os.path.splitext(saveImagePath)[0] + '.jpg'
            with open(newSaveImagePath, 'wb') as f:
                f.write(response)

    else:
        logging.info('Cannot download: unsupported downloadUrl type')
        return 'Cannot download'

    return True

  Fetching and parsing the page still uses the downloadHtml() function from before; for downloading the image I borrowed a Download_Image() function found online. It comes with logging, which feels pretty fancy. There are a few parts I don't fully understand yet, such as the Faker() object used to generate a random User-Agent.
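
As far as I can tell, Faker is only used here for its user_agent() provider: each call returns a different browser-like User-Agent string, so consecutive requests don't all carry the same header. A minimal sketch (just the faker package, nothing from this crawler):

from faker import Faker

fake = Faker()
for _ in range(3):
    # Each call produces a random, realistic-looking User-Agent string,
    # e.g. 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 ...'
    print(fake.user_agent())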

if __name__ == '__main__':
    # List of article path suffixes to crawl, e.g. '/....html' (actual values omitted here)
    l = []
    for j in l:
        # Folder name derived from the path; removesuffix (Python 3.9+) avoids rstrip() eating extra characters
        k = j.lstrip('/').removesuffix('.html')
        time.sleep(random.randint(1, 3))
        url = "https://www.xxx.com" + j
        # The pager text is something like '1/12'; take the part after the '/' as the total page count
        s = downloadHtml(url).xpath(
            '//*[@id="page"]/div/span/text()')[0].split('/')[-1]
        # pattern = re.compile('_[0-9_]{1,2}')
        num = int(s)
        print(k, 'has ' + s + ' pages in total')

        target_path = r'E:\\' + k
        flag = os.path.exists(target_path)
        if not flag:
            os.makedirs(target_path)

        # Count images already downloaded so we can resume where the last run stopped
        file_count = len(glob.glob(r'E:\\' + k + '\\*.jpg'))
        print(file_count)

        for i in range(file_count + 1, num):
            url = "https://www.xx.com" + j.removesuffix('.html') + '/page/' + str(i) + '.html'

            time.sleep(random.randint(1, 4))
            img_html = downloadHtml(url)
            img_url = get_pic(img_html)

            Download_Image(
                downloadUrl=img_url,
                saveImagePath=target_path + "\\" + str(i)
            )

        print('finish')

  The main program. A list of URL suffixes for the pages to crawl is prepared in advance. It first reads the total page count, then loops over each page. One nice trick here: after crawling for a while the connection drops, and some folders are already almost complete, so re-downloading everything from scratch wastes time. Since each page holds exactly one image, we can first count how many images are already in the target folder and then continue from where the last run broke off. This uses glob.glob(), which is, roughly speaking, resumable downloading; a sketch of the idea follows below.
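
A minimal, self-contained sketch of that resume idea (the folder path and page count are made-up placeholders, and the exact range bounds depend on how the target site numbers its pages):

import glob
import os

def pages_to_resume(folder: str, total_pages: int) -> range:
    # Count images already saved; each page yields exactly one numbered .jpg
    done = len(glob.glob(os.path.join(folder, '*.jpg')))
    # Continue from the first page that has no image on disk yet
    return range(done + 1, total_pages + 1)

# Hypothetical usage: folder 'E:\\example' with 30 pages in total
# for page in pages_to_resume(r'E:\example', 30):
#     ...download that page...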

The main techniques are just applying XPath and constructing the URL strings.

def save_image(img_link, filename):
    # Note: headers and proxy are module-level globals defined elsewhere in the script
    time.sleep(random.randint(1, 4))
    res = requests.get(img_link, headers=headers, proxies=proxy)
    if res.status_code == 404:
        print(f"Image {img_link} failed to download ------->")
        return  # don't write the 404 response body to disk as a .jpg
    with open(filename, "wb") as f:
        f.write(res.content)
        # print("Saved to: " + filename)

  After switching to a different site, Download_Image() no longer worked, so I wrote a small but focused save_image().
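
save_image() reads headers and proxy from module-level globals that aren't shown in the post; a hedged example of what they might look like (the proxy address and image URL are placeholders, not values from the post):

from faker import Faker

uas = Faker()

# Module-level globals that save_image() relies on
headers = {'User-Agent': uas.user_agent()}
proxy = None  # or e.g. {'https': 'http://127.0.0.1:7890'} if a local proxy is used

# Hypothetical call: fetch one image and store it next to the script
# save_image('https://www.xxx.com/img/0001.jpg', '0001.jpg')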

The main remaining problem is that long crawls still get cut off, and I don't have a great fix for that yet. For now I use random sleeps to slow down the request rate.
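
One common mitigation, not from the original post but worth noting, is to let requests retry transient failures with exponential backoff via urllib3's Retry mounted on a Session:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    # Retry up to 3 times on connection errors and common throttling/5xx responses,
    # with exponentially increasing waits between attempts (backoff_factor=1)
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
    session = requests.Session()
    session.mount('https://', HTTPAdapter(max_retries=retry))
    session.mount('http://', HTTPAdapter(max_retries=retry))
    return session

# The session's get() could then replace the bare requests.get() calls in downloadHtml()/save_image()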
