0 0 刚学爬虫写的爬取瑟图代码


# 套图之家爬取套图.py

import requests
import bs4
import os
import random


def open_url(url):
    # 基础设置
    global headers
    headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    return soup


def get_urls(soup):
    # 获取图片的 url
    list_urls = [
        url.get('data-lazy-src')
        for url in soup.select('div.single-content >p >img')
    ]

    return list_urls


def img_info(soup):
    # 获取图片信息(暂仅获取套图名)
    series_name = soup.find("h1", class_="entry-title").text

    return series_name


def save_imgs(list_urls, series_name):
    # 保存图片, timeout 默认为 5, 如经常发生 timeout 错误可适当增加 timeout 值
    os.makedirs(series_name, exist_ok=True)
    imgNum = len(list_urls)
    print("开始下载套图: %s" % (series_name))
    for i in range(imgNum):
        imgs = requests.get(list_urls[i], headers=headers, timeout=5).content
        imgName = str(i) + '.jpg'
        with open(os.path.join(series_name, imgName), 'wb') as f:
            f.write(imgs)
        f.close()
        print("已下载: %s" % (imgName))
    print("下载完成!")


def main():
    # 主程序
    series_Num = str(input("请输入神秘代码(为空则随机获取): "))
    if not series_Num:
        series_Num = random.randint(100, 15000)
    url = 'https://taotuhome.com/' + str(series_Num) + '.html'
    m = open_url(url)
    m_list = get_urls(m)
    series_name = img_info(m)
    save_imgs(m_list, series_name)


if __name__ == "__main__":
    main()

posted @ 2020-05-25 15:23  melodia  阅读(233)  评论(0)    收藏  举报