Scraping the blog http://www.ci123.com/blog/?user_id=1863532&page=1 — the script below walks all 55 list pages, saves each post's HTML locally, downloads the images each post references, and rewrites the img tags to point at the saved copies.

import requests
from bs4 import BeautifulSoup
from pathlib import Path

for i in range(1, 56):
    page = str(i)
    url = 'http://www.ci123.com/blog/?user_id=1863532&page=' + page

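    # Browser-style User-Agent so the request looks like a normal page view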
    headers = {
        "user-agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }

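    # Fetch one page of the post list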
    resp = requests.get(url, headers=headers)
    text = resp.text
    resp.close()
    soup = BeautifulSoup(text, 'html.parser')
    # Each list item wraps a link whose query string carries the post id
    all_li = soup.find_all("li")
    for li in all_li:
        # Skip list items that do not contain a usable link
        if li.a is None or 'href' not in li.a.attrs:
            continue
        href = li.a['href']
        post_id = href[href.find('=') + 1:]
        print(post_id)

        post_dir = Path('html') / page / post_id

        # Create the folder for this post if it does not exist yet
        if not post_dir.exists():
            post_dir.mkdir(parents=True, exist_ok=True)
            print(f"Folder '{post_dir}' created successfully.")
        else:
            print(f"Folder '{post_dir}' already exists.")

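        # The list link is relative, so prepend the blog root to build the full post URL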
        href = 'http://www.ci123.com/blog/' + href

        resp = requests.get(href, headers=headers)
        text = resp.text
        resp.close()
        # print(text)

        # Download the post's images and rewrite their links
        soup = BeautifulSoup(text, 'html.parser')
        imgs = soup.find_all("img")
        for img in imgs:
            src = img.get('src', '')
            print(src)

            # Keep only images uploaded with the post (skip icons, avatars,
            # etc.); 'continue' moves on to the next image, unlike the
            # original 'break', which aborted the whole image loop
            if "uploadfiles" not in src:
                continue

            # Local file name: everything after the last '/'
            img_file_name = src[src.rindex('/') + 1:]

            resp = requests.get(src, headers=headers)

            with open(post_dir / img_file_name, "wb") as f:
                f.write(resp.content)
            resp.close()

            # Point the tag at the local copy so the saved HTML works offline
            img['src'] = img_file_name

        with open(post_dir / (post_id + '.html'), "w", encoding="utf-8") as f:
            f.write(soup.prettify())
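
One fragile spot worth noting: the script passes each img src straight to requests.get, which only works when the blog emits absolute image URLs. Below is a minimal sketch of a more defensive download helper; the name fetch_image and the retry count of 3 are my own additions, not part of the original script. It resolves a possibly-relative src against the post page and retries transient network errors.

import time
from urllib.parse import urljoin

import requests

def fetch_image(page_url, src, headers, retries=3):
    # Resolve a possibly-relative src against the page it appeared on;
    # an already-absolute src passes through urljoin unchanged
    full_url = urljoin(page_url, src)
    for _ in range(retries):
        try:
            resp = requests.get(full_url, headers=headers, timeout=10)
            resp.raise_for_status()
            return resp.content
        except requests.RequestException:
            time.sleep(1)  # brief pause before the next attempt
    return None

Inside the image loop this would replace the bare requests.get(src, headers=headers): call content = fetch_image(href, src, headers) and skip the image when it returns None.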