# Simple example code (简易代码)
import requests
from bs4 import BeautifulSoup
import re
import html2text
import os
# One shared HTTP session so the connection is reused across requests.
session = requests.session()

# Replace with your own cookies (left empty here on purpose).
cookies = {
}

# Browser-like request headers sent with every request.
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36'
    ),
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Total number of pages plus 1.
total_page = 31
# Name of the blog.
name = "angelyan"
# Directory that receives one Markdown file per exported post.
OUTPUT_DIR = r"./博客园"
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the whole export.
REQUEST_TIMEOUT = 30
# Class filter for post-title links; compiled once instead of per iteration.
POST_LINK_CLASS = re.compile('^postTitle2')


def _safe_filename(title):
    """Return *title* with characters that are invalid in file names replaced by "_"."""
    cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", title).strip()
    # An empty name would produce the file ".md"; fall back to a fixed stem.
    return cleaned or "untitled"


def _export_post(post_url):
    """Download the post at *post_url* and save it as Markdown in OUTPUT_DIR.

    Pages without an ``h1.postTitle`` are skipped silently, as before.
    Pages without a ``div.post`` are skipped with a message instead of
    writing the literal text "None" to disk.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    res = session.get(post_url, cookies=cookies, headers=headers, timeout=REQUEST_TIMEOUT)
    sup = BeautifulSoup(res.text, "lxml")

    title_tag = sup.find("h1", class_="postTitle")
    if title_tag is None:
        return
    title = title_tag.text.strip()

    html = sup.find("div", class_="post")
    if html is None:
        print("skipped (no post content): %s" % post_url)
        return

    print(title)
    markdown = html2text.html2text(str(html))
    target = os.path.join(OUTPUT_DIR, "%s.md" % _safe_filename(title))
    with open(target, "w", encoding="utf-8") as f:
        f.write(markdown)


# open() does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for page in range(1, total_page):
    params = {
        'page': page,
    }
    try:
        response = session.get(
            'https://www.cnblogs.com/%s' % name,
            cookies=cookies,
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Best effort: a failed list page should not abort the remaining pages.
        print("failed to fetch list page %s: %s" % (page, exc))
        continue

    soup = BeautifulSoup(response.text, "lxml")
    days = soup.find_all("div", class_="day")
    for d in days:
        # Visit every title link in the block, not only the first one
        # (presumably a "day" block can list several posts; verify against
        # the blog's HTML). A block without links yields an empty list
        # rather than a None that would crash on attribute access.
        for link in d.find_all("a", class_=POST_LINK_CLASS):
            a_url = link.get("href")
            if not a_url:
                continue
            print(a_url)
            try:
                _export_post(a_url)
            except requests.RequestException as exc:
                print("failed to fetch post %s: %s" % (a_url, exc))