import requests
from bs4 import BeautifulSoup
from pathlib import Path
# --- Scraper configuration -------------------------------------------------
# Base URL that every relative post link found on a listing page is appended to.
BLOG_BASE_URL = 'http://www.ci123.com/blog/'
# Listing URL prefix; the page number is appended to it.
LIST_URL_PREFIX = 'http://www.ci123.com/blog/?user_id=1863532&page='
# Listing pages FIRST_PAGE..LAST_PAGE (inclusive) are visited.
FIRST_PAGE = 1
LAST_PAGE = 55
# Root folder of the local archive: <OUTPUT_ROOT>/<page>/<post id>/...
OUTPUT_ROOT = 'html'
# Seconds to wait on the server before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 30
# Sent with every request.
REQUEST_HEADERS = {
    "user-agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
}


def extract_post_id(href):
    """Return the part of ``href`` that follows its first ``=``.

    When ``href`` contains no ``=`` the whole string is returned.
    """
    return href[href.find('=') + 1:]


def image_file_name(src):
    """Return the text after the last ``/`` in ``src``.

    When ``src`` contains no ``/`` the whole string is returned; when it ends
    with ``/`` the result is the empty string.
    """
    return src.rsplit('/', 1)[-1]


def fetch_text(url):
    """Download ``url`` and return the decoded response body.

    The response is always closed, even if reading the body fails.
    Network errors (``requests.RequestException``) propagate to the caller.
    """
    # NOTE(review): resp.text relies on the charset that requests infers for
    # the response -- confirm that the site's pages decode correctly.
    with requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT) as resp:
        return resp.text


def ensure_folder(folder_path):
    """Create ``folder_path`` (with parents) if missing and report the outcome."""
    if folder_path.exists():
        print(f"Folder '{folder_path}' already exists.")
    else:
        folder_path.mkdir(parents=True, exist_ok=True)
        print(f"Folder '{folder_path}' created successfully.")


def save_images(soup, folder_path):
    """Download the uploaded images referenced by ``soup`` into ``folder_path``.

    Every ``<img>`` whose ``src`` contains ``"uploadfiles"`` is downloaded and
    its ``src`` attribute is rewritten in place to the bare local file name,
    so the saved HTML references the local copy. All other images are left
    untouched.
    """
    for img in soup.find_all("img"):
        src = img.get('src')
        if not src:
            # <img> without a usable src: nothing to download.
            continue
        print(src)
        # Skip special files (anything that is not an uploaded image).
        # This must be `continue`, not `break`: one foreign image must not
        # prevent the remaining images of the post from being saved.
        if "uploadfiles" not in src:
            continue
        file_name = image_file_name(src)
        if not file_name:
            # src ends with '/': there is no file name to save under.
            continue
        try:
            with requests.get(src, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT) as resp:
                # Do not store an HTTP error page as if it were the image.
                resp.raise_for_status()
                data = resp.content
        except requests.RequestException as err:
            # One broken image should not abort the whole scrape; the tag
            # keeps pointing at the remote URL.
            print(f"Failed to download '{src}': {err}")
            continue
        (folder_path / file_name).write_bytes(data)
        img['src'] = file_name


def save_post(page, href):
    """Archive the post linked by ``href`` under ``html/<page>/<post id>/``.

    ``page`` is the listing page number as a string; ``href`` is the link
    taken from the listing page and is appended to ``BLOG_BASE_URL``.
    The post's uploaded images and its (rewritten) HTML are written to disk.
    """
    post_id = extract_post_id(href)
    print(post_id)
    # Folder that receives this post's files.
    folder_path = Path(OUTPUT_ROOT + '/' + page + '/' + post_id)
    ensure_folder(folder_path)

    soup = BeautifulSoup(fetch_text(BLOG_BASE_URL + href), 'html.parser')
    # Save the images first: this rewrites the <img> tags the HTML is built from.
    save_images(soup, folder_path)
    with open(f'{folder_path}/{post_id}.html', "w", encoding="utf-8") as f:
        f.write(soup.prettify())


def main():
    """Walk every listing page and archive each post linked from an ``<li>``."""
    for page_number in range(FIRST_PAGE, LAST_PAGE + 1):
        page = str(page_number)
        listing = BeautifulSoup(fetch_text(LIST_URL_PREFIX + page), 'html.parser')
        for li in listing.find_all("li"):
            anchor = li.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # <li> without a link (or with an empty href): not a post entry.
                continue
            save_post(page, href)


if __name__ == "__main__":
    main()