# _*_ coding: utf-8 _*_
# __author__: slv
# 2020/8/26
import requests
import re
import os
# --- Script setup: fetch the novel's index page and prepare the output folder ---
urlweb = 'https://www.xs4.cc/4_4289/'  # index page of the target novel
bookid = re.findall(f'cc/(.*?)/', urlweb)[0]  # slug between 'cc/' and '/', e.g. '4_4289'
print(bookid)
bookrootpath = r'D:\books'  # root directory that holds every downloaded book
rebook = requests.get(url=urlweb)
rebook.encoding = 'gbk'  # the site serves gbk-encoded pages
bookname = re.findall(f'<h1>(.*?)</h1>', rebook.text, re.S)[0]  # extract the book title
bookpath = f'{bookrootpath}/{bookname}'  # full path of this book's folder
if os.path.exists(bookpath):
    print('this book has existed!')
else:
    os.mkdir(bookpath)  # create a folder named after the book (bookrootpath must already exist)
chapterurllist = re.findall(f'<a href="/{bookid}/(.*?).html', rebook.text, re.S)  # per-chapter URL slugs
chapternamelist = re.findall('.html">(.*?)</a></dd>', rebook.text, re.S)  # per-chapter titles
def writecontent(chaptername, content):
    """Write one chapter's text to a file named after the chapter.

    The file is created inside the module-level ``bookpath`` folder.

    Parameters:
        chaptername: chapter title, used directly as the file name
            (assumed to contain no characters invalid in a file name
            -- TODO confirm against the scraped titles).
        content: chapter body text to write.
    """
    # encoding='utf-8' is specified explicitly: the content was decoded
    # from gbk and contains Chinese text, which the platform's default
    # locale encoding may be unable to represent (UnicodeEncodeError).
    with open(f'{bookpath}/{chaptername}', 'w', encoding='utf-8') as f:
        f.write(content)
def getbookcontent(chapterlink, timeout=30):
    """Fetch one chapter page and return its raw HTML body.

    Parameters:
        chapterlink: full URL of the chapter page.
        timeout: seconds before the HTTP request is aborted; added with
            a default so existing callers are unaffected.

    Returns:
        The inner HTML of the page's ``<div id="content">`` element
        (tags such as ``<br />`` are NOT stripped here).

    Raises:
        IndexError: if the page contains no ``<div id="content">``.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    # requests.get has no default timeout -- without one a stalled
    # server would hang the whole download loop forever.
    recontent = requests.get(url=chapterlink, timeout=timeout)
    recontent.encoding = 'gbk'  # the site serves gbk-encoded pages
    content = re.findall('<div id="content">(.*?)</div>', recontent.text, re.S)[0]
    return content
# --- Download every chapter and write it to disk ---
# The first entries of the index's link list are "latest chapter"
# teasers that duplicate real chapters, so they are skipped.
# (Was a bare magic number; the unused `count = 0` was removed.)
SKIPPED_HEADER_LINKS = 9
for i in range(SKIPPED_HEADER_LINKS, len(chapternamelist)):
    chapterlink = f'{urlweb}{chapterurllist[i]}.html'
    print(chapterlink)
    # Fetch the chapter, then strip padding and <br /> tags.
    # NOTE(review): the first .replace target looks like whitespace/&nbsp;
    # padding copied from the original source -- confirm the exact
    # characters against the live pages.
    content = getbookcontent(chapterlink).replace(" ", "").replace("<br />", "")
    writecontent(chapternamelist[i], content)