First Steps with Web Crawlers: the requests Library
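The script below downloads a web novel from www.xs4.cc chapter by chapter, using requests to fetch the pages and re to pull the book title, chapter names, and chapter links out of the HTML. It leans on just two requests features, shown first in this minimal sketch (the timeout and the status-code print are illustrative additions, not part of the script itself):

import requests

resp = requests.get('https://www.xs4.cc/4_4289/', timeout=10)  # fetch the book's index page
resp.encoding = 'gbk'    # the site serves GBK-encoded HTML, so decode it as GBK
print(resp.status_code)  # 200 means the request succeeded
print(resp.text[:100])   # first 100 characters of the decoded page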

# _*_ coding: utf-8 _*_
# __author__: slv
# 2020/8/26
import requests
import re
import os

urlweb = 'https://www.xs4.cc/4_4289/'
bookid = re.findall('cc/(.*?)/', urlweb)[0]  # extract the book id from the index URL
print(bookid)
bookrootpath = r'D:\books'
rebook = requests.get(url=urlweb)
rebook.encoding = 'gbk'  # the site serves GBK-encoded pages
bookname = re.findall('<h1>(.*?)</h1>', rebook.text, re.S)[0]  # extract the book title
bookpath = f'{bookrootpath}/{bookname}'  # full path of the book's folder
if os.path.exists(bookpath):
    print('This book already exists!')
else:
    os.mkdir(bookpath)  # create a folder named after the book

chapterurllist = re.findall(rf'<a href="/{bookid}/(.*?)\.html', rebook.text, re.S)  # URL slug of each chapter link

chapternamelist = re.findall(r'\.html">(.*?)</a></dd>', rebook.text, re.S)  # name of each chapter

# write one chapter's content to its own file
def writecontent(chaptername, content):
    with open(f'{bookpath}/{chaptername}', 'w', encoding='utf-8') as f:  # UTF-8 keeps the output consistent across platforms
        f.write(content)

# fetch one chapter's content; parameter: the chapter URL
def getbookcontent(chapterlink):
    recontent = requests.get(url=chapterlink)
    recontent.encoding = 'gbk'  # chapter pages are GBK-encoded as well
    content = re.findall('<div id="content">(.*?)</div>', recontent.text, re.S)[0]
    return content
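# (A single chapter can also be fetched on its own, e.g.
#  print(getbookcontent(f'{urlweb}<chapter-id>.html'))
#  where <chapter-id> is a placeholder for one of the slugs in chapterurllist.)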


# start at index 9 to skip the leading entries (presumably the site's
# "latest chapters" preview links, which duplicate chapters listed later)
for i in range(9, len(chapternamelist)):
    chapterlink = f'{urlweb}{chapterurllist[i]}.html'
    print(chapterlink)
    # fetch the chapter and strip the HTML whitespace entities and line-break tags
    content = getbookcontent(chapterlink).replace("&nbsp;", "").replace("<br />", "")
    # write the chapter's content to its file
    writecontent(chapternamelist[i], content)
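When the loop finishes, every chapter sits in its own file under D:\books\<book title>\, one file per chapter name, so the novel can be read offline chapter by chapter.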

 
