Python爬虫入门实战项目--爬取新笔趣阁小说

1、网页查看

在这里插入图片描述

进入“全部小说”页面，这里列出的就是我们要爬取的小说，足够看很长时间了。
在这里插入图片描述

2、完整代码及注释分析

import requests
from bs4 import BeautifulSoup
import os
import re

# Browser-like User-Agent header passed to the requests made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
}

# Root directory under which the downloaded novels are saved.
path = "./小说"
# Create the directory if it is missing. exist_ok=True avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(path, exist_ok=True)

# Index page that lists the novels to crawl.
url = "http://www.xbiquge.la/xiaoshuodaquan/"
# Send the GET request. The timeout keeps a stalled connection from hanging
# the script forever, and raise_for_status() stops on an HTTP error instead
# of silently parsing an error page.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
# Force UTF-8 decoding; otherwise the page text comes out garbled.
response.encoding = "utf-8"
# Parse the page.
data = BeautifulSoup(response.text, "html.parser")

# See figure 1: collect every <li> under the first element whose class is
# "novellist". Fail with a clear message if the page layout is not as expected.
novel_list = data.find(class_="novellist")
if novel_list is None:
    raise RuntimeError("novellist element not found on index page: " + url)
ul = novel_list.find_all("li")

# Advertisement text the site embeds in chapter bodies; stripped before saving.
_AD_TEXT = "亲,点击进去,给个好评呗,分数越高更新越快,据说给新笔趣阁打满分的最后都找到了漂亮的老婆哦!手机站全新改版升级地址：http://m.xbiquge.la，数据和书签与电脑站同步，无广告清新阅读！"
# Compiled once; used to collapse every whitespace run in a chapter body.
_WHITESPACE = re.compile(r"\s+")


def _safe_name(name):
    """Replace characters that are not allowed in file names with underscores."""
    return re.sub(r'[\\/:*?"<>|]', "_", name)


# Walk every novel entry found on the index page.
for li in ul:
    # See figure 2: the entry's <a> carries the novel title and detail-page URL.
    link = li.find("a")
    if link is None or not link.get("href"):
        continue
    # Novel title
    name = link.text
    # Detail-page URL
    page_url = link["href"]

    # Build this novel's directory from the unchanged root `path`.
    # Re-assigning `path` here would nest each novel inside the previous one.
    novel_dir = os.path.join(path, _safe_name(name))
    print("正在爬取："+name)
    os.makedirs(novel_dir, exist_ok=True)

    # Request the novel's detail page; skip this novel if the request fails.
    try:
        page_response = requests.get(url=page_url, headers=headers, timeout=10)
        page_response.raise_for_status()
    except requests.RequestException as err:
        print("跳过小说（请求失败）：" + name + " " + str(err))
        continue
    page_response.encoding = "utf-8"
    page_data = BeautifulSoup(page_response.text, "html.parser")

    # See figure 3: each <dd> under the first <dl> is one chapter entry.
    chapter_list = page_data.find("dl")
    if chapter_list is None:
        print("跳过小说（未找到章节列表）：" + name)
        continue

    for dd in chapter_list.find_all("dd"):
        # See figure 4: the <a> holds the chapter title and a site-relative URL.
        chapter_link = dd.find("a")
        if chapter_link is None or not chapter_link.get("href"):
            continue
        chapter = chapter_link.text
        chapter_url = "http://www.xbiquge.la" + chapter_link["href"]

        # Request the chapter page; skip this chapter if the request fails.
        try:
            res = requests.get(url=chapter_url, headers=headers, timeout=10)
            res.raise_for_status()
        except requests.RequestException as err:
            print("跳过章节（请求失败）：" + chapter + " " + str(err))
            continue
        res.encoding = "utf-8"

        # See figure 5: the chapter body is the element with id "content".
        # Skip the chapter when it is missing, so a previous chapter's text is
        # never written under this chapter's name.
        content = BeautifulSoup(res.text, "html.parser").select("#content")
        if not content:
            print("跳过章节（未找到正文）：" + chapter)
            continue
        text = content[0].text

        # Turn each whitespace run into a line break plus a tab indent, trim
        # leading/trailing CR/LF characters, then drop the advertisement text.
        section_text = _WHITESPACE.sub('\r\n\t', text).strip('\r\n').replace(_AD_TEXT, "")

        # Save the chapter. Binary mode writes the "\r\n" sequences exactly
        # as built, with no platform newline translation.
        with open(os.path.join(novel_dir, _safe_name(chapter) + ".txt"), 'wb') as f:
            f.write(section_text.encode("UTF-8"))