
[Web Scraping] Project: Scraping e-books from https://www.kanunu8.com/book2

1) Using regular expressions

# Use the requests library and regular expressions to scrape an e-book of your choice from https://www.kanunu8.com/book3/
import requests
import re
import os
import time

header = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}


def main(base_url, book_name):
    os.makedirs(book_name, exist_ok=True)
    # Request the index page and decode it with the detected encoding
    req = requests.get(base_url, headers=header)
    content = req.content.decode(req.apparent_encoding)
    # Keep only the chapter table, between the "正文" marker and </tbody>
    content = re.findall('正文(.*?)</tbody>', content, re.S)
    chap_url_list, chap_name_list = get_condition(content)
    x = 0
    for i in range(len(chap_name_list)):
        chap_name = chap_name_list[i]
        chap_url = chap_url_list[i]
        chap_txt = get_txt(base_url, chap_url)
        save(book_name, chap_name, chap_txt)
        if chap_name == '前言':
            print("前言保存成功")
        else:
            print(f"第{x}章保存成功")
        x = x + 1
# Get the chapter URL list and the chapter title list from the index table
def get_condition(content):
    for item in content:
        # Chapter pages are named as six digits followed by .html
        chap_url_list = re.findall(r'[0-9]{6}\.html', item)
        # Chapter titles sit in the <a> tags; the 1831xx prefix is hard-coded for this book
        chap_name_list = re.findall(r'<a href="1831[0-9][0-9]\.html">(.+)</a></td>', item)
    return chap_url_list, chap_name_list

# Get the text of a single chapter
def get_txt(base_url, chap_url):
    # Reduce the index URL to the book's directory, then append the chapter file name
    base_url = re.search('https://www.kanunu8.com/book3/[0-9]{4}/', base_url).group(0)
    url = base_url + chap_url

    # Request the chapter page
    req = requests.get(url, headers=header)
    chap_txt = req.content.decode('gbk')
    # The chapter body is the first <p>...</p> block in the page source
    chap_txt = re.findall(r'<p>(.*?)</p>', chap_txt, re.S)[0]

    # Data cleaning: strip &nbsp; entities and <br /> tags
    chap_txt = chap_txt.replace('&nbsp;', "")
    chap_txt = chap_txt.replace('<br />', "")

    return chap_txt

# Save a chapter to a text file in the book's directory
def save(book_name, chap_name, chap_txt):
    chap_name = chap_name + '.txt'
    with open(os.path.join(book_name, chap_name), 'w', encoding='gbk') as file:
        file.write(chap_txt)


if __name__ == '__main__':
    base_url = "https://www.kanunu8.com/book3/8259/index.html"
    book_name = "孽海花"
    main(base_url, book_name)
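
The script imports time but never calls it. If the site throttles rapid requests, a short pause (and a retry) between chapter downloads is cheap insurance. Below is a minimal sketch under that assumption; polite_get is a hypothetical helper, not part of the original script, and get_txt() could call it in place of requests.get():

import time
import requests

def polite_get(url, headers, delay=1.0, retries=3):
    # Hypothetical helper: wait `delay` seconds before each attempt and
    # retry failed requests up to `retries` times before giving up.
    for attempt in range(retries):
        time.sleep(delay)
        try:
            req = requests.get(url, headers=headers, timeout=10)
            req.raise_for_status()  # turn HTTP error status codes into exceptions
            return req
        except requests.RequestException:
            if attempt == retries - 1:
                raise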

2) Using bs4

# Use the requests and beautifulsoup4 libraries to scrape an e-book from https://www.kanunu8.com/book2

import os
import re

import cchardet
import requests
from bs4 import BeautifulSoup  # lxml must also be installed for the 'lxml' parser below
from fake_useragent import UserAgent

# Build request headers with a random User-Agent
header = {
    'user-agent': UserAgent().random
}
# Fetch a page, set its encoding from cchardet's detection, and strip <br /> tags
def getSource(url):
    req = requests.get(url, headers=header)
    req.encoding = cchardet.detect(req.content)['encoding']
    return req.text.replace("<br />", "")

# Parse the index page: print the book info, then fetch and save every chapter
def getUrl(source):
    bs = BeautifulSoup(source, 'lxml')

    # Author: sits in a centered table cell of the form "作者:xxx "
    author = bs.select('tr>td[align="center"][height="30"][valign="middle"]')[0].string
    author = re.search('作者:(.*?) ', str(author).strip()).group(1)

    # Book title
    book_name = bs.select('h1>strong>font[color="#dc143c"]')[0].string

    # Synopsis
    introduction = bs.select('tr[align="left"]>td[class="p10-24"]')[0].strings
    introduction = ''.join([i.replace("内容简介:", "").strip() for i in introduction])

    print("书名:", book_name, "\n作者:", author, "\n内容简介:", introduction)

    # Chapter links and chapter names from the table rows
    a = bs.select('tr[bgcolor="#ffffff"]>td>a[href]')
    for i in a:
        chap_name = i.string
        chap_url = i['href']
        chap_txt = getContent(chap_url)
        save(book_name, chap_name, chap_txt)
        print(f'{chap_name}保存成功')

# Fetch a chapter page and extract the chapter text
def getContent(chap_url):
    # base_url is defined in the __main__ block below
    url = base_url + chap_url
    print(url)
    source = getSource(url)
    bs = BeautifulSoup(source, 'lxml')
    # BeautifulSoup decodes &nbsp; entities to '\xa0', so strip both forms
    content = str(bs.select('p')[0].string).replace('&nbsp;', "").replace('\xa0', "")
    return content

# Save a chapter to a text file in the book's directory
def save(book_name, chap_name, chap_txt):
    chap_name = chap_name + '.txt'
    # Create the book directory if it does not exist yet
    if not os.path.exists(book_name):
        os.mkdir(book_name)
    with open(os.path.join(book_name, chap_name), 'w+', encoding='utf-8') as f:
        f.write(chap_txt)

if __name__ == '__main__':
    base_url = "https://www.kanunu8.com/book3/8196/"
    source = getSource(base_url)
    getUrl(source)
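
Both versions also trust the scraped chapter titles as file names. A title containing characters such as ? or : would make open() fail on Windows, so a small sanitizer can be applied to chap_name inside save(). sanitize_filename is a hypothetical addition, not part of the original scripts:

import re

def sanitize_filename(name):
    # Hypothetical helper: replace characters Windows forbids in file names.
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()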