python爬虫实战(一)--TXT小说下载
学习了Python3 爬虫实战教程_w3cschool的教程
第一次做爬虫,练手网站是笔趣阁(http://www.ibiqu.net/),反正他们也是爬别人的 ^_^!
将源码贴出来给和我一样的菜鸟参考,代码有点乱,没有写def,也没有做什么优化。
有两个引用的库得单独安装一下
pip install beautifulsoup4
pip install requests
手册地址:http://beautifulsoup.readthedocs.io/zh_CN/latest/
from bs4 import BeautifulSoup
import requests
import re
import time
if __name__ == '__main__':
    # Interactive TXT novel downloader for www.ibiqu.net:
    # search for a book by its exact title, pick a starting chapter number,
    # then append every chapter's paragraphs to D:/pydown/<book>.txt.
    while True:  # prompt for the next book forever (original loop never exits either)
        bookname = input('请输入要下载的书名:')
        # Site search endpoint; URL kept byte-identical to the original.
        target = 'http://www.ibiqu.net//modules/article/search.php?searchkey=' + bookname
        req = requests.get(url=target)
        soup = BeautifulSoup(req.text, 'html.parser')
        found = False
        for link in soup.find_all('a'):
            # Only an <a> whose text equals the title exactly counts as a hit.
            if link.string != bookname:
                continue
            found = True
            target2 = 'http://www.ibiqu.net' + link.get('href')  # table-of-contents page
            html2 = requests.get(url=target2).text
            # Chapter links live between the '正文' marker and the closing </dl>.
            # Guard both searches: re.search returns None when the site layout
            # changes, and the original crashed with AttributeError here.
            head = re.search('正文', html2)
            tail = re.search('</dl>', html2)
            if head is None or tail is None:
                print('目录解析失败,站点结构可能已变更!')
                continue
            toc = BeautifulSoup(html2[head.end() + 5:tail.start()], 'html.parser')
            chapters = toc.find_all('a')
            print('本书共找到' + str(len(chapters)) + '个章节')  # fixed doubled '找到到'
            n = int(input('请输入开始下载章节(阿拉伯数字):'))  # 1-based start index
            path = 'D:/pydown/' + bookname + '.txt'
            # 'with' closes the file even when a request raises, replacing the
            # original's leak-prone manual open/close/reopen "cache flush".
            with open(path, mode='a', encoding='utf-8') as f:
                for chap in chapters[n - 1:]:
                    name_b = chap.string or ''  # chapter title; None-safe
                    f.write(name_b + '\n')      # write chapter heading
                    target3 = 'http://www.ibiqu.net' + chap.get('href')  # chapter page
                    page = BeautifulSoup(requests.get(url=target3).text, 'html.parser')
                    divs = page.find_all('div', id='content')
                    print('开始写入' + name_b)
                    if divs:  # skip pages missing the content container
                        for para in divs[0].find_all('p'):
                            if para.string:  # drop empty paragraphs
                                f.write(para.string + '\n')
                    n += 1
                    if n % 500 == 0:  # periodic flush keeps data safe on long runs
                        f.flush()
                        print('************缓存清理完成!************')
                    time.sleep(2)  # be polite: don't hammer the server
            print('下载结束!')
        if not found:
            print('找不到此书,请重新输入正确书名!')

浙公网安备 33010602011771号