python爬虫实战(一)--TXT小说下载
学习了Python3 爬虫实战教程_w3cschool的教程
第一次做爬虫,练手网站是笔趣阁(http://www.ibiqu.net/),反正他们也是爬别人的 ^_^!
将源码贴出来给和我一样的菜鸟参考,代码有点乱,没有写def,也没有做什么优化。
有两个引用的库得单独安装一下
pip install beautifulsoup4
pip install requests
手册地址:http://beautifulsoup.readthedocs.io/zh_CN/latest/
from bs4 import BeautifulSoup
import requests
import re
import time
if __name__ == '__main__':
    # Interactive TXT novel downloader for www.ibiqu.net:
    # search for a book by its exact title, pick a starting chapter number,
    # then append every chapter's paragraphs to D:/pydown/<book>.txt.
    while True:  # prompt for the next book forever (original loop never exits either)
        bookname = input('请输入要下载的书名:')
        # Site search endpoint; URL kept byte-identical to the original.
        target = 'http://www.ibiqu.net//modules/article/search.php?searchkey=' + bookname
        req = requests.get(url=target)
        soup = BeautifulSoup(req.text, 'html.parser')
        found = False
        for link in soup.find_all('a'):
            # Only an <a> whose text equals the title exactly counts as a hit.
            if link.string != bookname:
                continue
            found = True
            target2 = 'http://www.ibiqu.net' + link.get('href')  # table-of-contents page
            html2 = requests.get(url=target2).text
            # Chapter links live between the '正文' marker and the closing </dl>.
            # Guard both searches: re.search returns None when the site layout
            # changes, and the original crashed with AttributeError here.
            head = re.search('正文', html2)
            tail = re.search('</dl>', html2)
            if head is None or tail is None:
                print('目录解析失败,站点结构可能已变更!')
                continue
            toc = BeautifulSoup(html2[head.end() + 5:tail.start()], 'html.parser')
            chapters = toc.find_all('a')
            print('本书共找到' + str(len(chapters)) + '个章节')  # fixed doubled '找到到'
            n = int(input('请输入开始下载章节(阿拉伯数字):'))  # 1-based start index
            path = 'D:/pydown/' + bookname + '.txt'
            # 'with' closes the file even when a request raises, replacing the
            # original's leak-prone manual open/close/reopen "cache flush".
            with open(path, mode='a', encoding='utf-8') as f:
                for chap in chapters[n - 1:]:
                    name_b = chap.string or ''  # chapter title; None-safe
                    f.write(name_b + '\n')      # write chapter heading
                    target3 = 'http://www.ibiqu.net' + chap.get('href')  # chapter page
                    page = BeautifulSoup(requests.get(url=target3).text, 'html.parser')
                    divs = page.find_all('div', id='content')
                    print('开始写入' + name_b)
                    if divs:  # skip pages missing the content container
                        for para in divs[0].find_all('p'):
                            if para.string:  # drop empty paragraphs
                                f.write(para.string + '\n')
                    n += 1
                    if n % 500 == 0:  # periodic flush keeps data safe on long runs
                        f.flush()
                        print('************缓存清理完成!************')
                    time.sleep(2)  # be polite: don't hammer the server
            print('下载结束!')
        if not found:
            print('找不到此书,请重新输入正确书名!')

浙公网安备 33010602011771号