一、小说下载
小说网址是:http://www.biqukan.com
import requests
from bs4 import BeautifulSoup
class downloader(object):
    """Scrape one novel's chapter list and chapter text from biqukan.com.

    Attributes set by ``__init__``:
        url: page whose ``<div class="listmain">`` holds the chapter links.
        serve: site root that is prepended to each href found on ``url``.
        page_url: chapter URLs collected by ``get_page_url``.
        page_name: chapter titles, index-aligned with ``page_url``.
    """

    # Seconds to wait on a request before giving up; without a timeout
    # requests.get can block forever on a stalled connection.
    request_timeout = 10

    # Number of leading anchors in the chapter list that are ignored.
    # NOTE(review): presumably these are "latest chapter" shortcuts that
    # duplicate entries further down -- confirm against the live page.
    skip_links = 12

    def __init__(self):
        self.url = 'http://www.biqukan.com/1_1408/'
        self.serve = 'http://www.biqukan.com'
        self.page_url = []
        self.page_name = []

    # Collect the link and the title of every chapter.
    def get_page_url(self):
        """Fetch ``self.url`` and append chapter URLs/titles to the lists.

        Results are appended to ``self.page_url`` and ``self.page_name``;
        nothing is returned.

        Raises:
            requests.RequestException: on a network failure, timeout or
                HTTP error status.
            IndexError: if the page has no ``<div class="listmain">``.
        """
        response = requests.get(self.url, timeout=self.request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        listing = soup.find('div', class_="listmain")
        if listing is None:
            raise IndexError(
                "no <div class='listmain'> found at %s" % self.url)
        # Search the tag we already have instead of re-parsing its string
        # form with an unspecified parser.
        for anchor in listing.find_all('a')[self.skip_links:]:
            href = anchor.get('href')
            if not href:
                # An anchor without a target cannot be downloaded; skipping
                # it keeps page_url and page_name aligned.
                continue
            self.page_url.append(self.serve + href)
            # get_text() always yields a str, whereas .string is None when
            # the anchor wraps more than one child node.
            self.page_name.append(anchor.get_text())

    # Text content of a chapter page.
    def get_html(self, url):
        """Download one chapter page and return its text.

        Args:
            url: address of the chapter page.

        Returns:
            The text of the first ``<div class="showtxt">``, with every
            ``<br>`` element turned into a newline so that a ``<br/><br/>``
            pair becomes a blank line between paragraphs.

        Raises:
            requests.RequestException: on a network failure, timeout or
                HTTP error status.
            IndexError: if the page has no ``<div class="showtxt">``.
        """
        response = requests.get(url, timeout=self.request_timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        body = soup.find('div', class_="showtxt")
        if body is None:
            raise IndexError("no <div class='showtxt'> found at %s" % url)
        # Extracting text discards markup, so replacing the literal
        # '<br/><br/>' afterwards can never match. Swap the <br> elements
        # for newlines while they still exist in the tree.
        for line_break in body.find_all('br'):
            line_break.replace_with('\n')
        return body.text

    # Write into the txt file.
    def writer(self, path, name, text):
        """Append one chapter to a UTF-8 text file.

        Args:
            path: file to append to (created if it does not exist).
            name: chapter title, written on its own line.
            text: chapter body, followed by one blank line.
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.write(text)
            f.write('\n\n')
if __name__ == '__main__':
    # Build the scraper and collect every chapter's title and URL.
    dl = downloader()
    dl.get_page_url()
    # page_name and page_url are filled in lockstep, so walk them as pairs
    # and append each downloaded chapter to the output file.
    for chapter_name, chapter_url in zip(dl.page_name, dl.page_url):
        dl.writer('小说.txt', chapter_name, dl.get_html(chapter_url))