Python爬虫之爬取小说(四)
爬取《坏蛋是怎样炼成的》
# Third-party libraries
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request (anti-scraping workaround).
# The original literal contained a stray backslash left over from a line
# continuation; the value is now built from adjacent string literals instead.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    )
}


def open_url(url, timeout=10):
    """Download *url* and return the page body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body, decoded with the encoding detected from the
        content itself (``apparent_encoding``).

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on 4xx/5xx instead of parsing an error page as a chapter.
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def get_title(tit):
    """Extract the chapter title from the page HTML.

    Args:
        tit: HTML source of the chapter page.

    Returns:
        The text of the first ``<h2>`` inside the first
        ``<div class="post_title">``, wrapped in a leading and a trailing
        newline.

    Raises:
        ValueError: If the page has no such ``<div>`` or it holds no ``<h2>``.
    """
    soup = BeautifulSoup(tit, 'lxml')
    title_block = soup.find('div', class_="post_title")
    if title_block is None or title_block.h2 is None:
        raise ValueError('chapter title (div.post_title h2) not found in page')
    return '\n' + title_block.h2.get_text() + '\n'


def get_text(txt):
    """Collect the paragraph tags that make up the chapter body.

    Args:
        txt: HTML source of the chapter page.

    Returns:
        A list of the ``<p>`` tags found inside every
        ``<div class="post_entry">``, in document order. The list is empty
        when the page contains no such ``<div>``.
    """
    soup = BeautifulSoup(txt, 'lxml')
    paragraphs = []
    for entry in soup.find_all('div', class_="post_entry"):
        paragraphs.extend(entry.find_all('p'))
    return paragraphs


def save_title(filename, tit):
    """Append the chapter title *tit* to ``<filename>.txt``."""
    # Explicit UTF-8: the platform default encoding may be unable to
    # represent every character of the Chinese text.
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tit)


def save_text(filename, tex, num):
    """Append chapter text *tex* to ``<filename>.txt``.

    Args:
        filename: Output file name without the ``.txt`` extension.
        tex: Text to append.
        num: Chapter number. Unused; kept so existing calls keep working.
    """
    with open(filename + '.txt', 'a', encoding='utf-8') as file:
        file.write(tex)


def main():
    """Ask for a chapter number, download that chapter and append it to disk."""
    num = input('你想要下载《坏蛋是怎样炼成的》第几章?(共346章节)')
    try:
        num = int(num)
    except ValueError:
        print('请输入有效的章节数字!')
        return

    filename = '坏蛋是怎样炼成的'
    url = 'http://www.huaidan1.com/' + str(num) + '.html'

    # Fetch and parse everything before touching the output file, so a failed
    # download never leaves a title without its text.
    text = open_url(url)
    tit = get_title(text)
    tex = get_text(text)

    save_title(filename, tit)
    # Write the body in one call instead of reopening the file per paragraph.
    body = ''.join(' ' + i.get_text() + '\n' for i in tex)
    save_text(filename, body, num)
    print('第{}章已经下载完成!'.format(num))


if __name__ == '__main__':
    main()

浙公网安备 33010602011771号