博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅  :: 管理

BeautifulSoup之诗词名句网站中三国演义小说内容抓取

Posted on 2019-03-20 16:40  TigerAt  阅读(176)  评论(0)    收藏  举报
 1 #导包
 2 import requests
 3 from bs4 import BeautifulSoup
 4 
 5 
 6 
 7 #指定url
 8 url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
 9 
10 #发起请求
11 headers = {
12         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
13     }
14 
def get_content(url, timeout=10):
    """Fetch one chapter page and return the text of its content container.

    Args:
        url: URL of the chapter page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<div class="chapter_content">`` on the page.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the page contains no ``chapter_content`` div.
    """
    # Without a timeout, requests may block indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a chapter.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    chapter = soup.find('div', class_="chapter_content")
    if chapter is None:
        # find() returns None when nothing matches; report that clearly
        # rather than failing with an AttributeError on `.text`.
        raise ValueError("no 'chapter_content' div found at %s" % url)
    return chapter.text
26     
27     
28     
29     
30     
# Download the table-of-contents page that links to every chapter.
# The timeout keeps the script from hanging forever on a stalled server.
response = requests.get(url=url, headers=headers, timeout=10)
# Stop on an HTTP error status rather than parsing an error page.
response.raise_for_status()
page_text = response.text

# Chapter links are the <a> elements matched by this CSS selector.
soup = BeautifulSoup(page_text, 'lxml')
contents = soup.select('.book-mulu > ul > li > a')

# Persist all chapters into a single UTF-8 text file. The with-block
# guarantees the file is flushed and closed even if a download fails midway.
with open('./三国演义.txt', 'w', encoding='utf-8') as fp:
    for num, content in enumerate(contents, start=1):
        content_url = 'http://www.shicimingju.com' + content['href']
        # `.string` is None when the link holds nested tags, which would
        # break the concatenation below; get_text() always returns a str.
        title = content.get_text()
        print("开始下载第%d章:%s" % (num, title))
        content_detail = get_content(content_url)
        fp.write(title + ':' + content_detail + "\n\n\n")
        print("第%d章下载完毕" % num)

print("全部数据写入完毕")

注:此代码仅供学习参阅