Using Python to crawl 1000 pages related to the Python entry on Baidu Baike

1. Analyze the target and decide the crawling strategy

1) Entry page

Note down the entry URL: https://baike.baidu.com/item/Python/407313?fr=aladdin

 

2) URL format


<a href="/item/%E5%8D%A1%E8%80%90%E5%9F%BA%E6%A2%85%E9%9A%86%E5%A4%A7%E5%AD%A6" target="_blank">卡耐基梅隆大学</a>

<a href="/item/MATLAB" target="_blank">MATLAB</a>

Each entry hyperlink is an incomplete (relative) URL, so it has to be joined with the site's base address before the page can be requested, as shown in the sketch below.
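
A minimal sketch of turning such a relative href into an absolute URL with urllib.parse.urljoin (the same approach the parser in section 2 uses); the example href is taken from the anchor tag above:

from urllib.parse import urljoin

page_url = 'https://baike.baidu.com/item/Python/407313'
relative_href = '/item/MATLAB'

# Join the relative entry link against the current page URL.
full_url = urljoin(page_url, relative_href)
print(full_url)  # https://baike.baidu.com/item/MATLAB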

 

3) Data format

 

<dd class="lemmaWgt-lemmaTitle-title">
    <h1>Python</h1>
    ......
</dd>

 

 

<div class="lemma-summary" label-module="lemmaSummary">
    <div class="para" label-module="para">Python是一种跨平台的
    ......
    </div>
</div> 

 

The title is contained in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>, and the summary is contained in <div class="lemma-summary">...</div>, as illustrated by the sketch below.
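
A minimal sketch of pulling the title and summary out of markup like the snippets above with BeautifulSoup (the HTML parser in section 2 applies the same selectors to the full page):

from bs4 import BeautifulSoup

html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary" label-module="lemmaSummary">
    <div class="para" label-module="para">Python是一种跨平台的...</div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1').get_text()
summary = soup.find('div', class_='lemma-summary').get_text().strip()
print(title)    # Python
print(summary)  # Python是一种跨平台的...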

 

4) Page encoding: UTF-8

 

2. Writing the code

1) Scheduler

from practice.spider.baike_spider import url_manager, html_parser, html_outputer, html_downloader


class SpiderMain(object):

    def __init__(self):
        # Wire up the four components: URL manager, downloader, parser and outputer.
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print(f'craw {count}:{new_url}')
                # The downloader returns bytes; decode them as UTF-8 (see section 1.4).
                html_cont = str(self.downloader.download(new_url), 'utf-8')
                # print(html_cont)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                # Stop after 1000 pages.
                if count == 1000:
                    break

                count = count + 1
            except:
                print('craw failed')

        self.outputer.output_html()


if __name__ == '__main__':
    root_url = 'https://baike.baidu.com/item/Python/407313'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
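
A note on the decoding step: str(self.downloader.download(new_url), 'utf-8') converts the raw bytes returned by the downloader into text, which works because the pages are UTF-8 encoded (section 1, item 4). The same conversion can be written with bytes.decode, which some may find clearer; a minimal illustration with a stand-in byte string:

html_bytes = b'<html><h1>Python</h1></html>'  # stand-in for the downloader's output

# These two lines are equivalent ways to decode UTF-8 bytes into a str.
html_cont = str(html_bytes, 'utf-8')
html_cont = html_bytes.decode('utf-8')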

 

2) URL manager

class UrlManager(object):

    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return

        # Only queue URLs that have not been seen before.
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return

        for url in urls:
            self.add_new_url(url)
        # print('new_urls: ', self.new_urls)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)

        return new_url

    def has_new_url(self):
        return len(self.new_urls) != 0
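
The two sets give de-duplication for free: a URL is only queued if it appears in neither new_urls nor old_urls, so no page is crawled twice. A quick check of that behavior (a sketch, assuming the UrlManager class above is in scope):

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/item/MATLAB')
manager.add_new_url('https://baike.baidu.com/item/MATLAB')  # duplicate, ignored

print(manager.has_new_url())   # True
url = manager.get_new_url()    # moves the URL into old_urls
manager.add_new_url(url)       # already crawled, ignored
print(manager.has_new_url())   # False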

 

3) HTML downloader

# import urllib.request
from urllib import request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        response = request.urlopen(url)
        if response.getcode() != 200:
            return None

        # Return the raw page bytes; the scheduler decodes them as UTF-8.
        return response.read()
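
urlopen with the default settings is enough for this crawl, but some sites reject the default Python User-Agent or respond very slowly. If that happens, sending a browser-like User-Agent via a Request object and setting an explicit timeout is a common workaround; a sketch of such an alternative downloader (the header and timeout values are just examples, not part of the original project):

from urllib import request


class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        # Browser-like User-Agent and an explicit timeout; both values are examples.
        req = request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        response = request.urlopen(req, timeout=10)
        if response.getcode() != 200:
            return None

        return response.read()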

 

4) HTML parser

from bs4 import BeautifulSoup
from urllib.parse import urljoin
import urllib.parse
import re


class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        new_urls = set()

        # Collect all entry links, i.e. <a> tags whose href contains /item/
        '''<a target="_blank" href="/item/Unix%20shell">Unix shell</a>'''
        links = soup.find_all('a', href=re.compile(r'/item/'))

        for link in links:
            new_url = link['href']
            # Turn the relative href into an absolute URL.
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title">
        # <h1>Python</h1>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(page_url, soup)
        # print('new_urls: ', new_urls)
        new_data = self._get_new_data(page_url, soup)
        # print('new_data: ', new_data)

        return new_urls, new_data
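
Note that _get_new_data assumes every page has the expected title and summary nodes; on a page that lacks them, soup.find returns None, the chained call raises AttributeError, and the scheduler's except clause reports it only as "craw failed". A more defensive drop-in variant of that method could guard the lookups (a sketch, not the original code):

    def _get_new_data(self, page_url, soup):
        res_data = {'url': page_url}

        # Guard against pages that do not have the expected structure.
        title_dd = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        res_data['title'] = title_dd.find('h1').get_text() if title_dd else ''

        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text() if summary_node else ''

        return res_data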

 

5) HTML outputer

from urllib.parse import unquote


class HtmlOutputer(object):

    def __init__(self):
        self.datas = []

    def output_html(self):

        with open('output.html', 'w', encoding='utf-8') as fout:
            fout.write('<html>')
            fout.write('<body>')
            fout.write('<table border="1" cellspacing="0" cellpadding="0">')

            for data in self.datas:
                fout.write('<tr>')
                # unquote turns the percent-encoded URL back into readable characters.
                fout.write(f'<td>{unquote(data["url"], encoding="utf-8")}</td>')
                fout.write(f'<td>{data["title"]}</td>')
                fout.write(f'<td>{data["summary"]}</td>')
                fout.write('</tr>')

            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')

    def collect_data(self, data):
        if data is None:
            return

        self.datas.append(data)
        # print('self.datas: ', self.datas)
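
The title and summary are written into the table as-is; if they ever contain characters such as '<' or '&', the generated HTML can break. Escaping each field with html.escape is a simple safeguard (a sketch, not part of the original outputer):

import html

title = 'C++'           # example values standing in for data["title"] / data["summary"]
summary = 'a < b & c'

row = f'<td>{html.escape(title)}</td><td>{html.escape(summary)}</td>'
print(row)  # <td>C++</td><td>a &lt; b &amp; c</td>

Adding a <meta charset="utf-8"> tag to the generated page also helps browsers display the Chinese summaries correctly when output.html is opened locally.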

 

3. Crawl results

 

Source code download: https://github.com/Nie-quan/spider.git

 
