A simple Python crawler

Over the last couple of days I went through the basics of Python syntax and, following an online tutorial, crawled the Baidu Baike entries related to Python. The crawler uses the beautifulsoup4 package.

The workflow is as follows.

First is a URL manager, which is responsible for adding URLs, handing out the next URL to crawl, and checking whether any uncrawled URLs remain.

# coding:UTF8
# URL manager: keeps track of URLs waiting to be crawled and URLs already crawled
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs not yet crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return

        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return

        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
url_manager.py
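
To see how the manager behaves, here is a quick throwaway snippet (illustrative only, not part of the crawler); the URL is the same entry URL used later as the crawl root:

manager = UrlManager()
manager.add_new_url("http://baike.baidu.com/view/21087.htm")
manager.add_new_url("http://baike.baidu.com/view/21087.htm")  # duplicate, silently ignored
print manager.has_new_url()   # True
print manager.get_new_url()   # http://baike.baidu.com/view/21087.htm
print manager.has_new_url()   # False -- the url has moved to old_urls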

Next is a page downloader, which fetches the HTML of a page.

# Page downloader based on urllib2 (Python 2)
import urllib2

class HtmlDownloader(object):

    def download(self, url):
        if url is None:
            return None

        # Fetch the page
        response = urllib2.urlopen(url)

        # If the status code is not 200 the request failed, return None
        if response.getcode() != 200:
            return None

        # Return the page content as a string
        return response.read()
html_downloader.py
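
Note that urllib2 only exists on Python 2. If you are on Python 3, a roughly equivalent downloader (just a minimal sketch, assuming the default urlopen behaviour is good enough) would be:

from urllib.request import urlopen   # Python 3: urllib2 was folded into urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urlopen(url)
        if response.getcode() != 200:
            return None
        return response.read()   # bytes; BeautifulSoup handles the decoding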

Next is the page parser, which extracts new URLs from the page and pulls out the content we want to keep.

from bs4 import BeautifulSoup
import re
import urlparse


class HtmlParser(object):

    def _get_new_url(self, page_url, soup):
        new_urls = set()
        # Entry links look like /view/123.htm
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)

        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}

        # url
        res_data['url'] = page_url

        # <dd class="lemmaWgt-lemmaTitle-title"> <h1>Python</h1>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()

        # <div class="lemma-summary">
        summary_node = soup.find('div', class_="lemma-summary")
        res_data['summary'] = summary_node.get_text()

        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return

        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_url(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
html_parser.py
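
To see what the parser returns, here is an illustrative run against a tiny hand-written fragment (it only mimics the class names Baidu Baike used at the time, and the linked entry URL is made up):

html = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/10812319.htm">a related entry</a>
'''
parser = HtmlParser()
new_urls, new_data = parser.parse("http://baike.baidu.com/view/21087.htm", html)
print new_urls              # a set containing http://baike.baidu.com/view/10812319.htm
print new_data['title']     # Python
print new_data['summary']   # Python is a programming language.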

Next is the page outputer, which saves the content we collected.

class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = open('output.html', 'w')

        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")

        # Python 2 writes ascii by default, so encode the unicode fields as utf-8
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")

        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")

        fout.close()
html_outputer.py
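
As a quick check, the outputer can be fed a hand-made record (the values below are just placeholders) and it writes a one-row table to output.html:

outputer = HtmlOutputer()
outputer.collect_data({'url': 'http://baike.baidu.com/view/21087.htm',
                       'title': u'Python',
                       'summary': u'An interpreted, general-purpose language.'})
outputer.output_html()   # output.html now contains a single table row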

Finally, the main entry point that ties everything together; the comments should make it fairly clear.

# coding:UTF8
# Crawl related pages starting from a root url
from baike_spider import url_manager, html_downloader, html_parser, html_outputer


class SpiderMain(object):
    # The constructor creates the url manager, downloader, parser and outputer
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        # Number of pages crawled so far
        count = 1

        # Seed the manager with the root url
        self.urls.add_new_url(root_url)
        # Keep going while there are uncrawled urls left
        while self.urls.has_new_url():
            try:
                # Take the next url
                new_url = self.urls.get_new_url()
                print 'craw %d:%s' % (count, new_url)

                # Download the page content
                html_cont = self.downloader.download(new_url)

                # Extract new urls and the data we want from the page
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)

                # Collect the data for later output
                self.outputer.collect_data(new_data)

                if count == 1000:
                    break

                count = count + 1

            except:
                print 'craw failed'

        # Write everything collected to output.html
        self.outputer.output_html()

if __name__ == "__main__":
    root_url = "http://baike.baidu.com/view/21087.htm"
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
spider_main.py
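
For the import from baike_spider to work, I am assuming the modules sit together in a package directory named baike_spider (the post does not show the project tree), roughly like this, with the script started from the parent directory on the Python path (for example via python -m baike_spider.spider_main, or from an IDE that adds the project root to sys.path):

baike_spider/
    __init__.py
    url_manager.py
    html_downloader.py
    html_parser.py
    html_outputer.py
    spider_main.py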

Running the spider prints a line like "craw 1:http://baike.baidu.com/view/21087.htm" for each page it fetches, and the extracted url, title and summary of each entry are saved to the output.html file.
