python抓取百度百科点赞数等动态数据
利用selenium 模拟浏览器打开页面,加载后抓取数据
#!/usr/bin/env python # coding=utf-8 import urllib2 import re from bs4 import BeautifulSoup from selenium import webdriver import time import sys reload(sys) sys.setdefaultencoding('utf-8') class BaikeSpider(): def __init__(self): self.queue = ["http://baike.baidu.com/view/8095.htm", "http://baike.baidu.com/view/2227.htm"] self.base = "http://baike.baidu.com" self.crawled = set() self.crawled_word = set() # client = MongoClient("localhost",27017) # self.db = client["baike_db"]["html"] def crawl(self): browser = webdriver.Chrome() cnt = 0 fw = open('./baike_keywords.txt','wb') while self.queue: url = self.queue.pop(0) if url in self.crawled : continue self.crawled.add(url) try: browser.get(url) res = {} links = BeautifulSoup(urllib2.urlopen(url).read(),'lxml').find_all("a") links = list(set(links)) for link in links: if 'href' not in dict(link.attrs) or re.search(u"javascript",link['href']) or len(link['href'])<8: continue url = link['href'] if re.search(u"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+/\d+.htm",url) and url not in self.crawled: self.queue.append(url) elif re.match(u"view/\d+",url): url = self.base+ url if url not in self.crawled: self.queue.append(url) cnt += 1 print cnt if cnt % 10 == 0: print 'queue',len(self.queue) fw.close() fw = open('./baike_keywords.txt','a+') res['url'] = url res['title'] = browser.title.split(u"_")[0] if res['title'] in self.crawled_word: print 'title',res['title'],'has crawled' continue vote = browser.find_element_by_class_name("vote-count") view = browser.find_element_by_id("j-lemmaStatistics-pv") res['voted'] = vote.text res['viewed'] = view.text line = [] line.append(res['title']) line.append(res['viewed']) line.append(res['voted']) line.append(res['url']) line = '\t'.join(line) fw.write(line+'\n') self.crawled_word.add(res["title"]) except Exception,e: print e continue if __name__=='__main__': test = BaikeSpider() test.crawl()
另外,使用 Chrome 加载页面比 Firefox 更快,而且较少出现报错和异常退出的情况。
    每天一小步,人生一大步!Good luck~
 
                     
                    
                 
                    
                 
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号