Python 抓取百度百科点赞数等动态数据

利用 Selenium 模拟浏览器打开页面,待页面加载完成后再抓取数据。

#!/usr/bin/env python
# coding=utf-8

import urllib2
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import time 

# Python 2-only hack: make UTF-8 the interpreter-wide default encoding used
# for implicit str <-> unicode conversions. reload(sys) is required because
# the interpreter removes sys.setdefaultencoding during start-up.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class BaikeSpider(object):
    """Breadth-first crawler for Baidu Baike entry pages.

    Every page is opened in a Selenium-driven Chrome so that values filled in
    by JavaScript (vote count, page-view count) can be read, and its anchors
    are scanned for further entry pages to visit.

    Attributes:
        queue: list of URLs still to be visited (FIFO).
        base: site root used to turn relative links into absolute URLs.
        crawled: set of URLs that have already been taken from the queue.
        crawled_word: set of entry titles already written to the output file.
    """

    # Absolute links to an entry page ("view") or a sub-entry ("subview").
    _ABS_LINK = re.compile(
        r"baike\.baidu\.com/view/\d+|baike\.baidu\.com/subview/\d+/\d+\.htm")
    # Site-relative entry links, with or without the leading slash.
    _REL_LINK = re.compile(r"/?view/\d+")

    def __init__(self):
        self.queue = ["http://baike.baidu.com/view/8095.htm",
                      "http://baike.baidu.com/view/2227.htm"]
        self.base = "http://baike.baidu.com"
        self.crawled = set()
        self.crawled_word = set()

    def _resolve_link(self, href):
        """Return the absolute URL for an entry-page link, or None.

        Args:
            href: value of an anchor's ``href`` attribute (may be None).

        Returns:
            The URL to enqueue, or None when the link is missing, too short,
            a ``javascript:`` pseudo-link, or not a Baike entry page.
        """
        if not href or len(href) < 8 or "javascript" in href:
            return None
        if self._ABS_LINK.search(href):
            return href
        if self._REL_LINK.match(href):
            # Join with exactly one slash between the site root and the path.
            return self.base + "/" + href.lstrip("/")
        return None

    def _enqueue_links(self, html):
        """Append every not-yet-crawled entry link found in ``html`` to the queue."""
        soup = BeautifulSoup(html, 'lxml')
        # De-duplicate on the href string so each target is queued once per page.
        hrefs = set(anchor.get('href') for anchor in soup.find_all("a"))
        for href in hrefs:
            target = self._resolve_link(href)
            if target is not None and target not in self.crawled:
                self.queue.append(target)

    def crawl(self):
        """Crawl until the queue is empty, writing one TSV line per new title.

        Each line of ``./baike_keywords.txt`` holds
        ``title<TAB>viewed<TAB>voted<TAB>url`` encoded as UTF-8. A failure on
        a single page is printed and the crawl moves on to the next URL.
        The output file and the browser are always released on exit.
        """
        browser = webdriver.Chrome()
        fw = open('./baike_keywords.txt', 'wb')
        cnt = 0
        try:
            while self.queue:
                url = self.queue.pop(0)
                if url in self.crawled:
                    continue
                self.crawled.add(url)
                try:
                    browser.get(url)
                    self._enqueue_links(urllib2.urlopen(url).read())

                    cnt += 1
                    print(cnt)
                    if cnt % 10 == 0:
                        print('queue %d' % len(self.queue))
                        # Persist progress periodically without reopening the file.
                        fw.flush()

                    title = browser.title.split(u"_")[0]
                    if title in self.crawled_word:
                        print('title %s has crawled' % title)
                        continue

                    voted = browser.find_element_by_class_name("vote-count").text
                    viewed = browser.find_element_by_id("j-lemmaStatistics-pv").text

                    # `url` is still the page that was loaded, not a link found on it.
                    line = u'\t'.join([title, viewed, voted, url]) + u'\n'
                    fw.write(line.encode('utf-8'))
                    self.crawled_word.add(title)
                except Exception as e:
                    # Best effort: report the failure and continue with the next URL.
                    print(e)
                    continue
        finally:
            fw.close()
            browser.quit()


# Script entry point: build a spider and run the crawl until the queue drains.
if __name__ == '__main__':
    BaikeSpider().crawl()

 

另外,使用 Chrome 加载会比 Firefox 快,而且报错和异常退出的情况也更少!

posted on 2015-12-24 17:08  星空守望者--jkmiao  阅读(543)  评论(0)    收藏  举报