python 多线程抓取动态数据

利用多线程动态抓取数据,网上也有不少教程,但大多过于繁杂,其实可以精简再精简。

不多解释,直接上代码,基本上还是很好懂的。

#!/usr/bin/env python
# coding=utf-8

import urllib2
import re,sys
from bs4 import BeautifulSoup
from selenium import webdriver
import threading
import time
# Python 2-only hack: site.py deletes sys.setdefaultencoding at startup, so
# reload(sys) is needed to restore it.  Setting the default to UTF-8 makes
# implicit str<->unicode conversions (file writes, concatenation) use UTF-8
# instead of ASCII and avoids UnicodeDecodeError on Chinese page titles.
reload(sys)
sys.setdefaultencoding("utf-8")

# Seed URLs for the crawl.  Worker threads pop from the front of this list
# and append newly discovered Baike entry links to the back.
queue = [
    "http://baike.baidu.com/view/8332.htm",
    "http://baike.baidu.com/view/145819.htm",
    "http://baike.baidu.com/view/643415.htm",
    "http://baike.baidu.com/view/157424.htm",
    "http://baike.baidu.com/view/149759.htm",
]

# De-duplication sets shared by all worker threads:
# URLs already fetched, and entry titles already recorded.
crawled_url = set()
crawled_word = set()

# Running count of pages crawled so far (shared across threads).
cnt = 0

class BaikeSpider(threading.Thread):
    """
    模拟浏览器打开页面,多线程爬取数据
    """

    def __init__(self,name):
        threading.Thread.__init__(self)
        self.name = str(name)

        self.browser = webdriver.Chrome()
# 将抓取数据写入各自的文件 self.fw = open("baike_words_"+self.name+".txt","wb") def run(self): global queue global crawled_url global crawled_word global cnt while queue: url = queue.pop(0) try: self.browser.get(url) # 休眠0.5s,等待数据加载 time.sleep(0.5) links = BeautifulSoup(urllib2.urlopen(url).read(),"lxml").find_all("a") vote = self.browser.find_element_by_class_name("vote-count").text view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text word = self.browser.title.split(u"_")[0] if word in crawled_word or url in crawled_url: continue else: for link in links: if 'href' not in dict(link.attrs) or re.search(u"javascript",link['href']) or len(link['href']) <8: continue tmpurl = link["href"] if re.search("baike.baidu.com/view/\d+|baike.baidu.com/subview/\d+(/\d+)?",tmpurl) and tmpurl n ot in crawled_url: queue.append(tmpurl) crawled_url.add(url) linedata = word+"\t"+view+"\t"+vote+"\t"+url+"\n" self.fw.write(linedata) except Exception,e: print 'error',e continue cnt += 1 print cnt,self.name,'len',len(queue) def __exit__(self): self.fw.close() if __name__=='__main__': """ 开5个线程 """ for i in range(5): t = BaikeSpider(i) t.start()

 

posted on 2015-12-24 17:13  星空守望者--jkmiao  阅读(764)  评论(0编辑  收藏  举报