python 多线程抓取动态数据

利用多线程动态抓取数据,网上也有不少教程,但大多过于繁杂,其实可以精简再精简。

不多解释,直接上代码,基本上还是很好懂的。

#!/usr/bin/env python
# coding=utf-8

import urllib2
import re,sys
from bs4 import BeautifulSoup
from selenium import webdriver
import threading
import time
# Python 2-only hack: site.py deletes sys.setdefaultencoding at startup, so
# reload(sys) is needed to restore it.  Setting the default to UTF-8 makes
# implicit str<->unicode conversions (file writes, concatenation) use UTF-8
# instead of ASCII and avoids UnicodeDecodeError on Chinese page titles.
reload(sys)
sys.setdefaultencoding("utf-8")

# Seed URLs for the crawl.  Worker threads pop from the front of this list
# and append newly discovered Baike entry links to the back.
queue = [
    "http://baike.baidu.com/view/8332.htm",
    "http://baike.baidu.com/view/145819.htm",
    "http://baike.baidu.com/view/643415.htm",
    "http://baike.baidu.com/view/157424.htm",
    "http://baike.baidu.com/view/149759.htm",
]

# De-duplication sets shared by all worker threads:
# URLs already fetched, and entry titles already recorded.
crawled_url = set()
crawled_word = set()

# Running count of pages crawled so far (shared across threads).
cnt = 0

class BaikeSpider(threading.Thread):
    """
    模拟浏览器打开页面,多线程爬取数据
    """

    def __init__(self,name):
        threading.Thread.__init__(self)
        self.name = str(name)

        self.browser = webdriver.Chrome()
# 将抓取数据写入各自的文件 self.fw = open("baike_words_"+self.name+".txt","wb") def run(self): global queue global crawled_url global crawled_word global cnt while queue: url = queue.pop(0) try: self.browser.get(url) # 休眠0.5s,等待数据加载 time.sleep(0.5) links = BeautifulSoup(urllib2.urlopen(url).read(),"lxml").find_all("a") vote = self.browser.find_element_by_class_name("vote-count").text view = self.browser.find_element_by_id("j-lemmaStatistics-pv").text word = self.browser.title.split(u"_")[0] if word in crawled_word or url in crawled_url: continue else: for link in links: if 'href' not in dict(link.attrs) or re.search(u"javascript",link['href']) or len(link['href']) <8: continue tmpurl = link["href"] if re.search("baike.baidu.com/view/\d+|baike.baidu.com/subview/\d+(/\d+)?",tmpurl) and tmpurl n ot in crawled_url: queue.append(tmpurl) crawled_url.add(url) linedata = word+"\t"+view+"\t"+vote+"\t"+url+"\n" self.fw.write(linedata) except Exception,e: print 'error',e continue cnt += 1 print cnt,self.name,'len',len(queue) def __exit__(self): self.fw.close() if __name__=='__main__': """ 开5个线程 """ for i in range(5): t = BaikeSpider(i) t.start()

 

posted on 2015-12-24 17:13  星空守望者--jkmiao  阅读(764)  评论(0编辑  收藏  举报