Starting a Scrapy spider in a loop

I've recently been working on an AI celebrity-recommendation project. The first step is collecting the data, so I've spent the last few days on crawlers.

Part of my job is to complete the data my senior classmate already crawled, scraping Baidu Baike to fill in the entries in our database. While setting this up I hit a problem: I need to walk through a table, pull out each celebrity name and work title, use them as the spider's search keyword, start the spider, and collect the results. In other words, the spider has to be started from inside a loop. When I wrote it myself I kept running into ReactorNotRestartable, and searching Baidu turned up nothing useful. In the end my senior and I fiddled with it for quite a while before it worked, so I'm writing this post to share the solution in case it helps anyone with the same problem.
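
The error comes from Twisted's reactor, which Scrapy starts when a crawl begins and which cannot be started again once it has stopped. Roughly this kind of loop triggers it (a simplified sketch, not my exact original code; the keywords are just placeholders):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from baidu.spiders.baidubaike import BaidubaikeSpider

for keyword in ['keyword A', 'keyword B']:
    process = CrawlerProcess(get_project_settings())
    process.crawl(BaidubaikeSpider, {'keyword': keyword, 'tid': 200, 'pid': 2001})
    process.start()   # first iteration runs fine, second one raises ReactorNotRestartable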

The idea behind the fix:

For each task, schedule a crawl job on a CrawlerRunner, collect the returned Deferreds in a set dfs, wrap them in a DeferredList that stops the reactor once every crawl has finished, and then run the reactor. The crucial detail is that this whole routine runs in a brand-new process for every call (see the launch code below), so each run gets a fresh reactor and ReactorNotRestartable never comes up.

The core function crawl_run is as follows:

def crawl_run(proc, spider):
    # a fresh CrawlerRunner per call; crawl_run itself is executed in a new child
    # process each time, so the Twisted reactor here has never been started before
    runner = CrawlerRunner(settings)
    dfs = set()
    for task in proc:
        try:
            print(task)
            d = runner.crawl(spider, task)   # schedule a crawl and collect its Deferred
            dfs.add(d)
        except Exception:
            info = sys.exc_info()
            print('{0}:{1}'.format(info[0], info[1]))

    # stop the reactor once every scheduled crawl has finished, then run it
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run()

The core code for launching the spider:

task = {'keyword': key_search, 'tid': 200, 'pid': 2001}  # key_search changes for every row of the dataset
# launch the spider
if need_find is True:
    proc = []
    proc.append(task)
    if len(proc) >= 1:
        # second argument is the spider class to run (BaidubaikeSpider in the full code below)
        p = Process(target=crawl_run, args=(proc, BaidubaikeSpider,))
        p.start()
        p.join(timeout=180)   # wait up to ~3 minutes for this crawl before moving on
        proc.clear()
    print("setup crawl!")
else:
    print('waiting...')

The final, complete implementation:

import os
import re
import sys
import csv
import mysql.connector
from multiprocessing import Process
from twisted.internet import reactor, defer
from scrapy.conf import settings                # older-style settings import; get_project_settings() below is the current approach
from scrapy.crawler import CrawlerProcess, CrawlerRunner
from scrapy.cmdline import execute
from scrapy.utils.project import get_project_settings
from baidu.spiders.baidus import *
from baidu.spiders.baidubaike import *          # provides BaidubaikeSpider

def crawl_run(proc, spider):
    # a fresh CrawlerRunner per call; crawl_run itself is executed in a new child
    # process each time, so the Twisted reactor here has never been started before
    runner = CrawlerRunner(settings)
    dfs = set()
    for task in proc:
        try:
            print(task)
            d = runner.crawl(spider, task)   # schedule a crawl and collect its Deferred
            dfs.add(d)
        except Exception:
            info = sys.exc_info()
            print('{0}:{1}'.format(info[0], info[1]))

    # stop the reactor once every scheduled crawl has finished, then run it
    defer.DeferredList(dfs).addBoth(lambda _: reactor.stop())
    reactor.run()

def connect_mysql():
    conn=mysql.connector.connect(host="localhost",user='root',password='123456',database="superstar",charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select * from test_star')
    D=cursor.fetchall()
    for d in D:
        print(type(d))
        print(d[1])
        print(d[2])


# connect_mysql()
if __name__ == '__main__':
    runner = CrawlerRunner(settings)   # not actually used below; crawl_run builds its own runner per child process

    # connect to the database
    conn = mysql.connector.connect(host="localhost", user='root', password='123456', database="superstar",
                                   charset='utf8')
    cursor = conn.cursor()
    cursor.execute('select workname,star,role,id from tbl_works where id >=4316 and id <=6315')
    D = cursor.fetchall()
    # iterate over the rows in the selected id range
    for d in D:
        # flags deciding whether this row still needs crawling
        flag_workname = True
        flag_star = True
        flag_role = True

        work_name = ""
        star_name = ""
        if d[0] == "":
            flag_workname = False
        if d[1] == "":
            flag_star = False
        if d[2] == "":
            flag_role = False

        if flag_star:
            separate = re.findall(r'\S+', d[1])
            star_name = separate[0]          # keep only the first star name
        if flag_workname:
            work_name = d[0]
        # crawl only when work name, star and role are all present
        if flag_workname and flag_star and flag_role:
            need_find = True
        else:
            need_find = False
        if flag_role:
            role_separate = re.findall(r'\S+', d[2])
            # skip rows whose role field already carries a '(饰' ("played by") annotation
            if len(role_separate) > 2:
                if role_separate[1] == "(饰":
                    need_find = False

        key_search = work_name + ' ' + star_name + ' ' + str(d[3])  # str() in case the id column comes back as a number
        print(key_search)

        task = {'keyword': key_search, 'tid': 200, 'pid': 2001}
        # launch the spider for this row in its own process
        if need_find is True:
            proc = []
            proc.append(task)
            if len(proc) >= 1:
                p = Process(target=crawl_run, args=(proc, BaidubaikeSpider,))
                p.start()
                p.join(timeout=180)   # wait up to ~3 minutes for this crawl before moving on
                proc.clear()
            print("setup crawl!")
        else:
            print('waiting...')

    print('waiting.1..')
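
The BaidubaikeSpider itself isn't included in this post. As an illustration only, here is a minimal sketch of how a spider can receive the task dict that crawl_run passes through runner.crawl(spider, task) (Scrapy hands positional arguments on to the spider's constructor); the URL, attribute names and parsing below are simplified assumptions, not the project's actual spider code:

import scrapy

class BaidubaikeSpider(scrapy.Spider):
    # hypothetical sketch only; the real spider in baidu/spiders/baidubaike.py is not shown here
    name = 'baidubaike'

    def __init__(self, task=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        task = task or {}
        self.keyword = task.get('keyword', '')   # "work name + star name + id" built above
        self.tid = task.get('tid')
        self.pid = task.get('pid')

    def start_requests(self):
        # illustrative search request; the real spider's URLs and parsing may differ
        url = 'https://baike.baidu.com/search?word=' + self.keyword
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        pass  # parsing and item pipeline logic omitted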

 
