CrawlSpider URL extraction

Create a CrawlSpider spider

scrapy genspider -t crawl baidu baidu.com
Starting multiple spiders from a Python script
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess


def main():
    setting = get_project_settings()
    process = CrawlerProcess(setting)
    didntWorkSpider = []
    WorkSpider = ['sample']
    # func1: start the spiders listed in WorkSpider
    for spider_name in WorkSpider:
        print("Running spider %s" % (spider_name))
        process.crawl(spider_name)
    process.start()
    # # func2: start every spider except those in didntWorkSpider
    # for spider_name in process.spiders.list():
    #     if spider_name in didntWorkSpider:
    #         continue
    #     print("Running spider %s" % (spider_name))
    #     process.crawl(spider_name)
    # process.start()


if __name__ == '__main__':
    main()
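Note: in newer Scrapy versions the spider name list is exposed through process.spider_loader rather than the process.spiders attribute used in func2 above. A minimal sketch of the same loop under that assumption:

# Variant of func2 for newer Scrapy: list every spider in the project via
# spider_loader and start all of them except the ones in didntWorkSpider.
for spider_name in process.spider_loader.list():
    if spider_name in didntWorkSpider:
        continue
    print("Running spider %s" % spider_name)
    process.crawl(spider_name)
process.start()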




rules = (
    Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
)
allow            regex match against the URL
restrict_css     CSS selector match; URLs inside the matched elements are extracted automatically
restrict_xpaths  XPath match; URLs inside the matched elements are extracted automatically
# restrict_xpaths=("//div[@class='a']/li") extracts the URLs of every <a> tag under the matched <li> elements and requests them
Rule(LinkExtractor(restrict_xpaths=("//div[@class='a']/li")), callback='parse_item', follow=True),
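For comparison, restrict_css does the same automatic extraction from elements matched by a CSS selector. A minimal sketch, with a hypothetical selector rather than one taken from a real page:

# restrict_css: URLs are extracted from all <a> tags inside the elements
# matched by the CSS selector (selector below is only an illustrative example)
Rule(LinkExtractor(restrict_css=("div.a li",)), callback='parse_item', follow=True),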


The generated spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['circ.gov.cn']
    start_urls = ['http://circ.gov.cn/']
    # Extraction rules; follow=True keeps following links from matched pages (needed to reach next-page URLs)
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        #item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
        #item['name'] = response.xpath('//div[@id="name"]').get()
        #item['description'] = response.xpath('//div[@id="description"]').get()
        return item
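
Two notes on the generated template: the commented lines in parse_item are only placeholders, and a CrawlSpider should not override the built-in parse method, since the rules machinery uses it internally; custom callbacks such as parse_item need a different name. Assuming the project is set up as usual, the spider is started with:

scrapy crawl cf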

 Crawling Tencent recruitment job postings

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class HrSpider(CrawlSpider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['https://hr.tencent.com/index.php']
    rules = (
        # https://hr.tencent.com/social.php
        Rule(LinkExtractor(allow=r'https://hr.tencent.com/position.php'),
             callback='parse_item', follow=True),
        # next page
        Rule(LinkExtractor(allow=r'https://hr.tencent.com/position\.php\?keywords=&tid=0&start=\d{1}0#a'),
             follow=True),
    )

    def parse_item(self, response):
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}
            item['title'] = tr.xpath("./td[1]/a/text()").extract_first()
            item['position'] = tr.xpath("./td[2]/text()").extract_first()
            item['pub_date'] = tr.xpath("./td[5]/text()").extract_first()
            yield item
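
Assuming the usual project layout, the spider can be run and the yielded dicts written straight to a file with Scrapy's built-in feed export, e.g.:

scrapy crawl hr -o positions.json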

