腾讯招聘网站爬虫

  1. scrapy startproject QqSpider
  2. cd QqSpider
  3. cd QqSpider
  4. ls
  5. vi items.py
  6.   1 # -*- coding: utf-8 -*-
      2 
      3 # Define here the models for your scraped items
      4 #
      5 # See documentation in:
      6 # http://doc.scrapy.org/en/latest/topics/items.html
      7 
      8 import scrapy
      9 
     10 
     11 class QqspiderItem(scrapy.Item):
     12     # define the fields for your item here like:
     13     positionName = scrapy.Field()
     14     positionLink = scrapy.Field()
     15     positionType = scrapy.Field()
     16     peopleNum = scrapy.Field()
     17     workLocation = scrapy.Field()
     18     publishTime = scrapy.Field()
    ~                                          

    7.vi  pipelines.py

  7.   1 # -*- coding: utf-8 -*-
      2 
      3 # Define your item pipelines here
      4 #
      5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
      6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
      7 
      8 import json
      9 class QqspiderPipeline(object):
     10 
     11 
     12     def __init__(self):
     13         self.filename = open("tencent.json", "w")
     14 
     15     def process_item(self, item, spider):
     16         text = json.dumps(dict(item), ensure_ascii = False)
     17         self.filename.write(text.encode("utf-8"))
     18         return item
     19 
     20     def close_spider(self, spider):
     21         self.filename.close()
     22 

    8.vi settings.py

  8.  42 DEFAULT_REQUEST_HEADERS = {
     43     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
     44     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     45 #   'Accept-Language': 'en',
     46 }

    68 ITEM_PIPELINES = {
     69     'QqSpider.pipelines.QqspiderPipeline': 300,
     70 }



    9.cd spiders

  9. scrapy genspider -t crawl qqcent tencent.com
  10. vi qqcent.py
  11.  1 # -*- coding: utf-8 -*-
       2 import scrapy
       3 from scrapy.linkextractors import LinkExtractor
       4 from scrapy.spiders import CrawlSpider, Rule
       5 from QqSpider.items import QqspiderItem
       6 
      7 class QqcentSpider(CrawlSpider):
      8     name = 'qqcent'
      9     allowed_domains = ['tencent.com']
     10     start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
     11     pagelink = LinkExtractor(allow=("start=\d+"))
     12     rules = [
     13         Rule(pagelink, callback='parseQqcent', follow=True)
     14     ]
     15 
     16     def parseQqcent(self, response):
     17         #i = {}
     18         #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     19         #i['name'] = response.xpath('//div[@id="name"]').extract()
     20         #i['description'] = response.xpath('//div[@id="description"]').extract()
     21 
     22         for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
     23             item = QqspiderItem()
     24 
     25             item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
     26             item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
     27 
     28             item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
     29 
     30             item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
     31 
     32             item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
     33 
     34             item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
     35 
     36             yield item

    12 scrapy crawl qqcent

  12. vi  tencent.json
posted @ 2017-11-14 20:40  zy--  阅读(277)  评论(0)    收藏  举报