腾讯招聘网站爬虫

  1. scrapy startproject QqSpider
  2. cd QqSpider
  3. cd QqSpider
  4. ls
  5. vi items.py
  6.   1 # -*- coding: utf-8 -*-
      2 
      3 # Define here the models for your scraped items
      4 #
      5 # See documentation in:
      6 # http://doc.scrapy.org/en/latest/topics/items.html
      7 
      8 import scrapy
      9 
     10 
     11 class QqspiderItem(scrapy.Item):
     12     # define the fields for your item here like:
     13     positionName = scrapy.Field()
     14     positionLink = scrapy.Field()
     15     positionType = scrapy.Field()
     16     peopleNum = scrapy.Field()
     17     workLocation = scrapy.Field()
     18     publishTime = scrapy.Field()
    ~                                          

    7.vi  pipelines.py

  7.   1 # -*- coding: utf-8 -*-
      2 
      3 # Define your item pipelines here
      4 #
      5 # Don't forget to add your pipeline to the ITEM_PIPELINES setting
      6 # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
      7 
      8 import json
      9 class QqspiderPipeline(object):
     10 
     11 
     12     def __init__(self):
     13         self.filename = open("tencent.json", "w")
     14 
     15     def process_item(self, item, spider):
     16         text = json.dumps(dict(item), ensure_ascii = False)
     17         self.filename.write(text.encode("utf-8"))
     18         return item
     19 
     20     def close_spider(self, spider):
     21         self.filename.close()
     22 

    8.vi settings.py

  8.  42 DEFAULT_REQUEST_HEADERS = {
     43     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
     44     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     45 #   'Accept-Language': 'en',
     46 }

    68 ITEM_PIPELINES = {
     69     'QqSpider.pipelines.QqspiderPipeline': 300,
     70 }



    9.cd spiders

  9. scrapy genspider -t crawl qqcent tencent.com
  10. vi qqcent.py
  11.  1 # -*- coding: utf-8 -*-
       2 import scrapy
       3 from scrapy.linkextractors import LinkExtractor
       4 from scrapy.spiders import CrawlSpider, Rule
       5 from QqSpider.items import QqspiderItem
       6 
      7 class QqcentSpider(CrawlSpider):
      8     name = 'qqcent'
      9     allowed_domains = ['tencent.com']
     10     start_urls = ['http://hr.tencent.com/position.php?&start=0#a']
     11     pagelink = LinkExtractor(allow=("start=\d+"))
     12     rules = [
     13         Rule(pagelink, callback='parseQqcent', follow=True)
     14     ]
     15 
     16     def parseQqcent(self, response):
     17         #i = {}
     18         #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
     19         #i['name'] = response.xpath('//div[@id="name"]').extract()
     20         #i['description'] = response.xpath('//div[@id="description"]').extract()
     21 
     22         for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
     23             item = QqspiderItem()
     24 
     25             item['positionName'] = each.xpath("./td[1]/a/text()").extract()[0]
     26             item['positionLink'] = each.xpath("./td[1]/a/@href").extract()[0]
     27 
     28             item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
     29 
     30             item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
     31 
     32             item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
     33 
     34             item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]
     35 
     36             yield item

    12 scrapy crawl qqcent

  12. vi  tencent.json
posted @ 2017-11-14 20:40  zy--  阅读(277)  评论(0)    收藏  举报