scrapy startproject insist #创建项目
scrapy genspider teng careers.tencent.com  #创建爬虫(爬虫名字+域名)
items.py
#需要爬取的信息
import scrapy
class InsistItem(scrapy.Item):
    """Container for one Tencent job posting scraped by the `teng` spider."""

    # Field names double as the dict keys assigned in teng.py's parse();
    # keep them unchanged (including `type` and `mian`) so the spider works.
    positionname = scrapy.Field()  # job title (RecruitPostName)
    type = scrapy.Field()          # business group (BGName)
    place = scrapy.Field()         # work location (LocationName)
    mian = scrapy.Field()          # job category (CategoryName)
    time = scrapy.Field()          # last update time (LastUpdateTime)
pipelines.py
#保存数据到数据库或者json文件
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json
class InsistPipeline(object):
    """Item pipeline that appends each scraped item to teng.json.

    Each item is written as one JSON object followed by ",\n", producing a
    comma-separated stream of objects (not a single valid JSON document).
    """

    def __init__(self):
        # utf-8 so non-ASCII (Chinese) text is written verbatim; this pairs
        # with ensure_ascii=False in process_item.
        self.f = open('teng.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Required pipeline hook: serialize the item, then pass it on.

        :param item: the scraped item (dict-like).
        :param spider: the spider that produced the item (unused).
        :returns: the item unchanged, so later pipelines still receive it.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        # Fix: the file handle was never closed before, leaking the resource
        # and risking unflushed data. Scrapy calls this when the spider ends.
        self.f.close()
teng.py
import scrapy
import json
from insist.items import InsistItem
class TengSpider(scrapy.Spider):
    """Crawl Tencent's careers JSON API, yielding one item per job posting."""

    name = 'teng'
    allowed_domains = ['careers.tencent.com']
    # The API is paginated via the pageIndex query parameter, 10 posts a page.
    baseURL = 'https://careers.tencent.com/tencentcareer/api/post/Query?pageSize=10&pageIndex='
    offset = 1
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Parse one API page: yield an item per post, then follow the next page.

        :param response: JSON API response; body has Data.Posts as a list.
        :yields: InsistItem instances, then a Request for the next page.
        """
        contents = json.loads(response.text)
        jobs = contents['Data']['Posts']
        for job in jobs:
            # Fix: create a fresh item for every posting. The original built
            # a single InsistItem before the loop and mutated/yielded that
            # same instance repeatedly, so downstream consumers holding
            # references would all see the last posting's data.
            item = InsistItem()
            item['positionname'] = job['RecruitPostName']
            item['type'] = job['BGName']
            item['place'] = job['LocationName']
            item['mian'] = job['CategoryName']
            item['time'] = job['LastUpdateTime']
            yield item
        # Follow pagination: offset starts at 1 and this requests up to
        # pageIndex=11 (condition checked before increment), as in the original.
        if self.offset <= 10:
            self.offset += 1
            yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse)