#items.py
import scrapy
class InsistItem(scrapy.Item):
    """Container for one scraped record with a single ``comment`` field."""

    # Holds the comment value assigned by the spider.
    comment = scrapy.Field()
#pipelines.py
import json
class InsistPipeline:
    """Item pipeline that writes every scraped item to ``tencent.json``.

    Each item is serialized as one JSON object followed by a comma and a
    newline, so the file is a comma-terminated sequence of objects rather
    than a single valid JSON document.
    """

    def __init__(self):
        # NOTE(review): with ensure_ascii=False below, non-ASCII text is
        # written verbatim; any character GBK cannot represent raises
        # UnicodeEncodeError on write. Confirm GBK is really required.
        self.f = open('tencent.json', 'w', encoding='gbk')

    def process_item(self, item, spider):
        """Serialize *item* to the output file and hand it back.

        Args:
            item: The scraped item (an ``Item`` object, or anything that
                ``dict()`` accepts).
            spider: The spider that produced the item (unused here).

        Returns:
            The same *item*, unchanged.
        """
        content = json.dumps(dict(item), ensure_ascii=False) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        """Close the output file so buffered data is flushed to disk.

        Args:
            spider: The spider being closed (unused here).
        """
        self.f.close()
#tengxun.py
import scrapy
from insist.items import InsistItem
import json
class TengxunSpider(scrapy.Spider):
    """Spider that collects comment texts for product 4432058.

    It pages through the ``productPageComments.action`` JSON endpoint on
    ``sclub.jd.com`` (requested with ``pageSize=10``), starting at page 0
    and following one page at a time up to ``max_offset``. The matching
    product page is https://item.jd.com/4432058.html.
    """

    name = 'tengxun'
    allowed_domains = ['sclub.jd.com']
    baseURL = 'https://sclub.jd.com/comment/productPageComments.action?productId=4432058&score=0&sortType=5&pageSize=10&isShadowSku=0&rid=0&fold=1&page='
    # Page index of the most recently scheduled request.
    offset = 0
    # Highest page index to request. Override in a subclass or on the
    # command line with ``-a max_offset=N``.
    max_offset = 100
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        """Yield one item per comment on the page, then request the next page.

        Args:
            response: The response for one page of the comments endpoint.

        Yields:
            ``InsistItem`` objects whose ``comment`` field holds the
            ``content`` value of each entry in the payload's ``comments``
            list, followed by a ``scrapy.Request`` for the next page
            while ``offset`` is below ``max_offset``.
        """
        # The body is decoded explicitly as GBK rather than through
        # Scrapy's detected encoding; presumably the endpoint serves GBK.
        payload = json.loads(response.body.decode('gbk'))
        for entry in payload['comments']:
            item = InsistItem()
            item['comment'] = entry['content']
            yield item

        # int() because spider arguments passed with ``-a`` arrive as strings.
        if self.offset < int(self.max_offset):
            self.offset += 1
            yield scrapy.Request(
                self.baseURL + str(self.offset),
                callback=self.parse,
            )