# -*- coding: utf-8 -*-
# 分析动态网页请求爬取腾讯视频评论
import scrapy
import re
import json
import time
from tencent.items import TencentItem
class TenspiderSpider(scrapy.Spider):
    """Crawl the viewer comments of a Tencent Video cover page.

    The crawl is a three-step chain:

    1. ``start_requests`` extracts the cover id from ``start_urls[0]`` and
       asks ``comment_url`` for the matching ``comment_id``.
    2. ``parse_video`` reads ``comment_id`` out of the JSONP-style answer and
       requests the comment listing built from ``base_url``.
    3. ``parse_comment`` turns every entry of the listing into a
       ``TencentItem``.
    """

    name = "tenspider"
    start_urls = ['http://v.qq.com/x/cover/ga7nei8pd5i9mek.html/']
    # Endpoint used to obtain the comment_id of a cover id (cid).
    comment_url = 'https://ncgi.video.qq.com/fcgi-bin/video_comment_id?otype=json&op=3&cid='
    # Template of the comment listing URL; {comment_id} is filled in by
    # parse_video.
    base_url = 'https://coral.qq.com/article/{comment_id}/comment?commentid=0&reqnum=1000'

    # Cover id: the path segment between "cover/" and ".html".
    _COVER_ID = re.compile(r'cover/(.*?)\.html')
    # JSON object assigned to a variable ("name={...};"). The greedy match up
    # to the last "}" keeps a ";" inside a JSON string from truncating it.
    _JSONP_PAYLOAD = re.compile(r'=\s*(\{.*\})', re.S)

    def start_requests(self):
        """Yield the request that resolves the cover id to a comment_id.

        Logs an error and yields nothing when ``start_urls[0]`` does not
        contain a ``cover/<id>.html`` segment.
        """
        match = self._COVER_ID.search(self.start_urls[0])
        if match is None:
            self.logger.error('No cover id found in start url %r', self.start_urls[0])
            return
        video_comment_url = self.comment_url + match.group(1)
        yield scrapy.Request(url=video_comment_url, callback=self.parse_video)

    def parse_video(self, response):
        """Extract ``comment_id`` from the response and request the comments.

        The body is expected to look like ``<name>=<json object>;``. Logs an
        error and yields nothing when no JSON object or no ``comment_id`` is
        found, instead of requesting a URL built from ``None``.
        """
        match = self._JSONP_PAYLOAD.search(response.text)
        if match is None:
            self.logger.error('No JSON payload found in %s', response.url)
            return
        data = json.loads(match.group(1))
        comment_id = data.get('comment_id')
        if not comment_id:
            self.logger.error('No comment_id in response from %s', response.url)
            return
        f_comment_url = self.base_url.format(comment_id=comment_id)
        yield scrapy.Request(url=f_comment_url, callback=self.parse_comment)

    def parse_comment(self, response):
        """Yield one ``TencentItem`` per comment in the JSON response.

        Reads the list at ``data -> commentid``. Entries lacking one of the
        expected fields are skipped with a warning so that a single malformed
        comment does not abort the rest of the page.
        """
        data = json.loads(response.text)
        comments = (data.get('data') or {}).get('commentid') or []
        for each in comments:
            try:
                comment = each['content']
                timestamp = each['time']
                userinfo = each['userinfo']
                user = userinfo['nick']
                region = userinfo['region']
                userid = userinfo['userid']
            except (KeyError, TypeError) as err:
                self.logger.warning('Skipping malformed comment (%r): %r', err, each)
                continue

            # Epoch seconds -> local time -> "2016-05-05 20:28:54" style text.
            date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))

            # A fresh item per comment: reusing one instance would let later
            # iterations overwrite items that were already yielded.
            item = TencentItem()
            item['comment'] = comment
            item['user'] = user
            item['date'] = date
            item['region'] = region
            item['userid'] = userid
            yield item