# Scrape JD.com (京东) product review data.
# Request headers / a proxy cookie must be configured before running:
import urllib.request
import urllib.error
import json
import random
import re
import os
import time

import pandas as pd

# Pool of browser User-Agent strings; one is picked at random per request so
# successive requests look less like a single bot.
agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


def product_reviews(product_id=None, p=0, maxPage=99):
    """Scrape JD.com reviews for one product and save them under ``pinglun/``.

    Each review row is appended to ``pinglun/<product_id>.csv`` (GBK, header
    written once) and to ``pinglun/<product_id>.txt`` (one ``str(dict)`` per
    line, UTF-8).  The .txt file doubles as a "already scraped" marker.

    Parameters
    ----------
    product_id : int
        JD SKU id whose reviews are fetched.
    p : int
        0-based page number to start from.
    maxPage : int
        Initial upper bound on pages; replaced by the ``maxPage`` value
        reported by the first successful response.

    Returns
    -------
    list
        Always ``[]`` — the function's real output is the files it writes.
    """
    root_dir = 'pinglun'
    # Reviews are shared across colour/memory variants of one phone model,
    # so skip any product whose marker .txt already exists.
    os.makedirs(root_dir, exist_ok=True)
    if str(product_id) + '.txt' in os.listdir(root_dir):
        print(product_id)
        return []

    k_head = 0        # stays 0 until the CSV header row has been written
    json_retries = 0  # guard: don't retry an unparseable page forever
    while p < maxPage:
        # Comments for the current SKU only.  (The commented-out endpoint
        # 'productPageComments.action' would return comments shared across
        # all variants of the product instead.)
        url = ('https://club.jd.com/comment/skuProductPageComments.action?'
               'callback=fetchJSON_comment98&productId={}&score=0&sortType=5'
               '&page={}&pageSize={}&isShadowSku=0&fold=1'
               ).format(product_id, p, maxPage)

        # Forged request headers; paste your own browser cookie here.
        cookie = ''
        headers = {
            'User-Agent': random.choice(agents),
            'Referer': 'https://item.jd.com/100018902008.html',
            'Cookie': cookie,
        }
        request = urllib.request.Request(url=url, headers=headers)
        time.sleep(2.5)  # throttle politely to avoid being blocked
        try:
            content = urllib.request.urlopen(request).read().decode('gbk')
        except (urllib.error.URLError, UnicodeDecodeError):
            print('第%d页评论代码出错' % p)
            p = p + 1
            continue

        # Strip the JSONP wrapper ``fetchJSON_comment98vv…({...});`` by
        # slicing between the outermost parentheses.  (The original used a
        # char-set strip() which only worked by accident.)
        content = content[content.find('(') + 1:content.rfind(')')]

        # Adopt the server-reported page count when present.
        m = re.search(r'"maxPage":(\d+)', content)
        if m:
            maxPage = int(m.group(1))

        try:
            obj = json.loads(content)
        except json.JSONDecodeError:
            # Transient bad payload (throttling / captcha page): retry this
            # page a few times, then skip it instead of looping forever.
            print('信号不好,再次尝试!')
            print([content])
            print(url)
            json_retries += 1
            if json_retries >= 5:
                json_retries = 0
                p = p + 1
            continue
        json_retries = 0

        comments = obj['comments']
        summary = obj['productCommentSummary']
        # Product-level aggregate stats, repeated on every output row.
        dict_pars_info = {
            '平均分': str(summary['averageScore']),
            '好评率': str(summary['goodRate']),
            '当前评论数': str(summary['commentCount']),
            '默认评论数': str(summary['defaultGoodCount']),
            '追评数': str(summary['afterCount']),
            '好评数': str(summary['goodCount']),
            '中评数': str(summary['generalCount']),
            '差评数': str(summary['poorCount']),
        }

        if not comments:
            # No reviews on this page: treat as end of data.
            return []

        for comment in comments:
            # plusAvailable == "201" marks a JD-Plus member.
            # NOTE(review): the API may return this field as an int, in
            # which case the string comparison is always False — mirrors
            # the original code; confirm against a live response.
            is_plus = 1 if comment['plusAvailable'] == "201" else 0
            # Non-empty mobileVersion -> "0" (flag inversion mirrors the
            # original code — confirm intended polarity).
            is_mobile = "0" if comment['mobileVersion'] != "" else "1"

            item = {
                # 'name': comment['referenceName'],
                'id': comment['id'],
                'score': comment['score'],
                'guid': comment['guid'],
                'con': comment['content'],
                'time': comment['creationTime'],
                'isTop': comment['isTop'],
                'userClient': comment['userClient'],
                # colour/size are absent for some SKUs — default to ''
                'productColor': comment.get('productColor', ''),
                'productSize': comment.get('productSize', ''),
                'likes': comment['usefulVoteCount'],
                'replyCount': comment['replyCount'],
                'nickname': comment['nickname'],
                'userClientShow': "来自京东iPhone客户端",
                'isMobile': is_mobile,
                'isHuiYuan': is_plus,
                'days': comment['days'],
            }
            item.update(dict_pars_info)

            # 1. Append the row to the CSV; header only for the first row.
            frame = pd.DataFrame([item])
            if k_head == 0:
                frame.to_csv(root_dir + '/%d.csv' % product_id, mode='w',
                             header=True, index=False, encoding='GBK')
                k_head += 1
            else:
                frame.to_csv(root_dir + '/%d.csv' % product_id, mode='a',
                             header=False, index=False, encoding='GBK')

            # 2. Append the raw dict repr to the .txt marker/log file.
            with open(root_dir + '/%d.txt' % product_id, 'a',
                      encoding='UTF-8') as fp:
                fp.write(str(item) + '\n')

        print('%s-page---finish(%s/%s)' % (p, p, maxPage))
        p = p + 1


if __name__ == '__main__':
    phone_id = 100018902008
    product_reviews(product_id=phone_id)