爬取京东评论信息

需要设置请求头(User-Agent;如有需要,再填入自己浏览器的 Cookie):

import urllib.request
import json
import random
import time as time0
import re, os
import pandas as pd
# Pool of browser User-Agent strings; product_reviews picks one at random
# for the 'User-Agent' header of every request it sends.
agents = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Matches a JSONP wrapper such as ``fetchJSON_comment98({...});`` and captures
# the payload between the outermost parentheses.
_JSONP_WRAPPER = re.compile(r'^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', re.S)

# (output column label, key inside ``productCommentSummary``), in output order.
_SUMMARY_FIELDS = (
    ('平均分', 'averageScore'),
    ('好评率', 'goodRate'),
    ('当前评论数', 'commentCount'),
    ('默认评论数', 'defaultGoodCount'),
    ('追评数', 'afterCount'),
    ('好评数', 'goodCount'),
    ('中评数', 'generalCount'),
    ('差评数', 'poorCount'),
)


def _strip_jsonp(text):
    """Return the JSON payload of a JSONP response, or the stripped text if unwrapped."""
    match = _JSONP_WRAPPER.match(text)
    return match.group(1) if match else text.strip()


def _summary_info(summary):
    """Map the product-level summary statistics to their (stringified) output columns."""
    return {label: str(summary[key]) for label, key in _SUMMARY_FIELDS}


def _comment_to_row(comment, summary_info):
    """Build one flat output record from a single comment dict plus the summary columns."""
    # Compare as text so the flag is set whether the API delivers 201 as a
    # number or as a string (a plain ``!= "201"`` is always true for an int).
    is_plus = 1 if str(comment['plusAvailable']) == "201" else 0
    # NOTE(review): a non-empty 'mobileVersion' is mapped to "0" and an empty
    # one to "1", exactly as before. This looks inverted for a field called
    # isMobile -- confirm the intended meaning before changing it.
    is_mobile = "0" if comment['mobileVersion'] != "" else "1"
    row = {
        'id': comment['id'],
        'score': comment['score'],
        'guid': comment['guid'],
        'con': comment['content'],
        'time': comment['creationTime'],
        'isTop': comment['isTop'],
        'userClient': comment['userClient'],
        # Colour and size are not present on every comment.
        'productColor': comment.get('productColor', ''),
        'productSize': comment.get('productSize', ''),
        'likes': comment['usefulVoteCount'],
        'replyCount': comment['replyCount'],
        'nickname': comment['nickname'],
        'userClientShow': "来自京东iPhone客户端",
        'isMobile': is_mobile,
        'isHuiYuan': is_plus,
        'days': comment['days'],
    }
    row.update(summary_info)
    return row


def _save_rows(rows, csv_path, txt_path, write_header):
    """Write one page of records to the CSV file and append them to the txt file."""
    # dtype=object keeps every value exactly as scraped (no int -> float
    # coercion when a column happens to contain a missing value).
    frame = pd.DataFrame(rows, dtype=object)
    # The CSV stays GBK-encoded, but characters GBK cannot represent (emoji
    # are common in reviews) are replaced instead of aborting the whole run.
    with open(csv_path, 'w' if write_header else 'a', encoding='GBK',
              errors='replace', newline='') as csv_file:
        frame.to_csv(csv_file, header=write_header, index=False)
    with open(txt_path, 'a', encoding='UTF-8') as txt_file:
        txt_file.writelines(str(row) + '\n' for row in rows)


def product_reviews(product_id=None, p=0, maxPage=99, page_size=10,
                    max_retries=3, timeout=10):
    """Scrape the JD.com reviews of one product into ``pinglun/<id>.csv`` and ``.txt``.

    Pages are requested one by one, starting at page ``p``, with a 2.5 s pause
    before each request. Every comment becomes one CSV row (GBK) and one
    ``str(dict)`` line in the txt file (UTF-8); each record also carries the
    product-level summary statistics returned with the page.

    If ``pinglun/<product_id>.txt`` already exists the product is considered
    done: its id is printed and nothing is fetched.

    Args:
        product_id: JD product id, used in the request URL, the Referer header
            and the output file names. Integers and strings are both accepted.
        p: Index of the first page to request.
        maxPage: Initial upper bound (exclusive) for the page index. It is
            replaced by the ``maxPage`` value the server reports as soon as a
            response has been parsed.
        page_size: Value sent as ``pageSize``. Previously ``maxPage`` was sent
            here, so the page size changed once the server bound was known.
        max_retries: Number of consecutive unparseable responses tolerated for
            one page before it is skipped (previously this retried forever).
        timeout: Socket timeout in seconds for each HTTP request.

    Returns:
        Always an empty list; the scraped data is only written to disk.
    """
    root_dir = 'pinglun'
    os.makedirs(root_dir, exist_ok=True)

    # Variants of one phone model (colour, memory) share their reviews, so an
    # existing txt file means this product has already been scraped.
    if str(product_id) + '.txt' in os.listdir(root_dir):
        print(product_id)
        return []

    csv_path = os.path.join(root_dir, '{}.csv'.format(product_id))
    txt_path = os.path.join(root_dir, '{}.txt'.format(product_id))

    # Reviews of the current product only. The all-variants endpoint is
    # productPageComments.action (with an extra rid=0 parameter).
    url_template = ('https://club.jd.com/comment/skuProductPageComments.action'
                    '?callback=fetchJSON_comment98&productId={}&score=0&sortType=5'
                    '&page={}&pageSize={}&isShadowSku=0&fold=1')
    # Paste the cookie of a logged-in browser session here if it is needed.
    cookie = ''

    header_pending = True   # the CSV header is written with the first page only
    failed_parses = 0       # consecutive unparseable responses for the current page
    while p < maxPage:
        url = url_template.format(product_id, p, page_size)
        headers = {
            'User-Agent': random.choice(agents),
            'Referer': 'https://item.jd.com/{}.html'.format(product_id),
            'Cookie': cookie
        }
        request = urllib.request.Request(url=url, headers=headers)
        time0.sleep(2.5)  # throttle requests
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                # A stray undecodable byte should not cost the whole page.
                content = response.read().decode('gbk', errors='replace')
        except Exception:
            # Network/HTTP failure: report it and move on to the next page.
            # (Unlike a bare ``except``, Ctrl-C still interrupts the run.)
            print('第%d页评论代码出错' % p)
            p = p + 1
            continue

        content = _strip_jsonp(content)
        try:
            obj = json.loads(content)
        except ValueError:
            print('信号不好,再次尝试!')
            print([content])
            print(url)
            failed_parses += 1
            if failed_parses >= max_retries:
                # Give up on this page instead of retrying indefinitely.
                print('第%d页评论代码出错' % p)
                p = p + 1
                failed_parses = 0
            continue
        failed_parses = 0

        # Adopt the real number of pages reported by the server, if any.
        try:
            maxPage = int(obj['maxPage'])
        except (KeyError, TypeError, ValueError):
            pass

        comments = obj.get('comments') or []
        if not comments:
            # An empty page marks the end of the available reviews.
            return []

        summary_info = _summary_info(obj['productCommentSummary'])
        rows = [_comment_to_row(comment, summary_info) for comment in comments]
        _save_rows(rows, csv_path, txt_path, write_header=header_pending)
        header_pending = False

        print('%s-page---finish(%s/%s)' % (p, p, maxPage))
        p = p + 1
    return []


if __name__ == '__main__':
    # Example run: scrape the reviews of a single JD product id.
    phone_id = 100018902008
    product_reviews(phone_id)

 

posted @ 2022-03-25 23:04  哦心有  阅读(56)  评论(0)  编辑  收藏  举报