#-*-coding:utf-8-*-
#@Time :2022/3/14 12:49
#@Author:shuaichao
#@File :.py
#@Software: PyCharm
import openpyxl as op
import urllib.request
from bs4 import BeautifulSoup # 网页解析,获悉数据.231
import urllib.request, urllib.error # 制定URL,获取网页数据
import time
import random
import json
list_goodid = [] # product SKU id the comment belongs to
list_id = [] # comment id
list_content = [] # comment text
list_time = [] # comment creation time
list_score = [] # star rating
list_name = [] # reviewer nickname
list_mobileVersion = [] # client used to post ("pc" when field is empty)
list_plusAvailable = [] # membership (JD PLUS) level flag
list_days = [] # days between purchase and review
def askUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    On a URLError the HTTP status code and/or failure reason are printed
    and an empty string is returned, so callers always get a str back.
    """
    headers = {
        # A desktop-browser User-Agent keeps JD from rejecting the request.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            # Fixed: original read `e.reasen`, which raised AttributeError
            # exactly when this diagnostic branch was reached.
            print(e.reason)
    return html
# 爬取网页信息
def get_info(baseurl):
    """Download *baseurl* (UTF-8) and return it parsed as a BeautifulSoup tree."""
    page_html = askUrl(baseurl)
    return BeautifulSoup(page_html, "html.parser")
# soup处理并转换成字符串
def transport(bs, info):
    """Collect every element of soup *bs* carrying CSS class *info*.

    Returns a pair: (the result set itself, its str() rendering).
    """
    matches = bs.find_all(class_=info)
    rendered = str(matches)
    return matches, rendered
def askUrl2(url):
    """Fetch *url* and return the response body decoded as GB2312.

    Twin of askUrl() for JD endpoints that answer in GB2312 rather than
    UTF-8.  On a URLError the status code and/or reason are printed and an
    empty string is returned.
    """
    headers = {
        # A desktop-browser User-Agent keeps JD from rejecting the request.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    request = urllib.request.Request(url, headers=headers)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("gb2312")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            # Fixed: original read `e.reasen`, which raised AttributeError
            # exactly when this diagnostic branch was reached.
            print(e.reason)
    return html
def get_info2(baseurl):
    """Download *baseurl* (GB2312-encoded) and return a parsed BeautifulSoup tree."""
    page_html = askUrl2(baseurl)
    return BeautifulSoup(page_html, "html.parser")
if __name__ == '__main__':
    pagesize = 100  # products per feed page
    list_Goods = []  # SKU ids of every discovered product

    # --- Stage 1: harvest product SKU ids from the JD home feed ----------
    print("开始")
    for i in range(1, 2):
        url = 'https://floor.jd.com/user-v20/feed/get?' \
              'page=' + str(i) + '&pagesize=' + str(pagesize) + \
              '&area=1_2802_0_0&source=pc-home&callback=jsonpMore2Goods&_=1647233103435'
        time.sleep(random.randint(2, 5))  # randomized delay to dodge rate limiting
        res = get_info(url)
        time.sleep(random.randint(2, 5))
        # Response is JSONP: strip the "jsonpMore2Goods(" prefix and the
        # trailing ")" before handing it to the JSON parser.
        response_data = json.loads(res.text.replace('jsonpMore2Goods(', '')[:-1])['data']
        for v in response_data:
            list_Goods.append(v['sku'])

    # --- Stage 2: harvest comments + reviewer info for each product ------
    print("开始获取评论等用户信息")
    for v in list_Goods:
        try:
            time.sleep(random.randint(2, 5))
            print("完成一个商品购买信息收集")
            # First request only probes maxPage (page number is irrelevant).
            url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                          '&productId=' + str(v) + '&score=0&sortType=10&page=10&pageSize=100'
            res = get_info2(url_comment)
            time.sleep(random.randint(2, 5))
            pageCount = json.loads(res.text)['maxPage']
            # NOTE(review): range(1, pageCount) never requests page == pageCount;
            # kept as in the original — confirm whether the last page is wanted.
            for i in range(1, pageCount):
                try:
                    print("中循环成功运行一次")
                    url_comment = 'https://club.jd.com/comment/productPageComments.action?callback=' \
                                  '&productId=' + str(v) + '&score=0&sortType=10&page=' + str(i) + '&pageSize=100'
                    time.sleep(random.randint(2, 5))
                    res = get_info2(url_comment)
                    time.sleep(random.randint(2, 5))
                    response_data = json.loads(res.text)['comments']
                except Exception:  # narrowed from bare except: don't eat KeyboardInterrupt
                    print("中循环报错一次")
                    try:
                        # Retry once with the UTF-8 fetcher in case the
                        # GB2312 decode was what failed.
                        time.sleep(random.randint(2, 5))
                        res = get_info(url_comment)
                        time.sleep(random.randint(2, 5))
                        response_data = json.loads(res.text)['comments']
                    except Exception:
                        print("中循环二次报错")
                        continue  # give up on this page only
                for value in response_data:
                    list_goodid.append(v)
                    list_id.append(value['id'])
                    list_content.append(value['content'])
                    list_time.append(value['creationTime'])
                    list_score.append(value['score'])
                    list_days.append(value['days'])
                    # Empty mobileVersion means the comment came from the PC site.
                    if value['mobileVersion'] != "":
                        list_mobileVersion.append(value['mobileVersion'])
                    else:
                        list_mobileVersion.append("pc")
                    list_name.append(value['nickname'])
                    list_plusAvailable.append(value['plusAvailable'])
        except Exception:  # narrowed from bare except
            print("大循环报错一次")
            continue
        finally:
            # Checkpoint: rewrite the whole workbook after every product so a
            # crash mid-crawl loses at most one product's worth of rows.
            # (The original repeated this block verbatim after the loop as
            # well — that duplicate wrote identical data and was removed.)
            wb = op.Workbook()
            ws = wb['Sheet']
            for idx in range(len(list_id)):
                row = (list_goodid[idx], list_id[idx], list_name[idx],
                       list_content[idx], list_time[idx], list_mobileVersion[idx],
                       list_plusAvailable[idx], list_score[idx], list_days[idx])
                ws.append(row)
            wb.save("./comment1.xlsx")