爬取Macy网用户评价日志(3): 根据url爬取产品信息(一): 爬取comment的设计(具体执行)
- 思路:
1)在这里,我在考虑review的爬取的时候,考虑了两种方法。
①. 直接将review爬取并下载为.json文件。(最终选择方法)
②. 爬取review,然后将review中的字典进行匹配,并直接插入数据库里面。
- 具体执行过程:
- main函数:main函数分为三个部分。
1)mysql抽取:从mysql中抓取所有未请求的url; 创建url列表;
1)查看rank3爬取的mysql数据,即具体产品页面url的数量。目前我爬取的数据已经超过了10000条以上。
因此,需要考虑创建的“rank3 mysql提取类”的提取方法和顺序,以及提取的数量是否python的list可以放得下。
① 考虑python list的容量。
1----------32位python的限制是 536870912 个元素。
2----------64位python的限制是 1152921504606846975 个元素。
就目前来看,64位python的数量是可以放下10万条以上mysql的list的。所以暂时还是考虑使用cursor.fetchall()的方法。
2) comment爬取:依次向基于review的package的request url发送request;
1) 因为评论为package,存在next page,因此要不断向next page发送请求。若有next page,则继续下载并保存基于review包。
2)首先,查看不同产品review的package的url的区别。再对比不同page的review的next page的url的区别。找到request url规律。发现是webId和limit和offset的区别。
第一页的 limit 为 8;从 next page 开始,offset 从 8 起每次增加 30(即 8, 38, 68, ...)。因此只要不断向新 url 发送请求:若返回的 package 中有 review dict,则是有效 url;否则更换 webId,
向新的 prod 的 review url 发送请求。
prod_review = {
'''
https://www.macys.com/shop/product/dkny-pleated-tie-neck-top?ID=7052689&CategoryID=255
https://www.macys.com/xapi/digital/v1/product/7052689/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/xapi/digital/v1/product/7052689/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/shop/product/dkny-pleated-tie-neck-top?ID=7052689&RVI=PDP_5&tdp=cm_choiceId~z7052689~xcm_pos~zPos5
https://www.macys.com/xapi/digital/v1/product/7052689/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/xapi/digital/v1/product/7052689/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/shop/product/style-co-sherpa-lined-zip-up-hoodie-created-for-macys?ID=12647912&CategoryID=255
https://www.macys.com/xapi/digital/v1/product/12647912/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/xapi/digital/v1/product/12647912/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/shop/product/dkny-printed-faux-wrap-top?ID=11835517&CategoryID=255&swatchColor=Black%2Fivory
https://www.macys.com/xapi/digital/v1/product/11835517/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/xapi/digital/v1/product/11835517/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/shop/product/calvin-klein-womens-faux-fur-trim-hooded-puffer-coat-created-for-macys?ID=12459475&CategoryID=269&swatchColor=Dark%20Chianti
https://www.macys.com/xapi/digital/v1/product/12459475/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/xapi/digital/v1/product/12459475/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/xapi/digital/v1/product/12459475/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=38
https://www.macys.com/shop/product/cole-haan-womens-box-quilt-down-puffer-coat?ID=2813247&CategoryID=269&swatchColor=Navy
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=8
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=38
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=68
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=98
page20:
https://www.macys.com/xapi/digital/v1/product/2813247/reviews?_shoppingMode=SITE&_regionCode=US&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset=548
'''
}
3) 具体代码如下:具体查看github. https://github.com/AtwoodZhang/Crawler_of_Product_Comment
① 具体爬虫类:
_04_spider_of_rank4_review_write_txt.py
import random
import requests
from _00_record_of_agent_pool import ua_list
from _00_record_of_small_functions import judge_url_whole2
class MacyRank4Write(object):
    """Fetch one Macy's review-package URL and save the raw JSON to disk.

    One instance is created per request URL; ``run()`` performs the request
    (with bounded retries) and returns True when a non-empty review payload
    was written to ``./prod_review/``.
    """

    def __init__(self, url, prod_id, name_count):
        self.url = url
        self.prod_id = prod_id
        # Sequence tag used to build the output file name for this page.
        self.name_count = name_count
        # Set True by parse_html() once a non-empty review payload is saved.
        self.response_content = False

    def get_html(self, url):
        """Request *url*; on failure retry up to 10 more times.

        BUG FIX: the original ran a fixed 4-pass loop that kept re-requesting
        even after a success (and whose calls were not exception-guarded, so a
        timeout escaped), followed by a second 10-pass loop. Collapsed into a
        single guarded loop that stops as soon as one request succeeds.
        """
        url_new = judge_url_whole2(url)
        try:
            network_status = self.support_request(url=url_new)
        except Exception as e:
            print(e)
            network_status = False
        retries_left = 10
        while network_status is False and retries_left > 0:
            print('request over time,it is %s time repeat request time' % (11 - retries_left))
            try:
                network_status = self.support_request(url=url_new)
            except Exception as e:
                print(e)
            retries_left -= 1

    def support_request(self, url):
        """Send one GET request; parse and persist the body on success.

        Returns True when an HTTP-200 response with a non-empty body was
        handled, False otherwise. May raise on network errors/timeouts
        (handled by get_html).
        """
        headers = {'User-Agent': random.choice(ua_list)}
        response = requests.get(url=url, headers=headers, timeout=3)
        # BUG FIX: original condition was ``response.text != []`` which
        # compares str to list and is always True; test body truthiness.
        if response.status_code == 200 and response.text:
            response.encoding = "utf-8"
            print(response)
            self.parse_html(response.text)
            response.close()
            resp_status = True
        else:
            print("this time request failed")
            resp_status = False
        return resp_status

    def parse_html(self, html):
        """Persist *html* (raw review JSON) unless the review list is empty."""
        # The xAPI payload ends with ..."reviews":[]}} when a page has no
        # reviews; checking that 12-char tail avoids parsing the whole JSON.
        if html[-14:-2] == "\"reviews\":[]":
            string_output = "The {} product's this review page is empty:".format(self.prod_id)
            print(string_output)
            self.response_content = False
        else:
            # File name: zero-padded product id + per-request counter tag.
            prod_id_str = str(self.prod_id)
            file_name0 = prod_id_str.zfill(10)
            file_name1 = self.name_count
            file_name = "./prod_review/" + file_name0 + "_" + file_name1 + ".json"
            print("file_name:", file_name)
            # NOTE(review): assumes ./prod_review/ already exists — confirm.
            with open(file_name, "w", encoding="utf-8") as f:
                f.write(html)
            self.response_content = True

    def run(self):
        """Crawl ``self.url``; return whether a non-empty page was saved."""
        self.get_html(self.url)
        return self.response_content
② main函数:
_04_main.py
import time
import os
import sys
import random
from _00_record_of_small_functions import *
from _04_mysql_of_rank4 import MacyRank4Mysql
from concurrent.futures import ThreadPoolExecutor # 用来构建线程池
from _04_spider_of_rank4_review_write_txt import MacyRank4Write
name_count_1 = 1  # batch counter: incremented once per submitted thread task (file-name prefix)
name_count_2 = 1  # request counter: incremented once per review request (file-name suffix)
# def run():
# # step1. 从数据库中取出需要request的url;
# r4_sql = MacyRank4Mysql()
# r4_sql.select_upper_no_request(table_name='rank3_cate_urls')
# r3_mysql_list = [i for i in r4_sql.cursor.fetchall()]
# r4_sql.database_commit_close()
# print(len(r3_mysql_list))
#
# # step1.2. 首先使用一条数据进行测试;
# # r2_mysql_list = [r2_mysql_list[21]]
# # print(r2_mysql_list)
# # print(len(r2_mysql_list))
#
# # step2. 对url_list中的每一条数据逐一发送爬取请求;
# # 开启多线程;
# with ThreadPoolExecutor(10) as t:
# for i in r3_mysql_list:
# t.submit(send_request, i)
# time.sleep(random.uniform(1, 3))
#
#
# def send_request(url_address):
# m4_write_spider = MacyRank4Write(url=url_address)
# m4_write_spider.run()
def run_write_review():
    """Pull un-requested product rows from MySQL and crawl each row's reviews.

    Submits one send_request task per row to a 40-worker thread pool, pacing
    submissions with a short random sleep.
    """
    # step1. Fetch every row still marked as not-yet-requested.
    r4_sql = MacyRank4Mysql()
    r4_sql.select_upper_no_request(table_name='rank4_prod_specific_info')
    id_mysql_list = [i for i in r4_sql.cursor.fetchall()]
    r4_sql.database_commit_close()
    print("id_mysql_list length: ", len(id_mysql_list))
    # step1.2 Uncomment to test with a small slice of rows first.
    # id_mysql_list = id_mysql_list[1:5]
    # step2. Submit one crawl task per row.
    # NOTE(review): name_count_1 is a plain module global mutated here while
    # worker threads read it inside send_request — not thread-safe, so
    # file-name prefixes may repeat or skip. Confirm whether that matters.
    with ThreadPoolExecutor(40) as t:
        for i in id_mysql_list:
            print("mysql_data(in thread):", i)
            t.submit(send_request, i)
            global name_count_1
            name_count_1 = name_count_1+1
            time.sleep(random.uniform(1, 2))  # pace submissions; the write itself already takes time
def send_request(one_mysql_data):
    """Crawl every review page for one product (id in ``one_mysql_data[0]``).

    Requests page 1 (limit=8), then follows next pages (limit=30 with offset
    8, 38, 68, ...) until a page returns an empty review list, and finally
    marks the row as requested in MySQL.
    """
    # step1. First review request for this product.
    # BUG FIX: the query string contained the mojibake '¤cyCode' ('&curren'
    # collapsed into the ¤ HTML entity); restored the real parameter name
    # '&currencyCode', without which the xAPI query string is malformed.
    url_address = "https://www.macys.com/xapi/digital/v1/product/{}/reviews?_shoppingMode=SITE&_regionCode=US" \
                  "&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=8"
    url_address = url_address.format(one_mysql_data[0])
    print("url_address: ", url_address)
    # NOTE(review): these module globals are mutated from worker threads
    # without a lock — file-name counters may collide. Confirm acceptable.
    global name_count_1, name_count_2
    name_count1 = str(name_count_1).zfill(8) + "_" + str(name_count_2).zfill(8)
    m4_write_spider = MacyRank4Write(url=url_address, prod_id=one_mysql_data[0], name_count=name_count1)
    resp = m4_write_spider.run()
    print(resp)
    name_count_2 = name_count_2 + 1
    # step2. Keep requesting next pages while the previous one had reviews.
    offset = 8
    while resp is True:
        url_address = "https://www.macys.com/xapi/digital/v1/product/{0}/reviews?_shoppingMode=SITE&_regionCode=US" \
                      "&currencyCode=USD&_customerState=GUEST&_deviceType=DESKTOP&sort=NEWEST&limit=30&offset={1}"
        url_address = url_address.format(one_mysql_data[0], offset)
        print("url_address_while: ", url_address)
        name_count1 = str(name_count_1).zfill(8) + "_" + str(name_count_2).zfill(8)
        m4_write_spider = MacyRank4Write(url=url_address, prod_id=one_mysql_data[0], name_count=name_count1)
        resp = m4_write_spider.run()
        offset = offset + 30
        name_count_2 = name_count_2 + 1
    # step3. Record in the database that this product has been requested.
    r4_sql = MacyRank4Mysql()
    r4_sql.update_rank4_request_situation(table_name='rank4_prod_specific_info', prod_id=one_mysql_data[0])
    r4_sql.database_commit_close()
if __name__ == "__main__":
    # step1. Tee stdout/stderr into a timestamped crawl-log file.
    log_path = './prod_crawl_log/'
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    log_file_name = log_path + 'log-' + time.strftime("%Y%m%d-%H%M%S", time.localtime())+'.log'
    sys.stdout = Logger(log_file_name)
    sys.stderr = Logger(log_file_name)
    # step2. Run the crawl and report total elapsed seconds.
    start = time.time()
    run_write_review()
    end = time.time()
    spend_time = end - start
    print("finish crawl rank4:", spend_time)