import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
from requests import adapters
import time
import requests
from spiders.index_url_title_collection import index_title_script
from spiders.script import headers
# Silence urllib3's InsecureRequestWarning, which would otherwise be printed
# for every request made with verify=False below.
requests.packages.urllib3.disable_warnings()
class IndexKeyword(object):
    """Collect product-keyword lists for one search keyword.

    Page URLs come from ``index_title_script``; each page is fetched
    concurrently and the ``productKeywords`` field is extracted from the
    page text with a regex.
    """

    # Pattern for the productKeywords field embedded in the page source,
    # compiled once instead of on every save_content_list call.
    _KEYWORD_RE = re.compile(r'productKeywords\",\"value\":\"(.*?)\"')

    def __init__(self, keyword):
        self.headers = headers
        self.keyword = keyword  # search keyword
        self.data_list = []  # collected results: one list of keywords per page

    def spider(self, url):
        """Fetch *url* with up to 3 attempts and return the decoded body.

        :param url: page URL to fetch.
        :return: decoded response text, or ``""`` when every attempt fails.
            (Previously ``{}`` was returned on failure, which made
            ``save_content_list`` crash inside ``re.findall`` with a
            TypeError; an empty string keeps the falsy contract while
            remaining a valid regex target.)
        """
        for _ in range(3):
            try:
                # timeout prevents a stalled connection from blocking a
                # worker thread forever; verify=False is kept deliberately
                # (warnings are suppressed at module import).
                response = requests.get(
                    url, headers=self.headers, verify=False, timeout=10
                )
                return response.content.decode()
            except (requests.exceptions.RequestException, UnicodeDecodeError):
                # RequestException covers SSL, connection and timeout
                # failures (the old json.JSONDecodeError could never be
                # raised here — no JSON is parsed); retry up to 3 times.
                continue
        return ""

    def save_content_list(self, data):
        """Extract the productKeywords value from *data* and store it.

        :param data: page text returned by :meth:`spider`; falsy or
            non-matching input is ignored.
        """
        matches = self._KEYWORD_RE.findall(data) if data else []
        if matches:
            self.data_list.append(matches[0].split(','))

    def run(self, start_page, end_page, section_id):
        """Crawl result pages ``start_page``..``end_page`` concurrently.

        :param start_page: first results page to fetch.
        :param end_page: last results page to fetch.
        :param section_id: section identifier forwarded to
            ``index_title_script`` when building the URL list.
        :return: ``self.data_list`` with everything collected so far.
        """
        begin = time.time()
        url_list = index_title_script(
            'proc_url', self.keyword, start_page, end_page, section_id
        )
        print(url_list)
        with ThreadPoolExecutor(max_workers=8) as pool:
            # submit(fn, *args): first argument is the callable, the rest
            # are forwarded to it.
            futures = [pool.submit(self.spider, url) for url in url_list]
            for future in as_completed(futures):
                self.save_content_list(future.result())
        print(time.time() - begin)
        return self.data_list