import requests
from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool
def get_page(offset):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the server answers with status 200 and a
        valid JSON payload; ``None`` on any request failure, non-200 status,
        or undecodable body.
    """
    # NOTE(review): the cookie below appears to embed live session
    # identifiers; consider loading it from configuration instead of
    # keeping it in source control.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
        'cookie': 'tt_webid=6724223385113069069; WEATHER_CITY=%E5%8C%97%E4%BA%AC; tt_webid=6724223385113069069; csrftoken=9e9d6c3be6aabc313dce0c4f1a116047; sso_uid_tt=27219b1c2d00b8a6021444d85d83dc38; toutiao_sso_user=7562e682c093b193cce298f25dd396ba; login_flag=8391d980bfc8a8908e7c6c80596a016c; __tea_sdk__ssid=undefined; _ga=GA1.2.931504366.1565662966; sid_guard=7562e682c093b193cce298f25dd396ba%7C1565663040%7C5126263%7CFri%2C+11-Oct-2019+10%3A21%3A43+GMT; uid_tt=27219b1c2d00b8a6021444d85d83dc38; sid_tt=7562e682c093b193cce298f25dd396ba; sessionid=7562e682c093b193cce298f25dd396ba; uuid="w:443dcb551552404fbfde212f1054c781"; __tasessionId=i5j7qcydf1569292028372; s_v_web_id=1e7e3b52d7bc46698bb26079c99fd83d',
        'pragma': 'no-cache',
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'x-requested-with': 'XMLHttpRequest',
    }
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'en_qc': '1',
        'cur_tab': '1',
        # Disabled in the original script:
        # 'from': 'search_tab',
        # 'pd': 'synthesis',
    }
    query = urlencode(params)
    print(query)
    url = 'https://www.toutiao.com/api/search/content/?' + query

    try:
        # Without a timeout a stalled server would block this worker forever.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Covers connection errors and timeouts alike.
        return None

    if response.status_code != 200:
        return None

    try:
        # Decode once and reuse the result for both the log and the return.
        data = response.json()
    except ValueError:
        # The body was not valid JSON (e.g. an HTML error or captcha page).
        return None

    print(data)
    return data
def get_image(json):
    """Yield one record per downloadable image found in a search response.

    Args:
        json: Decoded search response; expected to be a dict whose ``data``
            entry is a list of result items. Any other value (for example
            ``None`` or an error string from a failed fetch) yields nothing.

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <title>}``, one for
        each entry of an item's ``image_list`` that carries a URL.
    """
    # A failed fetch may hand back None or a non-dict sentinel; treat that
    # the same as a response without data instead of raising AttributeError.
    data = json.get('data') if isinstance(json, dict) else None
    if not data:
        # NOTE(review): the original indentation was lost; this message is
        # assumed to belong to the "no data in response" case - confirm.
        print('Not parse')
        return

    for item in data:
        # Only items that have a title key and a non-empty image list are
        # usable; truthiness also rejects an image_list of None.
        if 'title' not in item or not item.get('image_list'):
            continue
        title = item.get('title')
        for image in item.get('image_list'):
            print(title)
            print(image)
            image_url = image.get('url')
            if not image_url:
                # Nothing to download for this entry.
                continue
            yield {
                'image': image_url,
                'title': title,
            }
# Characters that are path separators, reserved on common filesystems, or
# ASCII control characters; each is replaced by an underscore.
_UNSAFE_DIR_CHARS = str.maketrans({
    **{ch: '_' for ch in '\\/:*?"<>|'},
    **{chr(code): '_' for code in range(32)},
})


def _safe_dirname(title):
    """Return *title* as a single safe directory name ('untitled' if empty)."""
    cleaned = str(title).translate(_UNSAFE_DIR_CHARS).strip(' .')
    return cleaned or 'untitled'


def save_image(item):
    """Download one image into a directory named after its title.

    The file is stored as ``<title>/<md5 of content>.jpg`` relative to the
    current working directory, so identical images are written only once.

    Args:
        item: Dict with an ``image`` entry (the URL to download) and a
            ``title`` entry (used, after sanitising, as the directory name).

    Returns:
        None. Failures are reported on stdout rather than raised.
    """
    image_url = item.get('image')
    if not image_url:
        # No URL means there is nothing to request.
        print('Failed to Save Image')
        return

    # Titles come from the remote service, so strip separators and reserved
    # characters before using one as a path component.
    folder = _safe_dirname(item.get('title') or '')
    # exist_ok avoids the check-then-create race between pool workers.
    os.makedirs(folder, exist_ok=True)

    try:
        # Without a timeout a stalled download would block this worker.
        response = requests.get(image_url, timeout=10)
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed URLs.
        print('Failed to Save Image')
        return

    if response.status_code != 200:
        return

    file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
    try:
        # 'x' mode creates the file atomically and fails if it already
        # exists, replacing the separate exists() check.
        with open(file_path, 'xb') as f:
            f.write(response.content)
    except FileExistsError:
        print('Already Downloaded', file_path)
def main(offset):
    """Process one search page: fetch it, then save every image it lists.

    Args:
        offset: Search offset forwarded unchanged to ``get_page``.
    """
    page = get_page(offset)
    for entry in get_image(page):
        print(entry)
        save_image(entry)
# Inclusive bounds of the page groups to crawl; group N is requested with
# offset N * 20.
GROUP_START = 1
GROUP_END = 1

if __name__ == '__main__':
    # Build the offsets first, then fan them out over a worker pool.
    offsets = [group * 20 for group in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()