#!/usr/bin/env python
# -*- coding: utf-8 -*-
# file: {NAME}.py
# @author: jory.d
# @contact: dangxusheng163@163.com
# @time: 2021/06/29 22:09
# @desc: crawl images from Baidu image search (image.baidu.com) by keyword and save them locally
import os
import os.path as osp
import json
import re
import time

import requests

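# The request headers below imitate the XHR call a desktop Chrome browser makes
# from the Baidu image search page ('X-Requested-With: XMLHttpRequest' plus the
# Sec-Fetch-* fields); presumably the endpoint is more likely to answer with JSON
# when they are present.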
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Host': 'image.baidu.com',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'X-Requested-With': 'XMLHttpRequest',
    'Accept': 'text/plain, */*; q=0.01',
}
SAVE_ROOT_PATH = './image_face'


def download_img_from_baidu(keyword='手机正面'):
    """Download images for `keyword` (default: "手机正面", i.e. "front of a phone")
    from the Baidu image search JSON endpoint."""
    url = 'https://image.baidu.com/search/acjson'
    per_page_num = 30
    params = {
        "tn": "resultjson_com",
        "logid": "8357122664305590518",
        "ipn": "rj",
        "ct": "201326592",
        "is": "",
        "fp": "result",
        "queryWord": keyword,
        "cl": "2",
        "lm": "-1",
        "ie": "utf-8",
        "oe": "utf-8",
        "adpicid": "",
        "st": "-1",
        "z": "",
        "ic": "0",
        "hd": "",
        "latest": "",
        "copyright": "",
        "word": keyword,
        "s": "",
        "se": "",
        "tab": "",
        "width": "",
        "height": "",
        "face": "0",
        "istype": "2",
        "qc": "",
        "nc": "1",
        "fr": "",
        "expermode": "",
        "nojc": "",
        "pn": 30,
        "rn": f'{per_page_num}',
        "gsm": "1e",
        "1624980770280": ""
    }
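    # Pagination (as the loop below uses these fields): 'pn' appears to be the
    # result offset and 'rn' the page size; 'displayNum' in the response is
    # treated as the total number of matching images.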
    page_num = 30
    idx = 0
    page_idx = 0
    failed_list = []
    while page_idx < page_num:
        # Start from offset 0 so the first page of results is not skipped.
        params['pn'] = per_page_num * page_idx
        r = requests.get(url, params=params, headers=headers)
        if r.status_code == 200:
            cont = r.content.decode('utf-8')
            try:
                print(f'page_idx: {page_idx + 1}/{page_num}')
                json_data = json.loads(cont)
                total = json_data['displayNum']
                # 'displayNum' is the total number of hits; convert it to a page count.
                page_num = (total + per_page_num - 1) // per_page_num
                if 'data' in json_data:
                    for d in json_data['data']:
                        # The 'data' list usually ends with an empty dict, so skip
                        # entries without a 'middleURL'.
                        img_url = d.get('middleURL')
                        if not img_url:
                            continue
                        idx += 1
                        save_filepath = f'{SAVE_ROOT_PATH}/{keyword}/{idx}.jpg'
                        os.makedirs(osp.dirname(save_filepath), exist_ok=True)
                        img_content = requests.get(img_url).content
                        with open(save_filepath, 'wb') as wf:
                            wf.write(img_content)
                        # Treat files larger than 10 KB as successful downloads.
                        if osp.getsize(save_filepath) > 10 * 1024:
                            print(f'{save_filepath} downloaded successfully!')
                            time.sleep(2)
                        else:
                            failed_list.append(img_url)
                page_idx += 1
            except Exception as e:
                # Skip the page on a parse error instead of retrying it forever.
                print(f'json.loads() exception: {e}. page_idx: {page_idx + 1}')
                page_idx += 1
                continue
        else:
            print(f'HTTP {r.status_code} for page_idx: {page_idx + 1}, skipping it.')
            page_idx += 1
    # Retry the downloads that failed the first time.
    still_failed = []
    for img_url in failed_list:
        idx += 1
        save_filepath = f'{SAVE_ROOT_PATH}/{keyword}/{idx}.jpg'
        os.makedirs(osp.dirname(save_filepath), exist_ok=True)
        img_content = requests.get(img_url).content
        with open(save_filepath, 'wb') as wf:
            wf.write(img_content)
        # Use a lower 5 KB threshold for the retry pass.
        if osp.getsize(save_filepath) > 5 * 1024:
            print(f'{save_filepath} downloaded successfully!')
            time.sleep(2)
        else:
            still_failed.append(img_url)
    print(f'failed: {still_failed}')
    print('done.')
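

# A minimal refactoring sketch (not wired into the functions above): both the main
# loop and the retry pass repeat the same download-then-size-check step, which could
# be factored into a helper like this. The name and the `min_size` default are
# illustrative only.
def _fetch_and_save(img_url, save_filepath, min_size=10 * 1024):
    os.makedirs(osp.dirname(save_filepath), exist_ok=True)
    content = requests.get(img_url, timeout=10).content
    with open(save_filepath, 'wb') as wf:
        wf.write(content)
    # Very small files are usually error pages or placeholders, so count them as failures.
    return osp.getsize(save_filepath) >= min_size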


def decode_json_download():
    """Parse image URLs ('objURL') out of a saved Baidu result page and download them."""
    html_path = './phone_1.json'
    keyword = 'phone'
    with open(html_path, 'r', encoding='utf-8') as rf:
        html = rf.read()
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    print(pic_urls)
    print(len(pic_urls))
    for i, each in enumerate(pic_urls):
        print(f'downloading image #{i}, url: {each}')
        try:
            pic = requests.get(each, timeout=10)
        except requests.exceptions.ConnectionError:
            print('[Error] this image cannot be downloaded.')
            continue
        save_path = f'./images/{keyword}_{i}.jpg'
        os.makedirs(osp.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as fp:
            fp.write(pic.content)


if __name__ == '__main__':
    # The keyword below is Chinese for "high-definition faces of Chinese male celebrities".
    download_img_from_baidu(keyword='中国男明星高清人脸')
    # decode_json_download()