urllib3爬取流程

利用urllib3爬取百度图片首页图片

import os
import urllib3
import re
from fake_useragent import UserAgent
# Pick one random browser User-Agent string and reuse it for every request.
user_agent = UserAgent()
ua = user_agent.random
headers = {
    "user-agent": ua
}

# 1. Locate the target data: the Baidu image search result page.
url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=index&pos=history&word=meinv'

# 2. Fetch the page. The headers are sent here as well, so the search request
#    carries the same User-Agent as the image downloads below.
manager = urllib3.PoolManager()
r = manager.request('GET', url, headers=headers)
# Only ASCII URLs are extracted from the text, so a stray undecodable byte
# elsewhere in the page should not abort the whole run.
text = r.data.decode('utf-8', errors='replace')

# 3. Extract every thumbnail URL embedded in the page source.
html_img = re.findall(r'"thumbURL":"(.*?)"', text)

# Create the output directory once, before the download loop.
os.makedirs("meinv", exist_ok=True)

image = enumerate(html_img)  # pair each URL with its position in the page
for index, value in image:
    if not value:
        # An empty capture is not a requestable URL; skip it.
        continue
    r = manager.request('GET', value, headers=headers)
    if r.status != 200:
        # Do not save an error response body under a .jpg name.
        continue

    # 4. Save the raw response bytes, named after the URL's index.
    ima_data = r.data
    img_file = f"meinv/{index}.jpg"
    with open(img_file, "wb") as f:
        f.write(ima_data)

 

posted @ 2020-11-20 22:06  瑾年ぺ  阅读(210)  评论(0)    收藏  举报