# Python 爬图片演练
import requests
from pyquery import PyQuery as pq
import time
import os
import random
# Custom request headers: present the scraper as a desktop Chrome browser.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.132 Safari/537.36'
    ),
}

# Custom proxy mapping handed to requests (scheme -> "host:port").
# Only an 'https' entry is defined here.
proxyip = {
    'https': '180.109.124.30:4216',
}
def page_url(base_url="http://www.netbian.com/meinv/", last_page=9):
    """Build the list of listing-page URLs to crawl.

    The first element is the category front page (``base_url`` itself).
    The remaining elements are the numbered pages ``index_<n>.htm`` for
    ``n`` from 2 through ``last_page`` inclusive.

    Args:
        base_url: Category URL; expected to end with a trailing slash,
            because page names are appended to it directly.
        last_page: Number of the last listing page to include. Values
            below 2 yield only the front page.

    Returns:
        A list of URL strings, front page first.
    """
    url_list = [base_url]
    # Numbered pages start at 2; the front page stands in for page 1.
    url_list.extend(
        base_url + "index_" + str(page_no) + ".htm"
        for page_no in range(2, last_page + 1)
    )
    return url_list
def start_request():
    """Download every image referenced on the listing pages to ``F:\\image``.

    For each URL returned by ``page_url()`` the page is fetched and decoded
    as GBK, the ``<img>`` elements under ``div.list ul li`` are selected,
    and each image's ``src`` is downloaded and saved as ``<n>.jpg``. ``n``
    is a counter starting at 0 that runs across all pages.

    Uses the module-level ``header`` and ``proxyip`` for every request and
    pauses 2 seconds after each image.

    Raises:
        requests.RequestException: if a page or image request fails or
            times out.
        OSError: if the target directory or a file cannot be written.
    """
    save_dir = r"F:\image"
    timeout = 30  # seconds; without a timeout a stalled connection hangs forever

    # Create the target directory once instead of re-checking per image.
    # The original passed decimal 777 as the mode, which is not 0o777.
    os.makedirs(save_dir, exist_ok=True)

    y = 0  # running image index, also used as the file name
    for listing_url in page_url():
        r = requests.get(listing_url, headers=header, proxies=proxyip,
                         timeout=timeout)
        r.encoding = 'GBK'
        # Use .text for the HTML source; .content is for binary payloads.
        doc = pq(r.text)

        # Coarse match: all <img> elements inside the listing.
        for image in doc('div.list ul li img').items():
            # Exact image URL.
            img_url = image.attr('src')
            if not img_url:
                # An <img> without a src has nothing to download.
                continue
            print(img_url)

            img = requests.get(img_url, headers=header, proxies=proxyip,
                               timeout=timeout).content
            path = os.path.join(save_dir, str(y) + ".jpg")
            with open(path, 'wb') as f:
                f.write(img)

            # Throttle after the file is closed, so no handle stays open
            # during the pause.
            time.sleep(2)
            print('正在下载第{}张图片'.format(y))
            print("写入完成")
            y += 1
def main():
    """Script entry point: run the image download via ``start_request()``."""
    start_request()
# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# 效果如下：


浙公网安备 33010602011771号