图片获取

import requests
import re
import os
from lxml import html  # per the author, importing etree directly raised an error (attributed to a version issue), so it is reached through lxml.html instead
etree = html.etree  # module-level alias for the etree attribute exposed by lxml.html
from string import punctuation

# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
#
# }
# for page in range(1,4):
#     source= requests.get('https://www.doutula.com/article/list/?page='+str(page),headers=headers).text
#     base =  etree.HTML(source).xpath('//*[@id="home"]/div/div[2]/a')
#     for i in base:
#         pic = i.xpath('div[2]/div/img/@data-original')
#         title = i.xpath('div[1]/text()')[0]
#         title = re.sub('\W', '', title)
#
#         print(title, pic)
#         if os.path.isdir('E:\\img\\'+title):# check whether the target folder already exists
#             pass
#         else:
#             os.mkdir('E:\\img\\'+title)
#         if len(pic)!=0:
#             for img in pic:
#                 pic_name = img.split('_')[-1]
#                 pic_content = requests.get(img,headers=headers).content
#                 # print(title,pic_name)
#                 op = open('E:\\img\\'+title+'/'+pic_name,'wb')
#                 op.write(pic_content)
#                 op.close()


# headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'
# }#headers获取方式查看headers
# source = requests.get('http://www.adoutu.com/article/list/2',headers=headers).text
# # print(source)
# # base = etree.HTML(source).xpath('//div[@class="article-img-list row"]')
# # for i in base:
# #     srcs = i.xpath('div/img/@src')
# #     print(srcs)
# # '/html/body/div/div/div/div[3]/div[2]/div[1]/div/div/a/div[2]/div[1]/img'
# # # # '/html/body/div/div/div/div[3]/div[2]/div[1]/div/div'
# # '/html/body/div[2]/div/div/div[3]/div[2]/div[1]/div/div/a/div[1]/div[1]/span'
# base=etree.HTML(source).xpath('//div[@class="item-content"]')
# for i in base:
#     a=i.xpath('div//div/img/@src')
#     b=i.xpath('div//div/span/text()')[0]
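#     b = re.sub('\W','', b)# re.sub(pattern, repl, string): '\W' is the regex pattern, '' is the replacement text, b is the string being processed; an optional count argument may follow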
#     if os.path.isdir('E:\\img1\\'+b):
#         pass# the folder already exists, so there is nothing to create
#     else:
#         os.mkdir('E:\\img1\\'+b)#创建一个文件夹
#         if len(a)!=0:#判断a中有无元素
#             for img in a:
#                 pic_name = img.split('0')[-1]#图片名,按0分割取最后一个元素为图片名
#                 pic_content = requests.get(img,headers=headers).content# request the image link and keep the response body as raw bytes
#                 op = open('E:\\img1\\'+b+'/'+pic_name,'wb')
#                 op.write(pic_content)
#                 op.close()

 

posted @ 2021-01-22 16:51  秋叶落日  阅读(105)  评论(0)  编辑  收藏  举报