import requests
import re
import os
# Importing etree directly raised an error here (attributed to a version
# issue), so it is obtained through lxml.html instead.
from lxml import html
etree = html.etree  # bind the usual name `etree` to the attribute exposed on lxml.html
from string import punctuation
# headers = {
# 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
#
# }
# for page in range(1,4):
# source= requests.get('https://www.doutula.com/article/list/?page='+str(page),headers=headers).text
# base = etree.HTML(source).xpath('//*[@id="home"]/div/div[2]/a')
# for i in base:
# pic = i.xpath('div[2]/div/img/@data-original')
# title = i.xpath('div[1]/text()')[0]
# title = re.sub('\W', '', title)
#
# print(title, pic)
# if os.path.isdir('E:\\img\\'+title):# check whether the folder already exists
# pass
# else:
# os.mkdir('E:\\img\\'+title)
# if len(pic)!=0:
# for img in pic:
# pic_name = img.split('_')[-1]
# pic_content = requests.get(img,headers=headers).content
# # print(title,pic_name)
# op = open('E:\\img\\'+title+'/'+pic_name,'wb')
# op.write(pic_content)
# op.close()
# headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4356.6 Safari/537.36'
# }# to obtain these header values, look at the request headers (e.g. in the browser's network inspector)
# source = requests.get('http://www.adoutu.com/article/list/2',headers=headers).text
# # print(source)
# # base = etree.HTML(source).xpath('//div[@class="article-img-list row"]')
# # for i in base:
# # srcs = i.xpath('div/img/@src')
# # print(srcs)
# # '/html/body/div/div/div/div[3]/div[2]/div[1]/div/div/a/div[2]/div[1]/img'
# # # # '/html/body/div/div/div/div[3]/div[2]/div[1]/div/div'
# # '/html/body/div[2]/div/div/div[3]/div[2]/div[1]/div/div/a/div[1]/div[1]/span'
# base=etree.HTML(source).xpath('//div[@class="item-content"]')
# for i in base:
# a=i.xpath('div//div/img/@src')
# b=i.xpath('div//div/span/text()')[0]
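# b = re.sub('\W','', b)# regex substitution: \W is the pattern, '' is the replacement text, b is the string being processed; an optional count argument may follow
# if os.path.isdir('E:\\img1\\'+b):
# pass# the folder already exists, so there is nothing to create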
# else:
# os.mkdir('E:\\img1\\'+b)# create the folder
# if len(a)!=0:# check whether a contains any elements
# for img in a:
# pic_name = img.split('0')[-1]# image file name: split the URL on '0' and keep the last piece
# pic_content = requests.get(img,headers=headers).content# download the image URL and keep the response body as raw bytes
# op = open('E:\\img1\\'+b+'/'+pic_name,'wb')
# op.write(pic_content)
# op.close()