import os.path
from urllib.parse import urljoin, urlsplit

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
}
# Directory the downloaded template archives are written to.
OUTPUT_DIR = './jianli'
# Seconds to wait on a single HTTP request before giving up, so a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30
# Characters replaced in titles because they are path separators or are
# rejected in file names on common file systems.
INVALID_FILENAME_CHARS = '\\/:*?"<>|'


def listing_url(page_number):
    """Return the URL of listing page *page_number* (1-based).

    The first page has no numeric suffix; later pages are ``free_<n>.html``.
    """
    if page_number == 1:
        return 'https://sc.chinaz.com/jianli/free.html'
    return 'https://sc.chinaz.com/jianli/free_' + str(page_number) + '.html'


def build_filename(title, download_url):
    """Return *title* plus the file extension found in *download_url*.

    The extension is read from the URL path only, so a query string or
    fragment never ends up in the name. Characters listed in
    INVALID_FILENAME_CHARS are replaced with ``_`` so the title cannot
    introduce extra path components or make ``open`` fail. An empty title
    falls back to ``untitled``.
    """
    extension = os.path.splitext(urlsplit(download_url).path)[1]
    cleaned = ''.join(
        '_' if ch in INVALID_FILENAME_CHARS else ch for ch in title
    ).strip()
    return (cleaned or 'untitled') + extension


def first_match(node, expression):
    """Return the first result of XPath *expression* on *node*, or None.

    None is also returned when *node* itself is None (nothing was parsed).
    """
    if node is None:
        return None
    matches = node.xpath(expression)
    return matches[0] if matches else None


def fetch(session, url):
    """GET *url* through *session* and return the response.

    Raises:
        requests.RequestException: on timeout, connection failure, or an
            HTTP error status.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def download_template(session, entry, page_url):
    """Download the template that listing node *entry* points at.

    Args:
        session: requests session used for all HTTP calls.
        entry: element from the listing page expected to hold ``a/@href``
            (detail page link) and ``a/img/@alt`` (template title).
        page_url: URL of the listing page, used to resolve relative links.

    Returns:
        The name of the file written under OUTPUT_DIR, or None when the
        node is not a template entry or no download link was found.

    Raises:
        requests.RequestException: if a request fails.
        OSError: if the file cannot be written.
    """
    detail_href = first_match(entry, './a/@href')
    title = first_match(entry, './a/img/@alt')
    if detail_href is None or title is None:
        return None

    # Follow the link to the template's own page, which holds the mirrors.
    detail_url = urljoin(page_url, detail_href)
    detail_tree = etree.HTML(fetch(session, detail_url).text)
    download_href = first_match(
        detail_tree, '//div[@class="down_wrap"]/div[2]/ul/li[1]/a/@href'
    )
    if download_href is None:
        return None

    download_url = urljoin(detail_url, download_href)
    file_name = build_filename(title, download_url)
    # Fetch the whole archive before opening the file so a failed request
    # does not leave an empty file behind.
    content = fetch(session, download_url).content
    with open(os.path.join(OUTPUT_DIR, file_name), 'wb') as fp:
        fp.write(content)
    return file_name


def main(first_page=1, last_page=2):
    """Download every free template listed on pages first_page..last_page.

    A failure on one page or one template is reported and skipped so the
    remaining downloads still run.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with requests.Session() as session:
        session.headers.update(HEADERS)
        for page_number in range(first_page, last_page + 1):
            page_url = listing_url(page_number)
            try:
                page = fetch(session, page_url)
            except requests.RequestException as error:
                print("第{0}页获取失败:".format(page_number), error)
                continue
            page.encoding = 'utf-8'
            tree = etree.HTML(page.text)
            entries = [] if tree is None else tree.xpath('//div[@id="main"]/div/div')
            for entry in entries:
                try:
                    file_name = download_template(session, entry, page_url)
                except (requests.RequestException, OSError) as error:
                    print("下载失败:", error)
                    continue
                if file_name is not None:
                    print(file_name, "下载成功")
            print("第{0}页下载成功".format(page_number))
    print('下载完成')


if __name__ == '__main__':
    main()