# (Caption carried over from the original post: "Click to view code".)
# Target listing page: https://sc.chinaz.com/jianli/xiaochengchu.html
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Scrape a resume-template listing page, follow each entry to its detail
# page, and save the first download mirror's archive under ./简历/.

# Listing page whose cards each link to one template's detail page.
url = 'https://sc.chinaz.com/jianli/xiaochengchu.html'
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}
# Upper bound (seconds) for every HTTP request. requests applies no timeout
# by default, so without this a stalled connection would block forever.
timeout = 15

# open() does not create missing parent directories, so make sure the output
# folder exists before the first archive is written.
os.makedirs('./简历', exist_ok=True)

response = requests.get(url=url, headers=headers, timeout=timeout)
# Fail fast on an HTTP error instead of parsing an error page as the listing.
response.raise_for_status()
response.encoding = 'utf-8'
html = etree.HTML(response.text)
# Guard against the parser yielding no tree (e.g. an empty body): treat that
# as "no entries" rather than crashing on None.xpath.
divs = html.xpath('//div[@class="box col3 ws_block"]') if html is not None else []
for div in divs:
    # Resume title and detail-page link; skip cards that lack either one
    # instead of raising IndexError on an empty XPath result.
    titles = div.xpath('./p/a/text()')
    links = div.xpath('./p/a/@href')
    if not titles or not links:
        continue
    name = titles[0].strip()
    # The href may be relative or protocol-relative ("//host/path");
    # resolve it against the listing URL so requests gets an absolute URL.
    nr_url = urljoin(url, links[0])

    # Fetch the detail page for this entry. A failure here only skips this
    # entry so that one bad link does not abort the whole batch.
    try:
        down_page = requests.get(url=nr_url, headers=headers, timeout=timeout)
        down_page.raise_for_status()
    except requests.RequestException as exc:
        print(f'skipped {name!r}: detail page request failed ({exc})')
        continue
    down_res = down_page.text
    down_tree = etree.HTML(down_res)
    if down_tree is None:
        continue

    # Locate the download link. Take only the first match: joining every
    # match into one string would produce an invalid URL if the first list
    # item ever contained more than one anchor.
    d_links = down_tree.xpath('//div[@class="clearfix mt20 downlist"]/ul/li[1]/a/@href')
    if not d_links:
        continue
    d_url = urljoin(nr_url, d_links[0])
    print(d_url)

    # Download before opening the output file so a failed request does not
    # leave an empty or truncated archive on disk.
    try:
        archive = requests.get(url=d_url, headers=headers, timeout=timeout)
        archive.raise_for_status()
    except requests.RequestException as exc:
        print(f'skipped {name!r}: download failed ({exc})')
        continue

    # The title comes from the remote page; replace characters that are path
    # separators or are reserved in file names so the write stays inside the
    # output directory.
    safe_name = ''.join('_' if ch in '\\/:*?"<>|' else ch for ch in name)
    with open(f'./简历/{safe_name}.rar', 'wb') as f:
        f.write(archive.content)