案例-爬取站长素材图片:xpath
网站:站长素材(https://sc.chinaz.com)
xpath代码:
import json
import re
from urllib.parse import urljoin, urlparse

import requests
from lxml import etree
# Request headers carrying a desktop Chrome User-Agent string; passed to
# the requests.get calls in this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}
def get_img_url(url, timeout=10):
    """Return the absolute URL of the main image on a detail page.

    Args:
        url: Address of the image detail page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The image URL as a string including its scheme.

    Raises:
        requests.HTTPError: The detail page answered with an error status.
        IndexError: The page contains no node matching the image XPath.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors here instead of failing later on an error page
    # that simply does not contain the expected <img> node.
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    matches = html_tree.xpath(
        '//body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src'
    )
    if not matches:
        # Same exception type as the previous bare [0] lookup, but with
        # enough context to tell which page failed.
        raise IndexError('no image src found on page: {}'.format(url))
    src = matches[0]
    if src.startswith('//'):
        # Protocol-relative src: force https, as this function always did.
        return 'https:{}'.format(src)
    # Already-absolute or path-relative src: resolve against the page URL
    # rather than blindly prefixing "https:".
    return urljoin(url, src)
def down_img(file_name, url, timeout=10):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        file_name: Path of the file to create or overwrite.
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: The server answered with an error status; in
            that case no file is created or modified.
    """
    img_resp = requests.get(url, headers=headers, timeout=timeout)
    # Check the status before opening the file so that an HTTP error body
    # is never saved to disk as if it were image data.
    img_resp.raise_for_status()
    with open(file_name, mode="wb") as f:
        f.write(img_resp.content)
def get_img(url, timeout=10):
    """Download every image linked from one listing page.

    For each entry on the listing page, the detail page is resolved to an
    image URL via ``get_img_url`` and the image is saved through
    ``down_img`` under the name ``<title>.<extension>``.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.HTTPError: The listing page answered with an error status.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    for item in html_tree.xpath('//body/div[3]/div[2]/div'):
        titles = item.xpath('./div/a/@title')
        links = item.xpath('./div/a/@href')
        if not titles or not links:
            # A child div without the expected anchor attributes is skipped
            # instead of aborting the whole page with an IndexError.
            continue
        # urljoin yields the same result as plain concatenation for
        # root-relative hrefs and also copes with absolute ones.
        href_link = urljoin('https://sc.chinaz.com', links[0])
        url_link = get_img_url(href_link)
        # Take the extension from the URL path only, so a query string or
        # fragment cannot leak into the file name.
        url_suffix = urlparse(url_link).path.split('.')[-1]
        # Replace characters that are not allowed in file names on common
        # file systems; titles come straight from the page markup.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', titles[0]).strip()
        file_name = '{}.{}'.format(safe_title, url_suffix)
        down_img(file_name, url_link)
# Pagination: page 1 is the listing URL itself; every later page N is
# addressed by replacing ".html" with "_N.html". range(1, 2835) walks
# pages 1 through 2834.
# NOTE(review): "cc" looks like a placeholder left over from publishing --
# it contains no ".html", so the replacement below is a no-op. Confirm the
# real listing URL before running.
url = "cc"
for i in range(1, 2835):
    get_img(url if i == 1 else url.replace(".html", "_{}.html".format(i)))
浙公网安备 33010602011771号