案例-爬取站长素材图片:xpath

网站:站长素材(https://sc.chinaz.com)

 

xpath代码:

import json
from urllib.parse import urljoin

import requests
from lxml import etree
# Browser-like User-Agent header sent with every request made by this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/54.0.2840.99 Safari/537.36"
    ),
}

def get_img_url(url, timeout=30):
    """Return the absolute URL of the image shown on a detail page.

    Fetches ``url``, parses the response as HTML and reads the ``src``
    attribute of the first ``<img>`` matched by a fixed XPath.

    Args:
        url: Address of the detail page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The image URL, resolved against the final URL of the response so
        that protocol-relative (``//host/path``), relative and already
        absolute ``src`` values all yield a usable address.

    Raises:
        requests.RequestException: The request failed, timed out or
            returned an HTTP error status.
        IndexError: The page contains no image at the expected location.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    # etree.HTML returns None for an empty document; treat that as "no match".
    if html_tree is None:
        matches = []
    else:
        matches = html_tree.xpath(
            '//body/div[3]/div[1]/div[1]/div[2]/div[2]/img/@src'
        )
    if not matches:
        raise IndexError('no image found on detail page: {}'.format(url))
    return urljoin(resp.url, matches[0])

def down_img(file_name, url, timeout=30):
    """Download ``url`` and write the response body to ``file_name``.

    Args:
        file_name: Path of the file to create (opened in binary write mode,
            so an existing file is overwritten).
        url: Address of the resource to download.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The request failed, timed out or
            returned an HTTP error status.
        OSError: The file could not be opened or written.
    """
    img_resp = requests.get(url, headers=headers, timeout=timeout)
    # Check the status before touching the disk so that an HTTP error page
    # is never saved under an image file name.
    img_resp.raise_for_status()
    with open(file_name, mode="wb") as f:
        f.write(img_resp.content)

def get_img(url, timeout=30):
    """Download every image linked from one listing page.

    Fetches ``url``, selects the item containers with a fixed XPath and, for
    each one, reads the link's ``title`` and ``href``, resolves the detail
    page with ``get_img_url`` and saves the image with ``down_img`` under
    ``<title>.<suffix>`` in the current working directory.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the listing page before giving up.

    Raises:
        requests.RequestException: The listing request failed or timed out.
        Any exception raised by ``get_img_url`` or ``down_img`` propagates.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.encoding = 'UTF-8'
    html_tree = etree.HTML(resp.text)
    # etree.HTML returns None for an empty document; nothing to download.
    if html_tree is None:
        return

    # Characters that are rejected in file names on common platforms; a title
    # containing e.g. "/" would otherwise make open() fail.
    unsafe_chars = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    for item in html_tree.xpath('//body/div[3]/div[2]/div'):
        titles = item.xpath('./div/a/@title')
        links = item.xpath('./div/a/@href')
        # Skip containers that do not hold a titled link instead of aborting
        # the whole page on the first one.
        if not titles or not links:
            continue
        # urljoin copes with root-relative, relative and absolute hrefs.
        href_link = urljoin('https://sc.chinaz.com', links[0])
        url_link = get_img_url(href_link)
        url_suffix = url_link.split('.')[-1]
        file_name = '{}.{}'.format(titles[0], url_suffix).translate(unsafe_chars)
        down_img(file_name, url_link)

# Pagination: page 1 is the base URL itself; every later page N is the base
# URL with ".html" replaced by "_N.html".
# NOTE(review): "cc" looks like a placeholder for the real listing URL — confirm.
url = "cc"
for i in range(1, 2835):
    page_url = url if i == 1 else url.replace(".html", "_{}.html".format(i))
    get_img(page_url)

 

 

posted @ 2023-01-05 11:56  屠魔的少年  阅读(8)  评论(0)    收藏  举报