Python Web Scraping: XPath Parsing

Preface

XPath is arguably the most common, convenient, and efficient parsing approach, and it is also highly general-purpose.

Environment Setup

pip install lxml
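
To confirm the install worked, a quick one-liner (LXML_VERSION is lxml's version tuple):

python -c "from lxml import etree; print(etree.LXML_VERSION)"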

How XPath Parsing Works

1. Instantiate an etree object and load the page source to be parsed into it.
2. Call the etree object's xpath method with an XPath expression to locate tags and capture their content (see the sketch below).
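
A minimal sketch of those two steps against an inline HTML string (the HTML and the expressions here are made up for illustration; etree.parse() can load a local file instead):

from lxml import etree

# Step 1: instantiate an etree object and load the page source into it
html = '<html><body><ul><li><a href="/a">first</a></li><li><a href="/b">second</a></li></ul></body></html>'
tree = etree.HTML(html)

# Step 2: call xpath() with an expression to locate tags and capture content
print(tree.xpath('//ul/li/a/text()'))  # ['first', 'second']
print(tree.xpath('//ul/li/a/@href'))   # ['/a', '/b']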

XPath Tutorial

A more detailed tutorial is available here.
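
As a quick reference, the expressions used throughout the practices below boil down to a handful of patterns (the sample HTML is made up for illustration):

from lxml import etree

html = '<div class="song"><p>A</p><p>B</p><a href="/x">link</a></div>'
tree = etree.HTML(html)

tree.xpath('/html/body/div')           # absolute path from the root
tree.xpath('//div[@class="song"]')     # attribute filter, matched anywhere
tree.xpath('//div/p[1]/text()')        # indexing starts at 1, not 0
tree.xpath('//a/@href')                # take an attribute's value
tree.xpath('//p/text() | //a/text()')  # union of two expressions with |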

XPath in Practice

XPath Practice 1: Scraping 58.com Second-Hand Housing Titles
"""
XPath Practice 1: scraping 58.com second-hand housing listings
"""

import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL
    url = 'https://bj.58.com/ershoufang/'
    # Fetch the page
    page = requests.get(url=url, headers=headers)
    page.encoding = 'utf-8'
    # Parse the data
    tree = etree.HTML(page.text)
    title_list = tree.xpath('//section[@class="list"]/div/a/div[2]/div[1]/div[1]/h3/text()')
    print(title_list)
    with open("../data2/58二手.text", 'wb') as fp:
        for title in title_list:
            # 这里似乎是字节类型的
            fp.write(bytes(title, 'utf-8'))
            fp.write(bytes('\n','utf-8'))
        print("写入结束!")
XPath Practice 2: Scraping High-Resolution Images
"""
XPath Practice 2: scraping high-resolution images
We previously scraped thumbnails from wallhaven with regular expressions, and they didn't look very sharp;
here we scrape the full-resolution images instead.
"""

import os
import requests
from lxml import etree


# Check that the requested page range is valid (end page is inclusive)
def judge_num(num1, num2):
    return 0 < num1 <= num2


if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Base URL
    base_url = 'https://wallhaven.cc/'

    # Search keyword
    search_words = input("Enter the image tag to search for (e.g. WLOP):\n")
    # Build the search URL
    # '&sorting=views&order=desc' sorts results by view count;
    # you can drop it, but I trust the taste of the crowd
    # '&categories=111&purity=111' appears to control the category and purity filters
    base_url2 = base_url + 'search?q=' + search_words + '&categories=111&purity=111' + '&sorting=views&order=desc'
    # Page range to scrape: start and end pages
    num1 = int(input("Enter the start page:\n"))
    num2 = int(input("Enter the end page:\n"))
    while not judge_num(num1, num2):
        print("Invalid input!!!\n")
        num1 = int(input("Re-enter the start page:\n"))
        num2 = int(input("Re-enter the end page:\n"))
    # Set up the output directory: a wallhaven folder with a subfolder named
    # after the keyword (makedirs also creates any missing parent directories)
    path = '../data2/wallhaven/' + search_words
    os.makedirs(path, exist_ok=True)

    # Collect links to each image's detail page
    img_href_list = []
    for i in range(num1, num2 + 1):  # end page inclusive
        # Listing page for this page number
        url = base_url2 + '&page=' + str(i)
        # Fetch the current page
        page = requests.get(url=url, headers=headers)
        page.encoding = 'utf-8'
        # Parse with XPath
        tree = etree.HTML(page.text)
        img_href_list += tree.xpath('//a[@class="preview"]/@href')
        # print(img_href_list)

    # Get the full-resolution image URL from each detail page
    img_src_list = []
    for img_url in img_href_list:
        img_page = requests.get(url=img_url, headers=headers)
        img_page.encoding = 'utf-8'
        tree = etree.HTML(img_page.text)
        img_src_list += tree.xpath('//img[@id="wallpaper"]/@src')
        # print(img_src_list)

    # Save the high-resolution images
    img_num = 0
    for img_src in img_src_list:
        # Fetch the raw image bytes
        img_data = requests.get(url=img_src, headers=headers).content
        # Generate a file name (you could also keep the name from the URL;
        # note that not every wallpaper is actually a .jpg)
        img_num += 1
        img_name = search_words + str(img_num) + '.jpg'
        with open(path + '/' + img_name, 'wb') as fp:
            fp.write(img_data)
        print(img_name, 'downloaded!')
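
String concatenation works, but requests can also assemble the query string itself via its params argument, which handles URL-encoding too. A small sketch of the same search request (parameter names are taken from the URL built above; the keyword value is an example):

import requests

params = {
    'q': 'WLOP',          # search keyword (example value)
    'categories': '111',
    'purity': '111',
    'sorting': 'views',
    'order': 'desc',
    'page': 1,
}
# requests encodes the values and joins them with & automatically
page = requests.get('https://wallhaven.cc/search', params=params,
                    headers={'User-Agent': 'Mozilla/5.0'})
print(page.url)  # the fully assembled URL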
XPath Practice 3: Scraping Nationwide City Names
"""
XPath Practice 3: scraping city names nationwide
"""
import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL
    url = 'https://www.aqistudy.cn/historydata/'
    page = requests.get(url=url, headers=headers)
    # Parse
    tree = etree.HTML(page.text)
    hot_city_list = tree.xpath('//div[@class="row"]/div[1]/div[@class="hot"]/div[2]/ul/li/a/text()')
    all_city_list = tree.xpath('//div[@class="row"]/div[1]/div[@class="all"]/div[2]/ul/div[2]/li/a/text()')
    # Note that this took two separate XPath queries.
    # How can we get everything in a single query? Join the two expressions with the | union operator:
    all_city_list2 = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')

    print(hot_city_list)
    print(all_city_list)
    print(all_city_list2)
XPath Practice 4: Scraping Free Resume Templates from 站长素材
"""
XPath Practice 4: scraping free resume templates from 站长素材 (sc.chinaz.com)
"https://sc.chinaz.com"
url = "https://aspx.sc.chinaz.com/query.aspx?keyword=免费&classID=864"
"""
import os
import requests
from lxml import etree

if __name__ == '__main__':
    # Spoof the User-Agent
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
                      "Version/14.1 Safari/605.1.15 "
    }
    # Target URL (page 1 of the search results)
    url = 'https://aspx.sc.chinaz.com/query.aspx?keyword=免费&classID=864&page=1'
    page = requests.get(url=url, headers=headers)
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    jianjie_list = tree.xpath('//div[@id="container"]/div/a/@href')
    jianjie_list = ['https:' + i for i in jianjie_list]

    # Get the download links
    download_list = []
    for detail_jianjie in jianjie_list:
        detail_page = requests.get(url=detail_jianjie, headers=headers)
        detail_page.encoding = 'utf-8'
        tree = etree.HTML(detail_page.text)
        download_list += tree.xpath('//div[@id="down"]/div[2]/ul/li[4]/a/@href')
        # print(download_list)

    # Persist to disk (makedirs also creates any missing parent directories)
    os.makedirs('../data2/jianjie', exist_ok=True)
    download_num = 0
    for download in download_list:
        download_num += 1
        jianjie_data = requests.get(url=download, headers=headers).content
        jianjie_name = 'jianjie_' + str(download_num) + '.rar'
        with open('../data2/jianjie/' + jianjie_name, 'wb') as fp:
            fp.write(jianjie_data)
        print(jianjie_name, 'downloaded!')
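
The URL above is pinned to page=1. A sketch of extending the same request over several result pages, reusing the page parameter visible in the URL format above (the page range here is illustrative, and headers comes from the script above):

# Collect detail-page links across result pages 1-3
jianjie_list = []
for page_num in range(1, 4):
    url = 'https://aspx.sc.chinaz.com/query.aspx?keyword=免费&classID=864&page=' + str(page_num)
    page = requests.get(url=url, headers=headers)
    page.encoding = 'utf-8'
    tree = etree.HTML(page.text)
    jianjie_list += ['https:' + i for i in tree.xpath('//div[@id="container"]/div/a/@href')]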

posted on 2022-03-11 20:48 by S++