Web Scraping Basics 02: First Steps with the xpath Library

XPath overview:

1. What is XPath?
    The most commonly used and most efficient parsing approach [first choice for data extraction].
2. How is it used?
    1. Instantiate etree => load the data
    2. Call the API => 1. locate tags 2. extract data
        xxx.xpath('')
        => the XPath expression handles:
            1. tag location
            2. data extraction
                1. a tag's text
                2. a tag's attributes
3. Using the API
    1. tag location
    2. data extraction

pip install lxml
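
To make the two-step workflow concrete, here is a minimal sketch; the HTML snippet is invented for illustration:

from lxml import etree

html = '<html><head><title>demo</title></head><body><div class="adc"><ul><li><a href="/x">hi</a></li></ul></div></body></html>'

# 1. Instantiate: etree.HTML() for an HTML string,
#    etree.parse() for a local file.
root = etree.HTML(html)

# 2. Call the API: one XPath expression both locates tags and extracts data.
print(root.xpath('//title/text()'))   # ['demo'] -> tag text
print(root.xpath('//li/a/@href'))     # ['/x']   -> tag attribute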

XPath scraping examples

1. Using XPath offline (on a local file)


from lxml import etree

if __name__ == '__main__':
    # 1. Instantiate; etree.parse defaults to an XML parser, so pass an
    #    HTMLParser for an HTML file (a raw string avoids backslash escapes)
    root = etree.parse(r"D:\ \python-sk\data\lol.html", etree.HTMLParser())

    # 2. Call the API
    '''
        Writing the XPath expression:
        1. tag location
        2. data extraction
    '''

    '''
    1. Tag location:
        1. absolute path
        2. relative path
        3. attribute-based location
        4. index-based location
    (Aside: I expected the scraping exam to go really deep;
     it turned out to be a single XPath question plus
     creating a basic script.)
    '''


    # API 1 => absolute path => returns a list of Elements
    t_info = root.xpath('/html/head/title')
    print(t_info)

    # API 2 => relative path => can match starting anywhere in the tree
    t_info1 = root.xpath('//title')
    print(t_info1)
    t_info1 = root.xpath('/html//title')
    print(t_info1)
   
    # API 3 => attribute-based location => tag[@attr="xxx"]
    div = root.xpath('//div')
    print(div)

    div_adc = root.xpath('//div[@class="adc"]')
    print(div_adc)

    li_list = root.xpath('//div[@class="adc"]//ul//li')
    print(li_list)

    # API 4 => index-based location => tag[index]; XPath indexes start at 1
    # An XPath query always returns a list, however it is written;
    # indexing the result in Python (e.g. [0]) yields a single Element instead
    li_list1 = root.xpath('//div[@class="adc"]//ul//li[1]')
    print(li_list1)

    li_list1 = root.xpath('//div[@class="adc"]/ul/li[1]/a')
    print(li_list1)

    # li_list2 = root.xpath('//div[@class="adc"]//ul//li')[0]
    # print(li_list2)

    # 2. Data extraction
        # 1. tag text
        # 2. tag attributes
    # text() => text directly under the <a>
    a_text = root.xpath('//div[@class="adc"]//li[1]/a/text()')
    print(a_text)
    # //text() => all text under the <li>, descendants included
    li1_text = root.xpath('//div[@class="adc"]//li[1]//text()')
    print(li1_text)

    # 2. Attribute extraction => tag/@attribute_name
    img_info = root.xpath('//div[@class="top"]//img')
    print(img_info)

    img = root.xpath('//div[@class="top"]//img/@src')
    print(img)
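
The difference between text() and //text() above is easy to miss; here is a tiny self-contained check (the snippet is made up for illustration):

from lxml import etree

# The <li> holds text both directly and inside a child <a>.
li_doc = etree.HTML('<li>outer <a>inner</a> tail</li>')

print(li_doc.xpath('//li/text()'))    # ['outer ', ' tail']          -> direct text only
print(li_doc.xpath('//li//text()'))   # ['outer ', 'inner', ' tail'] -> descendants included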

2. Scraping second-hand housing listings with XPath

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a normal browser
    headers = {
        "User-Agent": "your User-Agent string"
    }
    url = "https://dl.58.com/ershoufang"

    # 1. Fetch the page (generic crawl)
    page_info = requests.get(url,headers=headers)

    # 2. Parse the response
    root = etree.HTML(page_info.text)

    # 3. Locate tags
    div_list = root.xpath('//section[@class="list"]/div')
    print(div_list)

    fp = open(r"D:\ \python-sk\data\二手房.txt", "w", encoding="utf-8")
    for div in div_list:
        # xpath returns a list even for a single match; take [0] for the value
        # (a safer helper is sketched after this example)
        title = div.xpath('./a/div[@class="property-content"]/div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()')[0]
        fp.write(title + "\n")
        print(title, "=> scraped ok")
    fp.close()
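
Taking [0] raises an IndexError whenever a listing is missing the expected node. A small helper keeps the loop from crashing (first() is my own name, not part of lxml):

def first(node, expr, default=""):
    """Return the first XPath match, or a default when nothing matched."""
    results = node.xpath(expr)
    return results[0] if results else default

# Usage inside the loop above (selector shortened for readability):
# title = first(div, './/h3/text()', default="<no title>")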

3. Scraping 58.com rental listings

'''
Exercise
    1. Scrape 58.com rentals
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a normal browser
    headers = {
        "User-Agent": "your User-Agent string"
    }
    url = "https://dl.58.com/chuzu"

    # 1. Fetch the page (generic crawl)
    page_info = requests.get(url=url,headers=headers)

    # 2. Parse the response
    root = etree.HTML(page_info.text)

    # 3. Locate tags
    house_list = root.xpath('//div[@class="list-wrap"]/div[@class="list-box"]/ul/li')
    print(house_list)

    fp = open(r"D:\ \python-sk\data\租房.txt", "w", encoding="utf-8")
    # 4. Extract the data; try/except keeps one bad item from stopping the loop
    for el in house_list:
        try:
            message = el.xpath('./div[@class="des"]/h2/a/text()')[0]
            fp.write(message + "\n")
            print(message, "scraped ok")
        except Exception as e:
            print(e)
    fp.close()

4. Scraping 58.com used cars [a trap; beware, don't bother scraping it]

'''
Exercise
    2. Scrape 58.com used cars
    There are traps here and it doesn't really work:
    the price is rendered by a UI component (custom font),
    and 58 blocks you constantly; after a single request you're cut off
    and further requests return empty data, so treat this code as read-only.
'''

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a normal browser
    headers = {
        "User-Agent": "your User-Agent string"
    }
    url = "https://dl.58.com/ershouche"

    # 1. Fetch the page (generic crawl)
    page_info = requests.get(url=url,headers=headers)

    # 2. Parse the response
    root = etree.HTML(page_info.text)

    # 3. Locate tags
    car_list = root.xpath('//div[@class="list-wrap"]/ul/li')
    print(car_list)
    # 4. Extract the data (left commented out; see the note above)
    # for el in car_list:
    #     message = el.xpath('./div[@class="info--wrap"]/a/div')
    #     print(message)

5. Scraping images

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a normal browser
    headers = {
        "User-Agent": "your User-Agent string"
    }

    url = "https://pic.netbian.com/4kmeinv/"

    # url = "https://pic.netbian.com/4kdongman"

    # 1. Fetch the page (generic crawl)
    page_info = requests.get(url=url, headers=headers)
    page_info.encoding = "gbk"   # the site serves GBK pages (see the note after this example)
    print(page_info)
    # 2. Parse the response
    root = etree.HTML(page_info.text)

    # 3. Locate tags
    li_list = root.xpath('//div[@class="slist"]/ul/li')

    for el in li_list:
        img_url = "https://pic.netbian.com"+el.xpath('./a/img/@src')[0]
        img_title = el.xpath('./a/img/@alt')[0]
        print(img_url)

        # Fetch the image itself; .content is the raw binary body
        img_response = requests.get(url=img_url, headers=headers)
        img_data = img_response.content
        with open(rf"D:\ \python-sk\data\img\{img_title}.jpg", "wb") as fp:
            fp.write(img_data)
            print(img_title, "scraped ok")
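
Hard-coding "gbk" works for this site, but requests can also guess the charset from the response body via its apparent_encoding attribute; a minimal sketch:

import requests

resp = requests.get("https://pic.netbian.com/4kmeinv/",
                    headers={"User-Agent": "your User-Agent string"})
# apparent_encoding inspects the body to guess the charset,
# so the encoding doesn't have to be hard-coded per site.
resp.encoding = resp.apparent_encoding
print(resp.encoding)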

6. Scraping city names and deduplicating

import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a normal browser
    headers = {
        "User-Agent": "your User-Agent string"
    }
    url = "http://www.air-level.com/"

    # 1. Fetch the page (generic crawl)
    page_info = requests.get(url=url, headers=headers)

    # 2. Parse the response
    root = etree.HTML(page_info.text)

    # 3. Locate tags
    a_list = root.xpath('//div[@id="citylist"]/div[@class="citynames"]/a')
    # print(a_list)

    fp = open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8")


    # Dedup method 1: use a set
    s1 = set()
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        s1.add(a_text)
    fp.write(str(s1))

    # Dedup method 2: accumulate one big string and test with `in` / `not in`
    # (note: a name that is a substring of an already-written name would be
    # skipped incorrectly, so the set approach is safer)
    st = ""
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        print(a_text)
        if a_text not in st:
            st = st + a_text
            fp.write(a_text)
            print("wrote one name")
        else:
            print("name already present")

    fp.close()
    print("successful")

    '''
    Homework: dedupe cityname
        1. drop the "featured cities" block first, or
        2. scrape everything, then dedupe [recommended] (see the sketch below)
    '''
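
A minimal sketch of the recommended approach (scrape everything, then dedupe) that also preserves first-seen order; the selector is the same one used above:

import requests
from lxml import etree

headers = {"User-Agent": "your User-Agent string"}
page = requests.get("http://www.air-level.com/", headers=headers)
root = etree.HTML(page.text)

names = root.xpath('//div[@id="citylist"]/div[@class="citynames"]/a/text()')

# dict.fromkeys dedupes while keeping first-seen order (Python 3.7+)
unique_names = list(dict.fromkeys(names))
print(len(names), "->", len(unique_names))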