Crawler Basics 02: First Steps with the XPath Library
XPath overview:
1. XPath:
    the most commonly used and most efficient parsing method [first choice for data parsing]
2. How is it used?
    1. instantiate etree => load the data
    2. call the API => 1. tag locating 2. data parsing
        xxx.xpath('')
        => XPath expressions do two things:
            1. tag locating
            2. data parsing
                1. a tag's text
                2. a tag's attributes
3. API usage (see the minimal sketch after the install line below)
    1. tag locating
    2. data parsing
pip install lxml
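Before the full examples, a minimal sketch of the two-step flow described above (instantiate etree, then call .xpath()). It uses a tiny inline HTML string so it runs without any file or network access:

from lxml import etree

html = '<html><body><div class="adc"><a href="/x">hello</a></div></body></html>'
root = etree.HTML(html)          # 1. instantiate etree => load the data

print(root.xpath('//a/text()'))  # 2. parse data: tag text  => ['hello']
print(root.xpath('//a/@href'))   #    parse data: attribute => ['/x']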
XPath crawler examples
1. Using XPath offline (parsing a local HTML file)
from lxml import etree

if __name__ == '__main__':
    # 1. Instantiate: parse the local file; pass an HTMLParser, since
    #    etree.parse defaults to strict XML and chokes on real-world HTML
    root = etree.parse(r"D:\ \python-sk\data\lol.html", etree.HTMLParser())
    # 2. Call the API
    '''
    Writing XPath expressions:
    1. tag locating
    2. data parsing
    '''
    '''
    1. Tag locating:
        1. absolute path
        2. relative path
        3. attribute locating
        4. index locating
    (Aside: I expected the crawler exam to go really deep,
     but it turned out to be just one XPath question
     plus basic script creation.)
    '''
    # api 1 => absolute path => returns a list of Elements
    t_info = root.xpath('/html/head/title')
    print(t_info)
    # api 2 => relative path => can locate from anywhere in the tree
    t_info1 = root.xpath('//title')
    print(t_info1)
    t_info1 = root.xpath('/html//title')
    print(t_info1)
    # api 3 => attribute locating => tag[@attribute="xxx"]
    div = root.xpath('//div')
    print(div)
    div_adc = root.xpath('//div[@class="adc"]')
    print(div_adc)
    li_list = root.xpath('//div[@class="adc"]//ul//li')
    print(li_list)
    # api 4 => index locating => indices start at 1: tag[index]
    # xpath always returns a list no matter how the expression is written,
    # but indexing the result with [0] in Python yields a single Element
    li_list1 = root.xpath('//div[@class="adc"]//ul//li[1]')
    print(li_list1)
    li_list1 = root.xpath('//div[@class="adc"]/ul/li[1]/a')
    print(li_list1)
    # li_list2 = root.xpath('//div[@class="adc"]//ul//li')[0]
    # print(li_list2)
    # 2. Data parsing
    #    1. tag text
    #    2. tag attributes
    # text directly under the <a>
    a_text = root.xpath('//div[@class="adc"]//li[1]/a/text()')
    print(a_text)
    # all text under the <li>, including descendants
    li1_text = root.xpath('//div[@class="adc"]//li[1]//text()')
    print(li1_text)
    # 2. attribute parsing => tag/@attribute_name
    img_info = root.xpath('//div[@class="top"]//img')
    print(img_info)
    img = root.xpath('//div[@class="top"]//img/@src')
    print(img)
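Each item in the lists printed above is an lxml Element object. A minimal sketch of inspecting one directly, using a tiny made-up inline HTML string instead of the local lol.html so it runs standalone:

from lxml import etree

html = '<div class="adc"><ul><li><a href="/hero/1">Garen</a></li></ul></div>'
root = etree.HTML(html)

a = root.xpath('//div[@class="adc"]//li[1]/a')[0]  # [0] picks the Element out of the list
print(a.tag)          # a
print(a.text)         # Garen
print(a.get('href'))  # /hero/1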
2. Scraping second-hand housing listings with XPath
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a browser
    headers = {
        "User-Agent": "your UA string here"
    }
    url = "https://dl.58.com/ershoufang"
    # 1. General-purpose crawl
    page_info = requests.get(url, headers=headers)
    # 2. Data parsing
    root = etree.HTML(page_info.text)
    # 3. Tag locating
    div_list = root.xpath('//section[@class="list"]/div')
    print(div_list)
    fp = open(r"D:\ \python-sk\data\二手房.txt", "w", encoding="utf-8")
    for div in div_list:
        # tag locating returns a list; take [0] to get the string itself
        title = div.xpath('./a/div[@class="property-content"]/div[@class="property-content-detail"]/div[@class="property-content-title"]/h3/text()')[0]
        fp.write(title + "\n")
        print(title, "=> scraped ok")
    fp.close()
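One caveat about the [0] indexing used above: it raises an IndexError whenever the XPath matches nothing. A small defensive sketch (first is a hypothetical helper, not part of lxml):

def first(results, default=""):
    # return the first XPath match, or a fallback when the list is empty
    return results[0] if results else default

# usage inside the loop above: title = first(div.xpath('./a//h3/text()'))
print(first([], default="(no title)"))  # prints '(no title)' instead of crashing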
3. Scraping 58.com rental listings
'''
Exercise
1. Scrape 58.com rentals
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a browser
    headers = {
        "User-Agent": "your UA string here"
    }
    url = "https://dl.58.com/chuzu"
    # 1. General-purpose crawl
    page_info = requests.get(url=url, headers=headers)
    # 2. Data parsing
    root = etree.HTML(page_info.text)
    # 3. Tag locating
    house_list = root.xpath('//div[@class="list-wrap"]/div[@class="list-box"]/ul/li')
    print(house_list)
    fp = open(r"D:\ \python-sk\data\租房.txt", "w", encoding="utf-8")
    # 4. Data parsing; try/except catches listings that fail to parse
    for el in house_list:
        try:
            message = el.xpath('./div[@class="des"]/h2/a/text()')[0]
            # print(message + "\n")
            fp.write(message + "\n")
            print(message, "scraped ok")
        except Exception as e:
            print(e)
    fp.close()
4. Scraping 58.com used cars [a trap, beware, don't scrape this one]
'''
Exercise
2. Scrape 58.com used cars
There is a trap here and it doesn't really work: the sale price
is drawn by a UI component, not plain text in the HTML.
On top of that, 58 walls you constantly; after a single request it stops responding.
At this point you can only read the code: I am already walled,
and the requests that go out come back with empty data.
'''
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a browser
    headers = {
        "User-Agent": "your UA string here"
    }
    url = "https://dl.58.com/ershouche"
    # 1. General-purpose crawl
    page_info = requests.get(url=url, headers=headers)
    # 2. Data parsing
    root = etree.HTML(page_info.text)
    # 3. Tag locating
    car_list = root.xpath('//div[@class="list-wrap"]/ul/li')
    print(car_list)
    # 4. Data parsing
    # for el in car_list:
    #     message = el.xpath('./div[@class="info--wrap"]/a/div')
    #     print(message)
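Since the complaint above is that requests come back empty once 58 walls you, it is worth a quick sanity check on the response before parsing. A rough sketch; the 1000-character threshold is an arbitrary assumption, not anything 58 documents:

import requests

headers = {"User-Agent": "your UA string here"}
resp = requests.get("https://dl.58.com/ershouche", headers=headers)

# a blocked/walled response tends to be non-200 or suspiciously short
if resp.status_code != 200 or len(resp.text) < 1000:
    print("likely blocked:", resp.status_code, len(resp.text))
else:
    print("looks like a real page:", len(resp.text), "characters of HTML")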
5. Scraping images
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a browser
    headers = {
        "User-Agent": "your UA string here"
    }
    url = "https://pic.netbian.com/4kmeinv/"
    # url = "https://pic.netbian.com/4kdongman"
    # 1. General-purpose crawl
    page_info = requests.get(url=url, headers=headers)
    page_info.encoding = "gbk"  # the site serves GBK-encoded pages
    print(page_info)
    # 2. Data parsing
    root = etree.HTML(page_info.text)
    # 3. Tag locating
    li_list = root.xpath('//div[@class="slist"]/ul/li')
    for el in li_list:
        img_url = "https://pic.netbian.com" + el.xpath('./a/img/@src')[0]
        img_title = el.xpath('./a/img/@alt')[0]
        print(img_url)
        # 1. General-purpose crawl: fetch the image itself
        img_response = requests.get(url=img_url, headers=headers)
        img_data = img_response.content  # binary payload, not .text
        with open(rf"D:\ \python-sk\data\img\{img_title}.jpg", "wb") as fp:
            fp.write(img_data)
        print(img_title, "scraped ok")
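Two gotchas with the image block above: open(..., "wb") raises FileNotFoundError if the img folder does not exist yet, and the alt text used as a filename may contain characters Windows forbids. A small guard sketch using only the standard library (safe_filename is a hypothetical helper):

import os
import re

save_dir = r"D:\ \python-sk\data\img"
os.makedirs(save_dir, exist_ok=True)  # create the folder if it is missing

def safe_filename(name):
    # replace characters Windows rejects in filenames
    return re.sub(r'[\\/:*?"<>|]', "_", name)

path = os.path.join(save_dir, safe_filename('some/alt:text') + ".jpg")
print(path)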
6. Scraping city names and de-duplicating
import requests
from lxml import etree

if __name__ == '__main__':
    # UA spoofing => pretend to be a browser
    headers = {
        "User-Agent": "your UA string here"
    }
    url = "http://www.air-level.com/"
    # 1. General-purpose crawl
    page_info = requests.get(url=url, headers=headers)
    # 2. Data parsing
    root = etree.HTML(page_info.text)
    # 3. Tag locating
    a_list = root.xpath('//div[@id="citylist"]/div[@class="citynames"]/a')
    # print(a_list)
    fp = open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8")
    # de-duplication method 1: collect the names into a set
    s1 = set()
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        s1.add(a_text)
    fp.write(str(s1))
    # de-duplication method 2: one big string, membership tested with in / not in
    st = ""
    for a in a_list:
        a_text = a.xpath('./text()')[0]
        print(a_text)
        if a_text not in st:
            st = st + a_text
            fp.write(a_text)
            print("wrote one city name")
        else:
            print("this name already exists")
    fp.close()
    print("successful")
'''
Homework: de-duplicate cityname
1. drop the featured (hot) cities block before scraping
2. scrape everything first, then de-duplicate [recommended]
(a sketch of the recommended approach follows)
'''
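A sketch of the recommended "scrape everything first, then de-duplicate" approach. The city list below is made up for illustration; dict.fromkeys drops repeats while keeping first-seen order, which a plain set does not:

# assume these were collected by the a.xpath('./text()')[0] loop above
city_names = ["北京", "上海", "北京", "大连", "上海"]

unique_names = list(dict.fromkeys(city_names))  # order-preserving de-dup

with open(r"D:\ \python-sk\data\cityname.txt", "w", encoding="utf-8") as fp:
    for name in unique_names:
        fp.write(name + "\n")
print(unique_names)  # ['北京', '上海', '大连']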
