xpath
解析原理
- 实例化一个etree对象,并将待解析的页面源码数据加载到该对象中
- 使用etree对象中的xpath方法结合着xpath表达式进行标签定位和数据提取
实例化etree对象
- etree.parse('本地文件路径')
- etree.HTML(page_text)
相关方法
from lxml import etree

# Parse a local HTML file into an element tree.
# NOTE(review): etree.parse uses the XML parser unless told otherwise, so the
# file presumably has to be well-formed; confirm, or pass etree.HTMLParser().
tree = etree.parse('./test.html')

# Locate the <title> tag: full path, descendant step, or anywhere in the document.
tree.xpath('/html/head/title')
tree.xpath('/html//title')
tree.xpath('//title')

# Locate the div whose class attribute is "song".
# The attribute value uses double quotes so it does not end the Python string.
tree.xpath('//div[@class="song"]')
# Positional predicate; XPath indices start at 1.
tree.xpath('//div[2]')

# Text extraction:
#   /text()  -> text that is a direct child of the element
#   //text() -> text of the element and of all its descendants
tree.xpath('//div[@class="song"]/p[1]/text()')[0]
tree.xpath('//div[@class="song"]//text()')

# Attribute extraction.
tree.xpath('//a/@href')
爬取Boss直聘信息示例
# Boss Zhipin: print the title, salary and company of every job in the listing.
import requests
from lxml import etree

url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
}

# Fetch the search-result page and build an element tree from its HTML.
response = requests.get(url=url, headers=headers)
tree = etree.HTML(response.text)

# Each <li> under the job list is one posting; the inner XPath expressions
# start with "." so they are evaluated relative to that <li>.
for job in tree.xpath('//div[@class="job-list"]/ul/li'):
    title = job.xpath('.//div[@class="job-title"]/text()')[0]
    salary = job.xpath('.//span[@class="red"]/text()')[0]
    company = job.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    print(''.join((title, salary, company)))
乱码问题
遇到乱码问题的解决办法:
import os
from urllib import request

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}

start_page = int(input('start_page num'))
end_page = int(input('end_page num'))
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# Make sure the download directory exists.
os.makedirs('./girls', exist_ok=True)

# NOTE: range() excludes end_page, so pages start_page .. end_page - 1 are fetched.
for page in range(start_page, end_page):
    # The first listing page has no index suffix in its URL.
    new_url = 'http://pic.netbian.com/4kmeinv/' if page == 1 else url % page

    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict in the query string
    # instead of sending it as request headers.
    response = requests.get(new_url, headers=headers)

    # Garbled-text fix, option 1 (not used here): set response.encoding to the
    # page's real charset before reading response.text.
    page_text = response.text
    tree = etree.HTML(page_text)

    for li in tree.xpath('//div[@class="slist"]/ul/li'):
        img_name = li.xpath('./a/img/@alt')[0]
        # Garbled-text fix, option 2: turn the mis-decoded text back into bytes
        # via ISO-8859-1 and decode those bytes as GBK.
        # NOTE(review): presumably the page is GBK but was decoded as ISO-8859-1 - confirm.
        img_name = img_name.encode('iso-8859-1').decode('gbk') + '.jpg'
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = './girls/' + img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, '下载成功!!!')
附:xpath运算符 | 的使用
import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}

# headers must be passed by keyword: the second positional parameter of
# requests.get is `params`, not `headers`.
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)

# The | operator returns the union of both location paths, so the link texts
# matched under ul/div[2]/li and those matched under ul/li come back in one list.
tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text() | //div[@class="bottom"]/ul/li/a/text()')
例://book | //cd:返回文档中所有 book 元素和所有 cd 元素组成的节点集(即两个路径结果的并集)

浙公网安备 33010602011771号