xpath

解析原理

  • 实例化一个etree对象,并将即将被解析的页面源码数据加载到该对象中
  • 使用etree对象中的xpath方法结合着xpath表达式进行标签定位和数据提取

实例化etree对象

  • etree.parse('本地文件路径')
  • etree.HTML(page_text)

相关方法

from lxml import etree
# NOTE: etree.parse() uses lxml's XML parser unless a parser is passed
# explicitly, so ./test.html has to be well-formed markup for this to load.
tree = etree.parse('./test.html')

# Locate the title tag: absolute path, '//' for any depth, or from anywhere
tree.xpath('/html/head/title')
tree.xpath('/html//title')
tree.xpath('//title')

# Locate the div tag whose class attribute is 'song'.
# The XPath string literal uses double quotes so it can sit inside the
# single-quoted Python string.
tree.xpath('//div[@class="song"]')
# Locate a div by position
tree.xpath('//div[2]')  # XPath indexes start at 1

# Get text -- /text(): direct text children only; //text(): all descendant text
tree.xpath('//div[@class="song"]/p[1]/text()')[0]
tree.xpath('//div[@class="song"]//text()')

# Get an attribute value
tree.xpath('//a/@href')

 

爬取Boss直聘信息示例

# Boss Zhipin: print job title, salary and company for each search result
import requests
from lxml import etree

url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
}

# Fetch the page source and load it into an etree object
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# One <li> per job posting
li_list = tree.xpath('//div[@class="job-list"]/ul/li')
for li in li_list:
    # The leading '.' makes each expression relative to the current <li>
    title = li.xpath('.//div[@class="job-title"]/text()')[0]
    salary = li.xpath('.//span[@class="red"]/text()')[0]
    company = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    print(title+salary+company)

 

乱码问题

遇到乱码问题的解决办法:

import requests
import os
from urllib import request
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
start_page = int(input('start_page num'))
end_page = int(input('end_page num'))
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'

# Create the output directory; no error if it already exists
os.makedirs('./girls', exist_ok=True)

# NOTE: range() excludes end_page, so pages start_page .. end_page - 1 are fetched
for page in range(start_page, end_page):
    # The first page has no index_N suffix
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is `params`, which would put the dict in the query string
    response = requests.get(new_url, headers=headers)
    # Garbled-text fix, option 1: set response.encoding to the page's real
    # charset here, before response.text is read.
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name = li.xpath('./a/img/@alt')[0]
        # Garbled-text fix, option 2: undo the wrong decode on this one string.
        # Assumes requests decoded the body as ISO-8859-1 while the page is
        # actually GBK -- confirm with response.encoding.
        img_name = img_name.encode('iso-8859-1').decode('gbk')+'.jpg'
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_path = './girls/'+img_name
        request.urlretrieve(img_src, img_path)
        print(img_name, '下载成功!!!')

 

附:xpath运算符 | 的使用

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
# headers must be passed by keyword: the second positional argument of
# requests.get() is `params`, not `headers`
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# '|' is the XPath union operator: the result holds the text nodes matched by
# either of the two paths
tree.xpath(
    '//div[@class="bottom"]/ul/div[2]/li/a/text()'
    ' | //div[@class="bottom"]/ul/li/a/text()'
)

例://book | //cd:返回由所有 book 元素和所有 cd 元素组成的节点集(| 为并集运算符)

 

posted @ 2019-10-02 20:18  tianqibucuo  阅读(159)  评论(0)    收藏  举报