使用 XPath 解析 HTML

  • 安装lxml库
pip install lxml
  • 具体使用方法
from lxml import etree
# 1. Load the source of a local HTML document into an etree object.
#    etree.parse() uses the XML parser unless a parser is passed, and HTML is
#    usually not well-formed XML, so an HTMLParser is supplied explicitly.
etree.parse('file_path', etree.HTMLParser())
# 2. Load HTML source fetched from the internet into an etree object.
etree.HTML(page_text)  # page_text is the HTML text of the response
from lxml import etree

# Sample HTML document that every query below runs against.
# NOTE(review): the <div> nested inside the second <p> is not valid HTML;
# how the parser restructures it is not shown here -- verify before relying
# on that nesting.
html_content = """
<html>
  <body>
    <div id="content">
      <p class="paragraph">Hello, World!</p>
      <p class="paragraph">This is a test.<div>xiaohei</div></p>
      <p class="paragraph1" title="xiaohei">This is another test.</p>
    </div>
  </body>
</html>
"""

# Build the element tree from the HTML string.
tree = etree.HTML(html_content)

# XPath expressions, evaluated and printed in order. Results as recorded by
# the original author's run:
#   //p                           -> all <p> elements (a list of 3 Element objects)
#   //p[@class="paragraph"]       -> the <p> elements whose class is "paragraph" (2)
#   //p/@class                    -> ['paragraph', 'paragraph', 'paragraph1']
#   //p[@title="xiaohei"]/@class  -> ['paragraph1']
#   //p/text()                    -> ['Hello, World!', 'This is a test.',
#                                     'This is another test.']
xpath_queries = (
    '//p',
    '//p[@class="paragraph"]',
    '//p/@class',
    '//p[@title="xiaohei"]/@class',
    '//p/text()',
)
for expression in xpath_queries:
    print(tree.xpath(expression))
# If //text() is used instead, all of the text under the tag is returned.

print('end')
posted @ 2025-03-11 23:27  CodeCraftsMan  阅读(37)  评论(0)    收藏  举报