Python 解析Html

XPath

常用匹配规则:

符号 | 描述

/ | 从当前节点,选取子节点

// | 从当前节点,选取子孙节点

. | 选取当前节点

.. | 选取当前节点的父节点

@ | 选取属性

 

 

 

 

 

 

 

 

 

 

 

属性获取:

from lxml import etree

# Attribute extraction: ending the XPath with /@href yields the href
# attribute values of the matched <a> elements instead of the elements.
markup = '<div><a class="du" href="http://www.baidu.com">百度</a></div>'
root = etree.HTML(markup)
hrefs = root.xpath('//a[@class="du"]/@href')
print(hrefs)
View Code

文本获取:

from lxml import etree

# Text extraction: the text() step selects the text nodes directly inside
# each matched <a>.
document = etree.HTML('<div><a class="du" href="http://www.baidu.com">百度</a></div>')
print(document.xpath('//a[@class="du"]/text()'))
View Code

属性多值匹配:

from lxml import etree

# Multi-valued attribute: class holds "du baidu", so an exact @class="du"
# comparison would not match; contains() tests for the substring instead.
markup = '<div><a class="du baidu" href="http://www.baidu.com">百度</a></div>'
tree = etree.HTML(markup)
texts = tree.xpath('//a[contains(@class,"du")]/text()')
print(texts)
View Code

多属性匹配:

from lxml import etree

# Several attribute conditions can be combined inside one predicate with
# the XPath "and" operator.
markup = '<div><a name="item" class="du baidu" href="http://www.baidu.com">百度</a></div>'
query = '//a[contains(@class,"du") and @name="item"]/text()'
print(etree.HTML(markup).xpath(query))
View Code

按序选择:

from lxml import etree

markup = """
        <li>item1</li>
        <li>item2</li>
        <li>item3</li>
        <li>item4</li>
        <li>item5</li>
"""
tree = etree.HTML(markup)

# Positional predicates, evaluated and printed in the listed order.
queries = (
    '//li[1]/text()',             # the first <li>
    '//li[last()]/text()',        # the last <li>
    '//li[position()<3]/text()',  # the first and second <li>
    '//li[last()-2]/text()',      # the third <li> counting from the end
)
for query in queries:
    print(tree.xpath(query))
View Code

更多用法:http://www.w3school.com.cn/xpath/xpath_functions.asp

 

Beautiful Soup

节点选择器:

from bs4 import BeautifulSoup

markup = """
        <div>
        <li class="d1">item1</li>
        <li class="d2">item2</li>
        <li class="d3">item3</li>
        <li class="d4">item4</li>
        <li class="d5">item5</li>
        </div>
"""
soup = BeautifulSoup(markup, 'lxml')

# .children is an iterator over the direct children of the first <div>;
# printing it shows the iterator object itself, not its contents.
kids = soup.div.children
print(kids)

# The whitespace between the tags is a child node too, so blank entries are
# printed in between the <li> texts.
for node in kids:
    print(node.string)
View Code

方法选择器:

# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# (the ``string`` argument was called ``text`` before Beautiful Soup 4.4.0).
from bs4 import BeautifulSoup
html = """
        <div>
        <li class="d1">item1</li>
        <li class="d2">item2</li>
        <li class="d3">item3</li>
        <li class="d4">item4</li>
        <li class="d5">item5</li>
        </div>
"""
soup = BeautifulSoup(html, 'lxml')
for div in soup.find_all(name="div"):
    # find() returns the first match or None, so a <div> without a matching
    # <li> is skipped instead of raising IndexError on find_all(...)[0].
    item = div.find(name="li", class_="d3")
    if item is not None:
        # get_text() gives the same value as .string here because the tag
        # contains a single text node.
        print(item.get_text())
View Code

Css选择器:

from bs4 import BeautifulSoup

markup = """
        <div>
        <li class="d1">item1</li>
        <li class="d2">item2</li>
        <li class="d3">item3</li>
        <li class="d4" name="d">item4</li>
        <li class="d5">item5</li>
        </div>
"""
soup = BeautifulSoup(markup, 'lxml')

# select() takes a CSS selector: <li> descendants of a <div> whose name
# attribute equals "d". Each hit is printed as its type, then its text.
for tag in soup.select('div li[name="d"]'):
    print(type(tag))
    print(tag.get_text())
View Code

 

Pyquery

初始化

字符串初始化:

from pyquery import PyQuery as pq

# Build a PyQuery document directly from an HTML string.
markup = "<a href='http://www.baidu.com'>百度一下</a>"
doc = pq(markup)
View Code

URL初始化:

from pyquery import PyQuery as pq

# Build a PyQuery document from a URL: the page is fetched over the network
# and the response is parsed.
page = pq(url="http://www.baidu.com")
print(page)
View Code

文件初始化:

from pyquery import PyQuery as pq

# Build a PyQuery document from a local file; "demo.html" is resolved
# relative to the current working directory.
doc = pq(filename="demo.html")
print(doc)
View Code

 

查找节点

Css选择器:

from pyquery import PyQuery as pq

markup = """      <div class="qrcode-text" id="1">
                我是div标签的文本
                <p class="title">我是标题<a href="http://www.baidu.com">百度一下</a></p>
                <p class="content">我是内容</p>
                </div>
        """

# Calling the document with a CSS selector returns the matching nodes:
# here the <a> inside .title inside .qrcode-text.
doc = pq(markup)
link = doc(".qrcode-text .title a")
print(link)
View Code

children()  查找子节点

find()      查找子孙节点

parent()        查找父节点

parents()   查找祖先节点

siblings()    查找兄弟节点

from pyquery import PyQuery as pq

markup = """      <body>
                <div class="qrcode-text" id="1">
                我是div标签的文本
                <p class="title">我是标题<a class="du" href="http://www.baidu.com">百度一下</a></p>
                <p class="content">我是内容
                <span class="first">第一行</span>
                </p>
                </div>
                </body>
        """

doc = pq(markup)

# Direct children of the .content paragraph.
print(doc(".content").children())
# Any <span> descendant of the document root.
print(doc.find("span"))
# Immediate parent of the <span>.
print(doc("span").parent())
# Ancestors of the <span>, filtered by the selector "#1".
print(doc("span").parents("#1"))
# Siblings of the .title paragraph.
print(doc(".title").siblings())
用法

 

获取信息

获取属性  attr()

内部文本  text()

html文本  html()

from pyquery import PyQuery as pq

markup = """      <body>
                <div class="item_1"><span>1.</span>第一行</div>
                <div class="item_2"><span>2.</span>第二行</div>
                <div class="item_3"><span>3.</span>第三行</div>
                </body>
        """

divs = pq(markup)("div")

# items() yields each matched node wrapped as its own PyQuery object, so the
# accessor methods can be called per node.
for node in divs.items():
    print(node.attr("class"))  # value of the class attribute
    print(node.text())         # text content
    print(node.html())         # inner HTML
用法

 

节点操作

对节点进行动态修改。

removeClass()

addClass()

from pyquery import PyQuery as pq

markup = """      <body>
                <div class="item_1"><span>1.</span>第一行</div>
                <div class="item_2"><span>2.</span>第二行</div>
                <div class="item_3"><span>3.</span>第三行</div>
                </body>
        """

divs = pq(markup)("div")

# Swap each <div>'s current class for its 1-based position, then print the
# modified node.
for index, node in enumerate(divs.items(), start=1):
    old_class = node.attr("class")
    node.removeClass(old_class)
    node.addClass(str(index))
    print(node)
View Code

attr()

text()

from pyquery import PyQuery as pq

markup = """      <body>
                <div class="item_1"><span>1.</span>第一行</div>
                <div class="item_2"><span>2.</span>第二行</div>
                <div class="item_3"><span>3.</span>第三行</div>
                </body>
        """

doc = pq(markup)
divs = doc("div")

# With arguments, attr() and text() act as setters: give every <div> an id
# equal to its 1-based position and replace its content with a fixed text.
for position, div in enumerate(divs.items(), start=1):
    div.attr(id=str(position))
    div.text('Hello World')
    print(div)
View Code

remove()

from pyquery import PyQuery as pq

markup = """      <body>
                Hello World!
                <div class="item_1"><span>1.</span>第一行</div>
                <div class="item_2"><span>2.</span>第二行</div>
                <div class="item_3"><span>3.</span>第三行</div>
                </body>
        """

body = pq(markup)("body")

# Remove the <div> elements from the selection, then print the text that
# is left in what remove() returned.
remaining = body.remove("div")
print(remaining.text())
View Code

更多用法:http://pyquery.readthedocs.io/en/latest/api.html

 

posted @ 2019-12-10 09:09  Sun先生  Views(855)  Comments(0)  Edit  收藏  举报