pyquery 的简单使用

pyquery 的初步了解(实例引入)

简单举例

from pyquery import PyQuery as pq

# Sample markup used to build a document straight from a string.
# The fifth <li> is deliberately left without a closing tag.
html = '''
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''

# Parse the string, then write the serialized document to stdout.
doc = pq(html)
print(str(doc))


# 输出:
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li></ul>
</div>

URL 初始化

from pyquery import PyQuery as pq
import requests

# doc1 and doc2 are two routes to the same document:
#   doc1 - PyQuery is handed the URL and loads the page itself;
#   doc2 - the page is downloaded with requests and its text is parsed.
doc1 = pq(url='https://www.cnblogs.com/liyihua/')
print(doc1('title'))

doc2 = pq(requests.get('https://www.cnblogs.com/liyihua/').text)
# Print from doc2 (not doc1) so the second construction route is
# actually the one being demonstrated.
print(doc2('title'))


# 输出:
<title>李亦华 - 博客园</title>&#13;
  
<title>李亦华 - 博客园</title>&#13;

文件初始化

from pyquery import PyQuery as pq

# Build the document from a local file rather than a string or a URL.
doc = pq(filename='test.html')

# Select every <li> node and print the selection.
li_nodes = doc('li')
print(li_nodes)


# 输出:
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</li>


# 文件内容:
<div>
<ul>
<li class="item-O"><a href="linkl.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>

pyquery 中的基本CSS选择器

实例切入:

from pyquery import PyQuery as pq

# Sample markup: a container <div> wrapping a list of five items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = pq(html)

# CSS selector: <li> nodes inside class "list" inside id "container".
matched = doc('#container .list li')
print(matched)

# Show the type of the object the selector call hands back.
print(type(matched))


# 输出:
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     
<class 'pyquery.pyquery.PyQuery'>

查找节点

获取子孙节点

说明:find()方法查找的是所有子孙节点,如果只查找子节点,可以使用children()方法。

from pyquery import PyQuery

# Sample markup: a container <div> wrapping a list of five items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# Start from the node with class "list" and print its type and markup.
items = doc('.list')
print(type(items))
print(items)

# find() searches below the current selection; print the type and the
# markup of the <li> nodes it returns.
descendants = items.find('li')
print(type(descendants))
print(descendants)
# 输出:
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>

<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

获取父节点

from pyquery import PyQuery

# Sample markup: a container <div> wrapping a list of five items.
# (Stray line-number residue that had leaked into the closing </div> and
# the closing triple quote has been removed so the literal matches the
# markup shown in the listed output.)
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# Select the node with class "list" and print it, followed by a blank line.
items = doc('.list')

print(items, '\n')

# parent() moves one level up from the selection; print the type of the
# result and then its markup.
print(
    type(items.parent()),
    items.parent(),
    sep='\n'
)
# 输出:
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 

<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>

兄弟节点

from pyquery import PyQuery

# Sample markup: a container <div> wrapping a list of five items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# Inside the node with class "list", pick the node that carries both the
# "item-0" and the "active" class.
items = doc('.list .item-0.active')

# All siblings of that node: print the result type, then the markup.
all_siblings = items.siblings()
print(type(all_siblings))
print(all_siblings)

# Only the siblings that also match the ".active" selector.
active_siblings = items.siblings('.active')
print("\n", active_siblings)
# 输出:
<class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0">first item</li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     

 <li class="item-1 active"><a href="link4.html">fourth item</a></li>

遍历节点

from pyquery import PyQuery

# Sample markup: a container <div> wrapping a list of five items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# items() is used to walk the selected <li> nodes one at a time
# (the original note describes its return value as a generator).
lis = doc('li').items()

# Print each node followed by the type of the per-node object.
for li in lis:
    print(li, type(li))
# 输出:
<li class="item-0">first item</li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1"><a href="link2.html">second item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
          <class 'pyquery.pyquery.PyQuery'>
<li class="item-0"><a href="link5.html">fifth item</a></li>
      <class 'pyquery.pyquery.PyQuery'>

获取信息

  1. attr()方法获取属性

    from pyquery import PyQuery
    
    # Sample markup: a container <div> wrapping a list of five items.
    html = '''
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
    '''

    doc = PyQuery(html)

    # The <a> inside the node that has both "item-0" and "active" classes.
    a = doc('.item-0.active a')

    # Print the node, its type, and the value of its href attribute.
    # (Per the original note, a.attr.href is an equivalent spelling.)
    print(a)
    print(type(a))
    print(a.attr('href'))
    
    # 输出:
    <a href="link3.html"><span class="bold">third item</span></a>
    <class 'pyquery.pyquery.PyQuery'>
    link3.html
    
  2. text()方法获取文本

    from pyquery import PyQuery
    
    # Sample markup: a container <div> wrapping a list of five items.
    html = '''
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
    '''

    doc = PyQuery(html)
    li = doc('li')

    # html(): inner markup; the listed output shows only the first node's.
    inner_markup = li.html()
    # text(): plain text; the listed output shows all nodes joined by spaces.
    plain_text = li.text()

    print(inner_markup)
    print(plain_text)
    print(type(plain_text))
    
    # 输出:
    first item
    first item second item third item fourth item fifth item
    <class 'str'>
    

节点操作

添加和移除class

add_class() 和 remove_class() ---- 添加class、移除class
from pyquery import PyQuery

# Sample markup: a container <div> wrapping a list of five items.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# The node that carries both the "item-0" and the "active" class.
li = doc('.item-0.active')
print(li)

# Drop the "active" class and print whatever the call returns.
without_active = li.remove_class('active')
print(without_active)

# Put the "active" class back and print whatever the call returns.
with_active = li.add_class('active')
print(with_active)
# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
         
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         

attr、text 和 html 方法

from pyquery import PyQuery

# Sample markup: a list holding a single item that wraps a link.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
     </ul>
</div>
'''

doc = PyQuery(html)

# Select the node with both classes and print it before any change.
# Each step below mutates the same node and prints it again, so the
# order of the statements matters.
li = doc('.item-0.active')
print(li)

li.attr('name', 'link')         # add attribute "name" with value "link"
print(li)

li.text('change item')          # replace the node's content with the text 'change item'
print(li)

li.html('<span>change item</span>')         # replace the node's content with the markup '<span>change item</span>'
print(li)


# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
   
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
     
<li class="item-0 active" name="link">change item</li>
    
<li class="item-0 active" name="link"><span>change item</span></li>
# 输出:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
   
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
     
<li class="item-0 active" name="link">change item</li>
    
<li class="item-0 active" name="link"><span>change item</span></li>

删除节点

from pyquery import PyQuery

# Sample markup: a <div> with its own text plus a nested <ul> with text.
html = '''
<div class="LeeHua">
LiYihua
<ul class="201802004731">liyihua</ul>
</div>
'''

doc = PyQuery(html)
Leehua = doc('.LeeHua')

# Text of the <div> while the nested <ul> is still present.
text_before = Leehua.text()
print("移除节点ul前的输出:\n" + text_before)

# Remove the nested <ul>, then read the text again.
nested_ul = Leehua.find('ul')
nested_ul.remove()

text_after = Leehua.text()
print("移除节点ul后的输出:\n" + text_after)
# 输出:
移除节点ul前的输出:
LiYihua
liyihua
移除节点ul后的输出:
LiYihua

伪选择器

示例:

from pyquery import PyQuery

# Sample markup: a list of five items nested inside two <div> wrappers.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            <li class="item-0">first item</li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
'''

doc = PyQuery(html)

# Pseudo-class selectors, evaluated and printed in this order.
selectors = (
    # <li> nodes that are the first child of their parent
    'li:first-child',
    # <li> nodes that are the last child of their parent
    'li:last-child',
    # <li> nodes that are the second child of their parent
    'li:nth-child(2)',
    # <li> nodes whose index among the matches is greater than 2
    # (for this markup the listed output is the fourth and fifth items)
    'li:gt(2)',
    # <li> nodes at even positions among their parent's children
    'li:nth-child(2n)',
    # <li> nodes whose text contains 'second'
    'li:contains(second)',
)

for selector in selectors:
    li = doc(selector)
    print(li)
# 输出:
<li class="item-0">first item</li>
            
<li class="item-0"><a href="link5.html">fifth item</a></li>
       
<li class="item-1"><a href="link2.html">second item</a></li>
            
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a></li>
        
<li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-1 active"><a href="link4.html">fourth item</a></li>
            
<li class="item-1"><a href="link2.html">second item</a></li>

CSS 选择器的用法:http://www.w3school.com.cn/cssref/css_selectors.asp

posted @ 2019-07-11 16:40  LeeHua  阅读(258)  评论(0)  编辑  收藏  举报