爬虫之pyquery库

官方文档:https://pyquery.readthedocs.io/en/latest/

PyQuery是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦、BeautifulSoup语法太难记,而你又熟悉jQuery的语法,那么PyQuery就是你的绝佳选择。

一、开始

字符串初始化:

from pyquery import PyQuery as pq
d = pq("<html>哈哈哈</html>")  # 现在d就相当于jQuery的$
print(d("html"))

URL初始化:

from pyquery import PyQuery as pq
d = pq(url="https://www.baidu.com")
print(d("head"))

文件初始化:

from pyquery import PyQuery as pq
d = pq(filename='demo.html')  # filename指定文件路径
print(d("head"))

二、基本CSS选择器

html = """
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
"""
from pyquery import PyQuery as pq
d = pq(html)
print(d("#container .list li"))
View Code

三、查找元素

子元素

d("css选择器").find("li")  # find()会查找所有后代元素;若只需要直接子元素,可使用children()
html = """
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
"""
from pyquery import PyQuery as pq
d = pq(html)
items = d(".list")
print(type(items))  # <class 'pyquery.pyquery.PyQuery'>
li = items.find("li")
print(type(li))    # <class 'pyquery.pyquery.PyQuery'>
print(li)
"""
 <li class="item-0">first item</li>
 <li class="item-1"><a href="link2.html">second item</a></li>
 <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
 <li class="item-1 active"><a href="link4.html">fourth item</a></li>
 <li class="item-0"><a href="link5.html">fifth item</a></li>
"""
View Code

父元素

d("css选择器").parent(<css选择器(可无)>) 获取直接父元素;d("css选择器").parents(<css选择器(可无)>) 获取所有祖先元素(下面的示例使用的是parents())
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
items = d(".list")
parents = items.parents()
print(parents)
"""
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
"""
d(".list").parents()
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
items = d(".list")
parents = items.parents(".wrap")
print(parents)
"""
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
"""
d(".list").parents(".wrap")

兄弟元素

d("css选择器").siblings(<css选择器(可无)>)
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d(".list .item-0.active")
print(li.siblings())
"""
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
"""
View Code
html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d(".list .item-0.active")
print(li.siblings(".active"))
"""
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
"""
View Code

四、遍历

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d("li").items()
print(type(li))  # <class 'generator'>
for i in li:
    print(i)
"""
<li class="item-0">first item</li>     
<li class="item-1"><a href="link2.html">second item</a></li>           
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>            
<li class="item-1 active"><a href="link4.html">fourth item</a></li>            
<li class="item-0"><a href="link5.html">fifth item</a></li>
"""
View Code

五、获取信息

获取属性

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
a = d(".item-0.active a")
print(a.attr("href"))
print(a.attr.href)
View Code

获取文本

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
a = d(".item-0.active a")
print(a.text())
"""
third item
"""
View Code

获取html

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d(".item-0.active")
print(li)
print(li.html())
"""
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>        
<a href="link3.html"><span class="bold">third item</span></a>
"""
View Code

六、DOM操作

addClass()、removeClass()

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d(".item-0.active")
print(li)
li.removeClass("active")
print(li)
li.addClass("active")
print(li)
"""
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>            
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>            
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
"""
View Code

attr()、css()

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d(".item-0.active")
print(li)
li.attr("name", "link")
print(li)
li.css("font-size", "14px")
print(li)
"""
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>        
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>       
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
"""
View Code

remove()

html = """
<div class="wrap">
    Hello, World.
    <p>This is a paragraph.</p>
 </div>
"""

from pyquery import PyQuery as pq
d = pq(html)
wrap = d(".wrap")
print(wrap.text())
"""
Hello, World.
This is a paragraph.
"""
wrap.find("p").remove()
print(wrap.text())  # Hello, World.
View Code

其他DOM方法 

https://pyquery.readthedocs.io/en/latest/api.html

七、伪类选择器

html = """
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
        </ul>
    </div>
</div>
"""
from pyquery import PyQuery as pq
d = pq(html)
li = d("li:first-child")
print(li)  # <li class="item-0">first item</li>
li = d("li:last-child")
print(li)  # <li class="item-0"><a href="link5.html">fifth item</a></li>
li = d("li:nth-child(2)")
print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>
li = d("li:gt(2)")  # 从0开始计数,索引大于2
print(li)
"""
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
 <li class="item-0"><a href="link5.html">fifth item</a></li>
"""
li = d("li:nth-child(2n)")  # 获取偶数顺序的元素(从1开始)
print(li)
"""
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
"""
li = d("li:contains(second)")  # 根据文本匹配,匹配文本包含second的标签
print(li)  # <li class="item-1"><a href="link2.html">second item</a></li>
View Code

更多选择器:http://www.w3school.com.cn/cssref/css_selectors.asp

 

posted @ 2019-04-05 07:53  就俗人一个  阅读(244)  评论(0编辑  收藏  举报