常见的爬虫分析库(4)-爬虫之PyQuery
官方文档:http://pyquery.readthedocs.io/
安装
|
1
|
pip install pyquery |
初始化
字符串初始化
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
html = '''<div> <ul> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)print(doc('li')) |
View CodeURL初始化
|
1
2
3
|
from pyquery import PyQuery as pqdoc = pq(url='http://www.baidu.com')print(doc('head')) |
View Code文件初始化
|
1
2
3
|
from pyquery import PyQuery as pqdoc = pq(filename='demo.html')print(doc('li')) |
View Code基本CSS选择器
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
|
html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)print(doc('#container .list li')) |
View Code查找元素
子元素
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)items = doc('.list')print(type(items))print(items)lis = items.find('li')print(type(lis))print(lis) |
View Code|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)items = doc('.list')lis = items.children()print(type(lis))print(lis) |
View Code|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)items = doc('.list')lis = items.children('.active')print(lis) |
View Code父元素
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
html = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div>'''from pyquery import PyQuery as pqdoc = pq(html)items = doc('.list')container = items.parent()print(type(container))print(container) |
View Code|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)items = doc('.list')parents = items.parents()print(type(parents))print(parents) |
View Code|
1
2
|
parent = items.parents('.wrap')print(parent) |
View Code兄弟元素
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.list .item-0.active')print(li.siblings()) |
View Code|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.list .item-0.active')print(li.siblings('.active')) |
View Code遍历
单个元素
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.item-0.active')print(li) |
View Code|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)lis = doc('li').items()print(type(lis))for li in lis: print(li) |
View Code获取信息
获取属性
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)a = doc('.item-0.active a')print(a)print(a.attr('href'))print(a.attr.href) |
View Code获取文本
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)a = doc('.item-0.active a')print(a)print(a.text()) |
View Code获取HTML
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.item-0.active')print(li)print(li.html()) |
View CodeDOM操作
addClass、removeClass
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.item-0.active')print(li)li.removeClass('active')print(li)li.addClass('active')print(li) |
View Codeattr、css
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('.item-0.active')print(li)li.attr('name', 'link')print(li)li.css('font-size', '14px')print(li) |
View Coderemove
|
1
2
3
4
5
6
7
8
9
10
11
12
|
html = '''<div class="wrap"> Hello, World <p>This is a paragraph.</p> </div>'''from pyquery import PyQuery as pqdoc = pq(html)wrap = doc('.wrap')print(wrap.text())wrap.find('p').remove()print(wrap.text()) |
View Code其他DOM方法 http://pyquery.readthedocs.io/en/latest/api.html
伪类选择器
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
html = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> </ul> </div> </div>'''from pyquery import PyQuery as pqdoc = pq(html)li = doc('li:first-child')print(li)li = doc('li:last-child')print(li)li = doc('li:nth-child(2)')print(li)li = doc('li:gt(2)')print(li)li = doc('li:nth-child(2n)')print(li)li = doc('li:contains(second)')print(li) |

浙公网安备 33010602011771号