# 【爬虫】网页数据的解析提取[pyquery]

from turtle import ht
from pyquery import PyQuery as pq

# Initialise from a string: PyQuery accepts raw HTML markup directly.
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
doc = pq(html)
li_nodes = doc('li')
print(li_nodes)

# Initialise from a URL (left disabled, so the script makes no network request).
# doc = pq(url='https://www.vilipix.com/')
# NOTE: with the line above disabled, `doc` is still the fragment parsed
# earlier, which has no <title> element, so this selection matches nothing.
title_nodes = doc('title')
print(title_nodes)

# Initialise from a file: not demonstrated here; look it up in the pyquery
# documentation.

# Basic CSS selectors: "#container .list li" picks every <li> nested inside
# an element with class "list" that itself sits inside the element whose id
# is "container".
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
doc = pq(html)
list_items = doc('#container .list li')
print(list_items)
print(type(list_items))

# items() yields each matched node separately so its text can be read
# one node at a time.
for entry in list_items.items():
    print(entry.text())

# ---------------------------------------------------------------------------
# Finding nodes
# ---------------------------------------------------------------------------

# Descendant nodes: find() is called on the ".list" selection with the
# selector 'li'.
doc = pq(html)
items = doc('.list')
print(type(items), items, sep='\n')
lis = items.find('li')
print(type(lis), lis, sep='\n')

# Child nodes: children() is called on the same ".list" selection.
lis = items.children()
print(type(lis), lis, sep='\n')

# Alternatively, pass a selector to keep only the children that match it.
lis = items.children('.active')
print(type(lis), lis, sep='\n')

# Parent node: parent() is called on the ".list" selection.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
doc = pq(html)
items = doc('.list')
container = items.parent()
print(type(container), container, sep='\n')

# Ancestor nodes: parents() is called on the same selection.
parents = items.parents()
print(type(parents), parents, sep='\n')

# Filtered ancestors: pass a selector to keep only the matching ones.
parents = items.parents('.wrap')
print(type(parents), parents, sep='\n')

# Sibling nodes: siblings() is called on the selected <li>.
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings())

# Filtered siblings: only those that also match the given selector.
print(li.siblings('.active'))

# Traversal, single node: the selection can be printed directly or converted
# to a string explicitly.
li = doc('.item-0.active')
print(li)
print(str(li))

# Traversal, multiple nodes: items() is iterated to handle the nodes one at a
# time.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li, type(li))

# ---------------------------------------------------------------------------
# Extracting information
# ---------------------------------------------------------------------------

# Reading an attribute with attr(name).
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
doc = pq(html)
active_link = doc('.item-0.active a')
print(active_link, type(active_link))
print(active_link.attr('href'))

# Reading the attribute from every <a> node: iterate with items() and call
# attr() on each node in turn.
for anchor in doc('a').items():
    print(anchor.attr('href'))

# Reading the text content.
print(active_link)
print(active_link.text())

# Reading the inner HTML of a node. Author's note: text() covers all matched
# nodes, whereas html() only returns the first one.
li = doc('.item-0.active')
print(li)
print(li.html())

# ---------------------------------------------------------------------------
# Manipulating nodes
# ---------------------------------------------------------------------------

# add_class / remove_class: toggle the "active" class on the selected <li>
# and print the node after every change.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)
# First strip the class, then put it back; the node is printed after each step.
for change_class in (li.remove_class, li.add_class):
    change_class('active')
    print(li)

# attr / text / html used as setters: called with a value, each method
# modifies the selected node instead of reading from it. The node is printed
# after every step so the effect of each call is visible.
html = '''
<ul class="list"><li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li></ul>
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)
# Set the attribute name="link" on the <li>.
li.attr('name', 'link')
print(li)
# Set the node's content to plain text.
li.text('changed item')
print(li)
# Set the node's content to an HTML fragment. The original passed
# '<span>changed item</span' (closing tag missing its '>'); the markup is
# now well-formed instead of relying on parser error recovery.
li.html('<span>changed item</span>')
print(li)

# remove(): print the text of ".wrap", remove the <p> found inside it, then
# print the text again to show the difference.
html = '''
<div class="wrap">
    Hello,world
    <p>this is a paragraph.</p>
</div>
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
paragraph = wrap.find('p')
paragraph.remove()
print(wrap.text())

# Pseudo-class selectors: each selector below is run against the same
# document and the matching <li> nodes are printed.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
pseudo_selectors = (
    'li:first-child',       # first child
    'li:last-child',        # last child
    'li:nth-child(2)',      # second child
    'li:gt(2)',             # index greater than 2 (pyquery/jQuery extension)
    'li:nth-child(2n)',     # even-positioned children
    'li:contains(second)',  # nodes whose text contains "second"
)
for selector in pseudo_selectors:
    li = doc(selector)
    print(li)

  

# posted @ 2022-04-24 22:45  帝皇の惊  阅读(45)  评论(0)    收藏  举报