Python开发之爬虫模块介绍(二)

BeautifulSoup模块

BeautifulSoup 是一个既灵活又方便的网页解析库,处理高效,支持多种解析器。利用它不用编写正则表达式即可方便地实现网页信息的提取。

from bs4 import BeautifulSoup

# Sample document parsed below to show two basic operations:
# pretty-printing the parsed tree and reading the <title> text.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <div class="author-name">我的Blog</div>
    <div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')

# Formatted rendering of the parsed document.
pretty_markup = soup.prettify()
print(pretty_markup)

# Text content of the <title> tag.
title_text = soup.title.string
print(title_text)

1、标签选择器

  • 选择元素
from bs4 import BeautifulSoup

# Tag selector: tags are reached by name as attributes of the soup
# (soup.title, soup.head, soup.a).
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <div class="author-name">我的Blog</div>
    <div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')

title_tag = soup.title
print(title_tag)  # <title>Blog示例</title>
print(type(title_tag))  # <class 'bs4.element.Tag'>

head_tag = soup.head
print(head_tag)  # <head><title>Blog示例</title></head>

anchor_tag = soup.a
print(anchor_tag)
# Output of the print above, kept as a no-op string literal:
'''
<a class="logo" href="#">
<img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
</a>
'''
  • 获取名称
from bs4 import BeautifulSoup

# Read the name of a tag through its .name attribute.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <div class="author-name">我的Blog</div>
    <div class="info">这个人很懒,什么都没有留下。</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
title_tag = soup.title
print(title_tag.name)  # title
  • 获取属性
from bs4 import BeautifulSoup

# Two equivalent ways to read an attribute of a tag:
# through the .attrs mapping, or by indexing the tag directly.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'></p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
paragraph = soup.p
print(paragraph.attrs['name'])  # dromouse
print(paragraph['name'])  # dromouse
  • 获取内容
from bs4 import BeautifulSoup

# Read the text inside a tag through its .string attribute.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
paragraph_text = soup.p.string
print(paragraph_text)  # 人生苦短
  • 嵌套
from bs4 import BeautifulSoup

# Tag access can be chained: soup.head yields a tag, whose .title is
# again a tag.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
head_tag = soup.head
print(head_tag.title.string)  # Blog示例
  • 子节点和子孙节点
from bs4 import BeautifulSoup

# .contents exposes the child nodes of a tag as a list.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
body_nodes = soup.body.contents  # a list
print(body_nodes)
# Output of the print above, kept as a no-op string literal:
'''
['\n', <a class="logo" href="#">
<img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
</a>, '\n', <p name="dromouse">人生苦短</p>, '\n']
'''
from bs4 import BeautifulSoup

# .children is printed as an object first, then iterated to show each
# child node of <body> together with its position.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.body.children)
for index, child in enumerate(soup.body.children):
    print(index, child)
from bs4 import BeautifulSoup

# .descendants is printed as an object first, then iterated to show
# each node below <body> together with its position.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
print(soup.body.descendants)
for index, node in enumerate(soup.body.descendants):
    print(index, node)
  • 父节点和祖先节点
from bs4 import BeautifulSoup

# .parent gives the tag that directly encloses a node; here, the <a>
# around the <img>.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
image_parent = soup.img.parent
print(image_parent)
# Output of the print above, kept as a no-op string literal:
'''
<a class="logo" href="#">
<img alt="" src="https://q1mi.github.io/Blog/asset/img/head_img.jpg"/>
</a>
'''
from bs4 import BeautifulSoup

# .parents iterates over the ancestors of a node; list() collects them
# so they can be printed together.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
ancestors = list(soup.img.parents)  # ancestor nodes
print(ancestors)
  • 兄弟节点
from bs4 import BeautifulSoup

# Sibling nodes of the <a> tag: those after it, then those before it.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
anchor_tag = soup.a
following = list(anchor_tag.next_siblings)
print(following)
preceding = list(anchor_tag.previous_siblings)
print(preceding)

2、标准选择器

find_all(name, attrs, recursive, string, limit, **kwargs)(其中 string 参数在 Beautiful Soup 4.4.0 之前的版本中名为 text)

可根据标签名,属性,内容查找文档

  • name
from bs4 import BeautifulSoup

# find_all() with a tag name collects every matching tag.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
paragraphs = soup.find_all('p')  # behaves like a list
print(paragraphs)
print(paragraphs[0])
from bs4 import BeautifulSoup

# find_all() can be called again on each tag it returns, to search
# inside that tag only.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
anchors = soup.find_all('a')  # behaves like a list
print(anchors)
print(type(anchors))
for anchor in anchors:
    print(anchor.find_all('img'))
  • attrs
from bs4 import BeautifulSoup

# find_all() with attrs= filters tags by attribute values.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')

logo_matches = soup.find_all(attrs={"class": 'logo'})
print(logo_matches)

named_matches = soup.find_all(attrs={"name": 'dromouse'})
print(named_matches)
  • text(Beautiful Soup 4.4.0 起该参数更名为 string,推荐使用 string)
from bs4 import BeautifulSoup

# Search the document by text content instead of by tag.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')

# `string=` is the current name of this filter (Beautiful Soup 4.4.0+);
# the older spelling `text=` is the legacy name for the same argument.
# NOTE: on Beautiful Soup older than 4.4.0, use `text=` instead.
# A plain str filter matches text nodes equal to it, so per the bs4 docs
# '人生苦短1' should not be included in the result.
matching_strings = soup.find_all(string='人生苦短')
print(matching_strings)

find 的用法和 find_all 完全一样,区别在于:find 只返回第一个匹配的元素,而 find_all 返回所有匹配的元素。

3、CSS选择器

通过select()直接传入CSS选择器即可完成选择

from bs4 import BeautifulSoup

# select() takes a CSS selector; a class selector and a descendant
# selector are shown.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
for css_selector in ('.logo', 'body p'):
    print(soup.select(css_selector))

获取属性

from bs4 import BeautifulSoup

# Read an attribute from the first element matched by a CSS selector.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
first_image = soup.select('body img')[0]
print(first_image.attrs['src'])

获取内容

from bs4 import BeautifulSoup

# Read the text of the first element matched by a CSS selector.
html = '''
<html lang="zh-CN">
<head><title>Blog示例</title></head>
<body>
    <a href="#" class="logo">
        <img src="https://q1mi.github.io/Blog/asset/img/head_img.jpg" alt="">
    </a>    
    <p name='dromouse'>人生苦短</p>
    <p name='dromouse1'>人生苦短1</p>
</body>
</html>
'''

soup = BeautifulSoup(html, 'lxml')
first_paragraph = soup.select('body p')[0]
print(first_paragraph.get_text())

总结:

  • 推荐使用lxml解析库,必要时使用html.parser
  • 标签选择器的筛选功能较弱,但是速度快
  • 建议使用find、find_all查询匹配单个结果或者多个结果
  • 如果对CSS选择器比较熟建议使用select()
  • 最后记住常用的获取属性和文本的方法。

PyQuery解析库

PyQuery 是一个强大又灵活的网页解析库。如果你觉得正则写起来太麻烦,觉得 BeautifulSoup 的语法太难,又恰好熟悉 jQuery 的语法,那么 PyQuery 就是你的绝佳选择。

1、PyQuery初始化

  • 初始化字符串
from pyquery import PyQuery as pq

# Build a PyQuery document from an HTML string and select elements by
# tag name.
html='''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)  # PyQuery object
list_items = doc('li')  # element selection
print(list_items)
  • URL初始化
from pyquery import PyQuery as pq

# Build the document from a URL instead of a string, then select the
# <title> element of the result.
doc = pq(url='http://www.baidu.com')
page_title = doc('title')
print(page_title)
  • 文件初始化
from pyquery import PyQuery as pq

# Build the document from a file given by name, then select its <li>
# elements.
doc = pq(filename='pyquery.html')
list_items = doc('li')
print(list_items)

2、基本CSS选择器

from pyquery import PyQuery as pq

# Combine an id selector, a class selector and a tag selector in one
# CSS query.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
matched_items = doc('#container .list li')
print(matched_items)

3、查找元素

  • 子元素
# Child lookup: starting from the .list element, look up elements
# below it with find() and children().
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
# The commented-out prints below are optional inspection steps;
# uncomment them to see each intermediate result.
# print(type(items))
# print(items)
# Per the pyquery API, find() searches all descendants of the selection.
li = items.find('li')
# print(li)
# Per the pyquery API, children() returns only the direct children.
lis = items.children()
# print(lis)
# children() also accepts a selector to filter the children.
print(items.children('.active'))
  • 父元素
from pyquery import PyQuery as pq

# Parent lookup: parent() moves from the .list element to the element
# that encloses it.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
container = doc('.list').parent()
print(container)
  • 兄弟节点
from pyquery import PyQuery as pq

# Sibling lookup: select the element carrying both the item-0 and
# active classes, then list its siblings.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
active_item = doc('.list .item-0.active')
neighbours = active_item.siblings()
print(neighbours)

4、遍历

  • 单个元素
from pyquery import PyQuery as pq

# A selection can be printed directly; here the selector combines the
# item-0 and active classes.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
active_item = doc('.item-0.active')
print(active_item)
# Iterating over a multi-element selection with items().
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
li= doc('li').items() # items() is the way to iterate over a selection
print(li) # <generator object PyQuery.items at 0x0305F2D0> -- a generator
# next() consumes the first item from the generator...
print(next(li))
# ...so this loop continues with the remaining items, starting at the
# second <li>.
for i in li:
    print(i)

5、获取信息

  • 获取属性
from pyquery import PyQuery as pq

# Read an attribute from a selection, in call form and in attribute
# form.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
link = doc('.item-0.active a')
print(link)
print(link.attr('href'))  # call form
print(link.attr.href)  # attribute form
  • 获取文本
from pyquery import PyQuery as pq

# Print the selected link, then the result of its text() method.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
link = doc('.item-0.active a')
print(link)
link_text = link.text()
print(link_text)
  • 获取HTML
from pyquery import PyQuery as pq

# Print the selected link, then the result of its html() method.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
link = doc('.item-0.active a')
print(link)
link_markup = link.html()
print(link_markup)

6、DOM操作

  • addClass、removeClass
from pyquery import PyQuery as pq

# Remove a class from the selected element, print it, then add the
# class back and print it again.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
target = doc('.item-0.active')

target.removeClass('active')
print(target)

target.addClass('active')
print(target)
  • attr 、css
from pyquery import PyQuery as pq

# Set an attribute on the selected element, print it, then set a CSS
# property and print it again.
html='''
<div id='container'>
    <ul class='list'>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

doc = pq(html)
target = doc('.item-0.active')

target.attr('name', 'link')
print(target)

target.css('color', 'red')
print(target)
  • remove
from pyquery import PyQuery as pq

# Print the wrapper's text, remove the nested <p>, then print the text
# again to show the difference.
html='''
<div class='wrap'>
    Hello,world
    <p>This is a paragraph</p>
</div>
'''

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())

paragraph = wrap.find('p')
paragraph.remove()
print(wrap.text())

  

posted on 2015-12-17 13:00  Mr.Hui  阅读(189)  评论(0)  编辑  收藏  举报

导航