Python爬虫-3 解析库
1.XPath
2.BeautifulSoup
import requests
from bs4 import BeautifulSoup
# Browser-like request headers sent with the page fetch.
headers = {
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Connection': 'keep-alive',
    'content-type': 'application/json',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
}
url = 'https://www.bilibili.com/'
# TLS certificate verification is left at the requests default (enabled), and
# the timeout bounds the wait so an unresponsive server cannot hang the script.
r = requests.get(url=url, headers=headers, timeout=10)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'lxml')
# First <meta> tag in the document.
print(soup.meta)
# Attribute dict of that first <meta> tag.
print(soup.meta.attrs)
# Pretty-printed markup of the whole document:
# print(soup.prettify())
# Attribute dict of each child of <head> that is a <meta> node.
for i, c in enumerate(soup.head.children):
    if c.name == 'meta':
        print(i, c.attrs)
2.1 获取父节点、兄弟节点
# Work from the first <a> tag in the document.
first_anchor = soup.a
# Its direct parent node.
first_anchor.parent
# All of its ancestor nodes.
first_anchor.parents
# The sibling node immediately before it.
first_anchor.previous_sibling
# The sibling node immediately after it.
first_anchor.next_sibling
# All sibling nodes before it.
first_anchor.previous_siblings
# All sibling nodes after it.
first_anchor.next_siblings
2.2 选择器
更加灵活的选择方式:find_all() 返回所有匹配的节点,find() 只返回第一个匹配的节点。
# Attribute dict of every <meta> tag found in the document.
for i, c in enumerate(soup.find_all(name='meta')):
    print(i, c.attrs)
# Same search, restricted to <meta> tags whose name attribute is
# '360-site-verification'.
for i, c in enumerate(soup.find_all(name='meta', attrs={'name': '360-site-verification'})):
    print(i, c.attrs)
按class属性查询时,由于class是Python的关键字,需要改用class_参数,例如 soup.find_all(class_='title')。
此外还有 find_parents()、find_parent()、
find_next_siblings()、find_next_sibling()、
find_previous_siblings()、find_previous_sibling(),用法与 find_all()、find() 类似。
CSS选择器:
# CSS selector lookups: every matching <meta> tag, then only the first one.
all_meta_tags = soup.select('meta')
print(all_meta_tags)
first_meta_tag = soup.select_one('meta')
print(first_meta_tag)
3.pyquery