1. 解析库
soup = BeautifulSoup(html,'html.parser')#Python标准库
BeautifulSoup(html,'lxml')#lxml HTML解析器
BeautifulSoup(html,'xml')#lxml xml解析库
BeautifulSoup(html,'html5lib')#html5lib
2. 导包
from bs4 import BeautifulSoup
3. 获取所有div标签
divs = soup.find_all('div')
4. 获取指定div标签
divs = soup.find_all('div')[1]
5. 获取指定属性的标签
divs = soup.find_all('div',id='even')#方法一
divs = soup.find_all('div',attrs={'id':'even'})#方法二
6. 获取多个指定属性标签
span = soup.find_all('span',class_='position',width='350')#class是Python关键字,因此参数名后加下划线(class_)以示区别
soup.find_all('span',attrs = {'class':'position','width':'350'})
7. 获取标签的属性值
alist = soup.find_all('a')
#方法一:通过下标方式提取
for a in alist:
    href = a['href']
#方法二:通过attrs属性提取
for a in alist:
    href = a.attrs['href']
8. 获取标签内容
a = div.find_all('a')[0]
position = a.string#仅当标签只有一个子节点时返回其文本;含多个子节点时返回None,可改用a.get_text()
9. 消除无用信息(去除文本首尾空白并忽略纯空白字符串)
infos = list(div.stripped_strings)