# Sample HTML fragment used as the parser input below: five <p> blocks that
# contain an inline <script>, <span> wrappers, a <br /> and an <img> tag.
comment ="""
<p id="i1">
我是中国人
</p>
<p >
<script>alert(123)</script>
</p>
<p id="i2">
<span>我是中国人</span>
</p>
<p>
<br />
</p>
<p id="i3">
<span>我是中国人</span><img src="/static/images/1.jpg" alt="" />
</p>
"""
#pip3 install beautifulsoup4
from bs4 import BeautifulSoup
# "html.parser" selects Python's built-in HTML parser; BeautifulSoup uses it
# to turn the markup into a tree of objects.
soup = BeautifulSoup(comment, features="html.parser")
# tag = soup.find(name="span")  # find the first matching tag
# print(tag)
# obj = soup.find(attrs={"id":"i2"})  # match by attribute; returns the first match
# print(obj)
# obj = soup.find(name="p",attrs={"id":"i2"})  # tag name AND attribute must both match
# print(obj)
# obj = soup.find_all(name="p")  # find every matching tag
# print(obj)
# Walk every tag; clear() empties the content of a non-whitelisted tag but keeps the tag itself
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.clear()
# print(soup)
# Walk every tag; decompose() removes a non-whitelisted tag entirely
# valid_tag = ["p","img","div"]
#
# tags = soup.find_all()
# for tag in tags:
#     if tag.name not in valid_tag:
#         tag.decompose()
# print(soup)
# soup is an object; decode() converts it to a string
# print(soup.decode())
# Whitelist of permitted tags, each mapped to the attributes it may keep.
# Tags that are not listed are removed together with their content;
# attributes that are not listed for a tag are stripped from that tag.
valid_tag = {
    "p": ["class", "id"],
    "img": ["src"],
    "div": ["class"],
}
tags = soup.find_all()
for tag in tags:
    # find_all() took its snapshot before any removal, so the list still
    # holds descendants of tags decomposed earlier in this loop. Those
    # objects are already destroyed and must not be touched again.
    if tag.decomposed:
        continue
    if tag.name not in valid_tag:
        tag.decompose()
        # A destroyed tag has no attributes left to filter, and its name is
        # not a key of valid_tag, so skip the attribute pass entirely.
        continue
    allowed_attrs = valid_tag[tag.name]
    # Iterate over a copy of the keys because entries are deleted in the loop.
    for k in list(tag.attrs):
        if k not in allowed_attrs:
            del tag.attrs[k]
# Serialize the sanitized tree back to an HTML string and print it.
content_str = soup.decode()
print(content_str)