from bs4 import BeautifulSoup
# Sample HTML document that every example in this file operates on.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
asdf
<div class="title">
<b>The Dormouse's story总共</b>
<h1>f</h1>
</div>
<div class="story">Once upon a time there were three little sisters; and their names were
<a class="sister0" id="link1">Els<span>f</span>ie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</div>
ad<br/>sf
<p class="story">...</p>
</body>
</html>
"""
# Build the parse tree; the second positional argument is `features`,
# which selects the lxml parser backend.
soup = BeautifulSoup(html_doc, 'lxml')
# Warm-up: basic lookups
# tag1 = soup.find('a')
# print(tag1)
# tag2 = soup.find_all('a')
# for tag in tag2:
# print(tag.text)
# 找到id=link2的标签
# tag3 = soup.select('#link2')
# print(tag3)
#
# tag4 = soup.find('', id='link2')
# print(tag4)
#
# tag5 = soup.select('.title')
# print(tag5, type(tag5[0]))
# 1 name
# tag = soup.find('a')
# print(tag)
# print(tag.name)
#
# tag.name = 'span'
# print(soup)
# 2 attr
# tag = soup.find('a')
# attrs = tag.attrs
# print(attrs)
# print('xxxxxx', tag.get('class'))
#
# tag.attrs = {'ik': 123}
# tag.attrs['id'] = 'iiiii'
# print(tag)
# 3 children 所有子标签
# body = soup.find('body')
# v = body.children
# child_list = []
# for i in v:
# print('分割线'.center(120, '#'))
# print(i)
# 4 descendants: all nested descendants (children, grandchildren, ...)
# body = soup.find('body')
# v = body.descendants
# for i in v:
# print('分割线'.center(120, '#'))
# print(i)
# 5 decompose: recursively remove the tag together with all of its contents
# body = soup.find('body')
# body.decompose()
# print(soup)
# 6 clear 将标签的所有子标签全部清空(保留标签名)
# body = soup.find('body')
# body.clear()
# print(soup)
# 7 extract: remove the tag (with its contents) from the tree and return the removed tag
# body = soup.find('body')
# v = body.extract()
# print(soup)
# print('xxxxxxx', v)
# 8 decode,转换为字符串(含当前标签);decode_contents(不含当前标签)
# body = soup.find('body')
# print('没转化之前', type(body), body)
# print("$$$$$$$$$$$$$$$$$")
# v = body.decode()
# v1 = body.decode_contents()
# print(v, type(v))
# print("$$$$$$$$$$$$$$$$$")
# print(v1, type(v1))
# 9. encode,转换为字节(含当前标签);encode_contents(不含当前标签)
# body = soup.find('body')
# v = body.encode()
# v1 = body.encode_contents()
# print(v)
# print('#'.center(120, '#'))
# print(v1)
# 10. find,获取匹配的第一个标签
# tag = soup.find('a')
# tag = soup.find('a', attrs={'class': 'sister'}, recursive=True, text='Lacie') # recursive 递归
# tag = soup.find('a', id='link2')
# print(tag)
# 11. find_all,获取匹配的所有标签
# tags = soup.find_all('a')
# tags = soup.find_all('a', limit=1)
# tags = soup.find_all('a', attrs={'class': 'sister'})
# tags = soup.find_all('a', attrs={'class': 'sister'}, text='Lacie')
# print(tags)
# 列表
# v = soup.find_all(name=['a', 'div'])
#
# v1 = soup.find_all(name='a')
# v2 = soup.find_all(name='div')
# # v = soup.find_all(href=rep)
# print(v)
#
# print("&".center(120, '#'))
# print(v1)
# print("&".center(120, '#'))
# print(v2)
# v = soup.find_all(name=['a', 'div']) # v1 = soup.find_all(name='a') + v2 = soup.find_all(name='div')
# v = soup.find_all(class_=['sister0', 'sister'])
# v = soup.find_all(text='Tillie')
# v = soup.find_all(id=['link1', 'link2'])
# v = soup.find_all(href=["http://example.com/lacie", "http://example.com/tillie"])
# print(v)
# 正则
import re
# rep = re.compile('p')
# rep = re.compile('^p')
# v = soup.find_all(name=rep)
# print(v)
# rep = re.compile('sister.*')
# v = soup.find_all(class_=rep)
# print(v)
# rep = re.compile('http://example.com.*')
#
# v = soup.find_all(href=rep)
# print(v)
# 方法筛选
# def func(tag):
# return tag.has_attr('class') and tag.has_attr('id')
#
#
# v = soup.find_all(name=func)
# print(v)
# get 获取属性
# tag = soup.find('a')
# v = tag.get('id')
# print(v)
# 12. has_attr,检查标签是否具有该属性
# tag = soup.find('a')
# v = tag.has_attr('id')
# print(v)
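# 13. get_text: return all the text inside the tag as a single string
# tag = soup.find('a')
# v = tag.get_text('id')  # 'id' is used as the separator between text pieces, not as an attribute name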
# print(tag)
# print(v)
# 14. index,检查标签在某标签中的索引位置
# tag = soup.find('body')
# v = tag.index(tag.find('p'))
# print(tag)
# print(v)
# tag = soup.find("body")
# for i, v in enumerate(tag):
# print(i, v)
# 15. is_empty_element: whether the tag is an empty (void / self-closing) element,
# i.e. one of the following tags: 'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'
# tag = soup.find('br')
# v = tag.is_empty_element
# print(tag)
# print(v)
# 16. 当前的关联标签
# div = soup.find('div')
# print(div)
# print(div.next)
# print(div.next_element)
# print(div.next_elements)
# print(div.next_sibling)
# print(div.next_siblings)
# tag = soup.find('a')
# print(tag)
# print(tag.previous)
# print(tag.previous_element)
# print(tag.previous_elements)
# print(tag.previous_sibling)
# print(tag.previous_siblings)
# print(tag.parent)
# print(tag.parents)
# 17. 查找某标签的关联标签 # 参数同find_all
# tag = soup.find('a')
# print(tag.parent)
# print(tag.find_next()) # 下一个, 内嵌
# print(tag.find_all_next())
# print(tag.find_next_sibling()) # 兄弟
# print(tag.find_next_siblings()) # 所有兄弟
# print(tag.find_previous()) # previous tag in document order (for the first <a> this happens to be its parent div)
# print(tag.find_all_previous())
# tag1 = soup.find_all('a')[1]
# # print(tag1)
# # print(tag1.find_previous_sibling()) # 前一个兄弟
# # print(tag1.find_previous_siblings()) # 前面的兄弟们
# print(tag.find_parent()) # tag.parent
# print(tag.find_parents()) # tag.parents
# 18. select,select_one, CSS选择器
# print(soup.select('title'))
# print(soup.select('p:nth-of-type(3)'))
# print(soup.select('body a')) # soup.find_all('a')
# soup.select("html head title")
# tag = soup.select("div,a")
# tag = soup.select("head > title") # 注意空格
# tag = soup.select("div > a") # 注意空格
# tag = soup.select("p > a:nth-of-type(2)")
# tag = soup.select("p > #link1")
# tag = soup.select("body > a")
# tag = soup.select("#link1 ~ .sister") # 同级往下所有
# tag = soup.select("#link1 + .sister") # 同级往下一个
# tag = soup.select(".sister") # class
# tag = soup.select("[class~=sister]") # 属性
# tag = soup.select("#link1") # id
# tag = soup.select("a#link2") # a标签里的id=link2
# tag = soup.select('a[href]') # 属性
# tag = soup.select('a[href="http://example.com/lacie"]') # 完全匹配
# tag = soup.select('a[href^="http://example.com/"]') # 开头匹配
# tag = soup.select('a[href$="tillie"]') # 结尾匹配
# tag = soup.select('a[href*=".com/"]') # 随意包含
# print(tag)
from bs4.element import Tag
# def default_condition_generator(tag):
# """找出含有href的标签"""
# for child in tag.descendants:
# if not isinstance(child, Tag):
# continue
# if not child.has_attr('href'):
# continue
# yield child
# tags = soup.find('body').select('a', _candidate_generator=default_condition_generator)
# tags = soup.find('body').select('a', _candidate_generator=default_condition_generator, limit=1)
# print(type(tags), tags)
# 19. 标签的内容
# tag = soup.find('span')
# print(tag.string) # 获取
# tag.string = 'hello world' # 设置
# print(soup)
# tag = soup.find('body')
# print(tag.string)
# tag.string = 'xxx'
# print(soup)
# tag = soup.find('body')
# v = tag.stripped_strings # 递归内部获取所有标签的文本
# for i in v:
# print(i)
# tag = soup.find('body')
# print(tag.text)
# 20 append在当前标签【内部追加】一个标签
# tag = soup.find('body')
# tag.append(soup.find('a')) # <a class="sister0" id="link1">Els<span>f</span>ie</a></body>
# print(soup)
# from bs4.element import Tag
# obj = Tag(name='i',attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.append(obj)
# print(soup)
# 21.insert在当前标签内部指定位置插入一个标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert(2, obj) # 在索引为2的位置插入
# print(soup)
# 22. insert_after,insert_before 在当前标签后面或前面插入
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('body')
# tag.insert_before(obj)
# # tag.insert_after(obj)
# print(soup)
# 23. replace_with 在当前标签替换为指定标签
# from bs4.element import Tag
# obj = Tag(name='i', attrs={'id': 'it'})
# obj.string = '我是一个新来的'
# tag = soup.find('div')
# tag.replace_with(obj)
# print(soup)
# 24. 创建标签之间的关系
# tag = soup.find('div')
# a = soup.find('a')
# tag.setup(previous_sibling=a)
# print(tag.previous_sibling)
# 25. wrap,用指定标签把当前标签包裹起来
# from bs4.element import Tag
# obj1 = Tag(name='div', attrs={'id': 'it'})
# obj1.string = '我是一个新来的'
#
# tag = soup.find('a')
# v = tag.wrap(obj1)
# print(soup)
# tag = soup.find('a')
# v = tag.wrap(soup.find('p'))
# print(soup)
# 26. unwrap: strip the tag itself while leaving its children in the tree
tag = soup.a  # attribute access is shorthand for soup.find('a'): the first <a>
# unwrap() returns the tag that was removed (now empty); the children it
# used to wrap remain in `soup` at the position the tag occupied.
v = tag.unwrap()
print(v, soup, sep='\n')