1 from bs4 import BeautifulSoup
2
3
4
5
# Sample HTML document that every example in this script parses and queries.
html = """
<html><head><title>This is a python demo page</title></head>
<body>
<p class="title"><a>The demo python introduces several python courses.</a></p>
<p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
<a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1"><b class="element">Basic Python</b></a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
</body></html>
"""

# Parse the sample document once; 'lxml' selects the lxml-based HTML parser,
# so the lxml package must be installed for this line to succeed.
soup = BeautifulSoup(markup=html, features='lxml')
# Basic usage
# print(soup.prettify())
# print(soup.title.string)
19
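# Tag selectors
# Selecting elements
# print(soup.title)
# print(type(soup.title))
# print(soup.head)
# print(soup.p)  # returns only the first matching Tag
#
# # Getting the tag name
#
# print(soup.title.name)
#
# # Getting attributes
# # NOTE: the first <a> in the sample HTML has no href attribute, so the two
# # lines below raise KeyError as written.
# print(soup.a.attrs['href'])
# print(soup.a['href'])
#
# # Getting the content
# print(soup.p.string)
#
# # Nested selection
# print(soup.head.title.string)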
40
# Children and descendants
# print(soup.body.contents)  # direct children, returned as a list
# print(soup.body.children)  # direct children, returned as an iterator
# for i, child in enumerate(soup.body.children):
#     print(i, child)

# print(soup.body.descendants)  # all descendants, returned as an iterable
# for i, child in enumerate(soup.body.descendants):
#     print(i, child)
50
# Parent and ancestors
# print(soup.a.parent)
#
# print(list(enumerate(soup.a.parents)))

# Siblings
# print(list(enumerate(soup.a.next_siblings)))
# print(list(enumerate(soup.a.previous_siblings)))
59
# Standard selectors
# find_all(name, attrs, recursive, text, **kwargs) searches the document by tag name, attributes, or text content
# name
# print(soup.find_all('p'))
# print(type(soup.find_all('p')[0]))
# for i in soup.find_all('p'):
#     print(i.find_all('a'))  # nested selection

# # attrs
# print(soup.find_all(attrs={'href':"http://www.icourse163.org/course/BIT-268001"}))
# print(soup.find_all(attrs={'id':'link1'}))
#
# print(soup.find_all(id='link1'))
# print(soup.find_all(class_='py1'))
#
# # text: search by text content
# print(soup.find_all(text='This is a python demo page'))  # used for matching content
#
# # find(name, attrs, recursive, text, **kwargs)
# # Same usage as find_all; find returns a single element, find_all returns all matching elements
# print(soup.find('p',attrs={'class':'course'}))
# print(type(soup.find('p')))
82
83
# CSS selectors; select() returns a list
# Pass a CSS selector string directly to select() to perform the selection
# To select by class attribute, use '.', e.g. class='course' -> .course; '#' selects by id
# print(soup.select('.course .py1'))
# print(soup.select('p a'))  # nested selection
# print(soup.select('#link1 .element'))
# print(type(soup.select('p')[0]))
#
# # Nested selection
# for p in soup.select('p'):
#     print(p.select('a'))
#
96
# Getting attributes
# Subscripting the tag and going through .attrs are two spellings of the same
# lookup; both values are printed, one per line, for every <p> element.
for p in soup.select('p'):
    print(p['class'], p.attrs['class'], sep='\n')
101
102
# Getting content
# Print the concatenated text of each <p>; the keyword arguments spell out
# get_text()'s defaults (no separator between strings, no whitespace stripping).
for p in soup.select('p'):
    print(p.get_text(separator='', strip=False))