1 # -*- coding:utf-8 -*-
2 '''
3 Created on Sep 10, 2018
4
5 @author: SaShuangYiBing
6 '''
7 from lxml import etree
8
# Sample HTML document that every XPath example below runs against.
# The markup is kept verbatim: the queries depend on its exact element
# nesting, ids and class names.
html = """
<html>
<head>
<title>哈哈测试一下</title>
<link type="text/css" rel="stylesheet" href="haha.css" />
<link type="text/css" rel="stylesheet" href="haha1.css" />
<link type="text/css" rel="stylesheet" href="haha2.css" />
<script type="text/javascript" src="haha.js"></script>
<script type="text/javascript" src="haha1.js"></script>
<script type="text/javascript" src="haha2.js"></script>
</head>
<body>
<div id="id1" class="class1">
<div id="id2" class="class2">
<ul class="cls_ul1">
<li class="cls_li1">
<div class="cls_3">
<span>span_text1</span>
<span>span_text2</span>
<i>text_1</i>
</div>
<div>
<a href="a_1.html">a_1</a>
<a href="a_2.html">a_2</a>
<a href="a_3.html">a_3</a>
</div>
<div class="cls_4">
<a href="a_4.html">
<img href="a_img1.jpg" />
</a>
</div>
</li>
<li class="cls_li1">
<div class="cls_3">
<span>span_text3</span>
<span>span_text4</span>
<i>text_2</i>
<i>text_22</i>
</div>
<div>
<a href="a_4.html">a_4</a>
<a href="a_5.html">a_5</a>
<a href="a_6.html">a_6</a>
</div>
<div class="cls_4">
<a href="a_5.html">
<img href="a_img2.jpg" />
</a>
</div>
</li>
</ul>
</div>
<div id="id3" class="class3">
<ul class="cls_ul2">
<li class="cls_li2">
<div class="cls_5">
<span>span_text5</span>
<span>span_text6</span>
<i>text_3</i>
</div>
<div>
<a href="a_1.html">a_1</a>
<a href="a_2.html">a_2</a>
<a href="a_3.html">a_3</a>
</div>
<div class="cls_6">
<a href="a_4.html">
<img href="a_img3.jpg" />
</a>
</div>
</li>
<li class="cls_li2">
<div class="cls_5">
<span>span_text7</span>
<span>span_text8</span>
<i>text_4</i>
</div>
<div>
<a href="a_4.html">a_4</a>
<a href="a_5.html">a_5</a>
<a href="a_6.html">a_6</a>
</div>
<div class="cls_6">
<a href="a_5.html">
<img href="a_img4.jpg" />
</a>
</div>
</li>
</ul>
</div>
</div>
</body>
</html>
"""

# Parse the markup with lxml's HTML parser. html_data is the object that
# every example below calls .xpath() on.
html_data = etree.HTML(html)
105
# 1. Absolute path: start at the root and walk the tree one step at a time
#    down to <title>; text() selects the node's text content.
title_texts = html_data.xpath("/html/head/title/text()")
for title_text in title_texts:
    print(title_text)
print("~~~~~~~~~这是第一个分隔线~~~~~~~~~")
111
# 2. Same absolute path, but select the <title> element itself and read its
#    content through the element's .text attribute instead of text().
for title_node in html_data.xpath("/html/head/title"):
    print(title_node.text)
print("~~~~~~~~~这是第二个分隔线~~~~~~~~~")
117
# 3. "//" matches <title> anywhere in the document, regardless of where it
#    sits in the tree; text() selects the node's text content.
title_query = "//title/text()"
for title_text in html_data.xpath(title_query):
    print(title_text)
print("~~~~~~~~~这是第三个分隔线~~~~~~~~~")
123
# 4. Print the id attribute of every <div> found at /html/body/div/div.
#    The matched elements are iterated directly (no range(len(...)) index
#    loop); "@id" is then evaluated relative to each matched element.
nodes = html_data.xpath("/html/body/div/div")
for node in nodes:
    for con in node.xpath("@id"):
        print(con)
print("~~~~~~~~~这是第四个分隔线~~~~~~~~~")
131
# 5. Attribute values of nodes under <body>. The path has no leading "/",
#    so it is evaluated relative to html_data: the href of every <a> in
#    the second <div> of the first <li> under div#id2.
body_hrefs = html_data.xpath("body/div/div[@id= 'id2']/ul/li[1]/div[2]/a/@href")
for href in body_hrefs:
    print(href)
print("~~~~~~~~~这是第五个分隔线~~~~~~~~~")
137
# 6. Attribute values of nodes under div[@id='id2']: locate the div by its
#    id anywhere in the document, then read the href attributes.
href_query = "//div[@id = 'id2']/ul/li[1]/div[2]/a/@href"
for href in html_data.xpath(href_query):
    print(href)
print("~~~~~~~~~这是第六个分隔线~~~~~~~~~")
143
# 7. Text content of nodes under div[@id='id2']: same location as the
#    previous example, but text() selects the link text, not the href.
link_texts = html_data.xpath("//div[@id= 'id2']/ul/li[1]/div[2]/a/text()")
for link_text in link_texts:
    print(link_text)
print("~~~~~~~~~这是第七个分隔线~~~~~~~~~")
149
# 8. "*" matches any element: the path starts from any child element of
#    the context node and then searches its descendants for div#id2.
wildcard_query = "*//div[@id = 'id2']/ul/li[1]/div[2]/a/text()"
for link_text in html_data.xpath(wildcard_query):
    print(link_text)
print("~~~~~~~~~这是第八个分隔线~~~~~~~~~")
155
# 9. Select several kinds of node at once: "|" is the XPath union operator,
#    so this matches every <i> and every <span> in the document.
#    The elements are iterated directly instead of through an index loop.
nodes = html_data.xpath("//i|//span")
for node in nodes:
    print(node.text)
print("~~~~~~~~~这是第九个分隔线~~~~~~~~~")
161
# 10. Select every <li>, then for each one print its position in the result
#     list followed by the class attribute of its child <div> elements.
#     enumerate() supplies the index that the header line prints, replacing
#     the range(len(...)) + indexing pattern.
nodes = html_data.xpath("//li")
for index, node in enumerate(nodes):
    # The path is relative to the current <li>; child divs without a
    # class attribute contribute nothing to the result.
    content = node.xpath("div/@class")
    print(index, '=' * 5)
    for con in content:
        print(con)
print("~~~~~~~~~这是第十个分隔线~~~~~~~~~")
170
# 11. Select every <li>, then for each one print its position in the result
#     list followed by the class attribute of its LAST child <div>.
#     enumerate() supplies the index that the header line prints, replacing
#     the range(len(...)) + indexing pattern.
nodes = html_data.xpath("//li")
for index, node in enumerate(nodes):
    # last() restricts the step to the final child <div> of the current <li>.
    content = node.xpath("div[last()]/@class")
    print(index, '=' * 5)
    for con in content:
        print(con)
print("~~~~~~~~~这是第十一个分隔线~~~~~~~~~")
179
# 12. Uses ".." and "@". ".." steps to the parent of the previous step
#     (<title>), i.e. <head>; "@" introduces an attribute name, so this
#     reads the src attribute of every <script> under <head>.
script_sources = html_data.xpath("/html/head/title/../script/@src")
for script_src in script_sources:
    print(script_src)
print("~~~~~~~~~这是第十二个分隔线~~~~~~~~~")
185
# 13. For each <span> child of div[@class='cls_3'], select the <i> siblings
#     that follow it (following-sibling axis), then print each one's text.
#     The matched elements are iterated directly instead of by index.
nodes = html_data.xpath("//div[@class = 'cls_3']/span/following-sibling::i")
for node in nodes:
    # "./text()" selects the text content of the current node.
    for con in node.xpath("./text()"):
        print(con)
print("~~~~~~~~~这是第十三个分隔线~~~~~~~~~")
193
# 14. class attribute of the first <div> among the descendants of each
#     li[@class='cls_li1'] (descendant axis with a positional predicate).
first_div_classes = html_data.xpath("//li[@class = 'cls_li1']/descendant::div[1]/@class")
for class_name in first_div_classes:
    print(class_name)
print("~~~~~~~~~这是第十四个分隔线~~~~~~~~~")
199
200
# 15. Text of every <span> found among the descendants of
#     li[@class='cls_li1'].
span_query = "//li[@class = 'cls_li1']/descendant::span/text()"
for span_text in html_data.xpath(span_query):
    print(span_text)
print("~~~~~~~~~这是第十五个分隔线~~~~~~~~~")
206
# 16. "*" matches any element, combined with not(@class) to keep only the
#     <div> nodes that have no class attribute.
unclassed_link_texts = html_data.xpath("*//div[@id = 'id2']/ul/li[1]/div[not(@class)]/a/text()")
for link_text in unclassed_link_texts:
    print(link_text)
print("~~~~~~~~~这是第十六个分隔线~~~~~~~~~")
212
# 17. Multiple conditions in one predicate: "and" requires the <div> to
#     match both the id and the class test.
multi_condition_query = "//div[@id= 'id2' and @class= 'class2']/ul/li[1]/div[1]/span/text()"
for span_text in html_data.xpath(multi_condition_query):
    print(span_text)
print("~~~~~~~~~这是第十七个分隔线~~~~~~~~~")
218
# 18. contains(): match a <div> whose class attribute value contains the
#     substring 'class2', then read the hrefs in the second <div> of its
#     second <li>.
matched_hrefs = html_data.xpath("//div[contains(@class,'class2')]/ul/li[2]/div[2]/a/@href")
for href in matched_hrefs:
    print(href)
print("~~~~~~~~~这是第十八个分隔线~~~~~~~~~")
224
225 输出如下:
226
227 哈哈测试一下
228 ~~~~~~~~~这是第一个分隔线~~~~~~~~~
229 哈哈测试一下
230 ~~~~~~~~~这是第二个分隔线~~~~~~~~~
231 哈哈测试一下
232 ~~~~~~~~~这是第三个分隔线~~~~~~~~~
233 id2
234 id3
235 ~~~~~~~~~这是第四个分隔线~~~~~~~~~
236 a_1.html
237 a_2.html
238 a_3.html
239 ~~~~~~~~~这是第五个分隔线~~~~~~~~~
240 a_1.html
241 a_2.html
242 a_3.html
243 ~~~~~~~~~这是第六个分隔线~~~~~~~~~
244 a_1
245 a_2
246 a_3
247 ~~~~~~~~~这是第七个分隔线~~~~~~~~~
248 a_1
249 a_2
250 a_3
251 ~~~~~~~~~这是第八个分隔线~~~~~~~~~
252 span_text1
253 span_text2
254 text_1
255 span_text3
256 span_text4
257 text_2
258 text_22
259 span_text5
260 span_text6
261 text_3
262 span_text7
263 span_text8
264 text_4
265 ~~~~~~~~~这是第九个分隔线~~~~~~~~~
266 0 =====
267 cls_3
268 cls_4
269 1 =====
270 cls_3
271 cls_4
272 2 =====
273 cls_5
274 cls_6
275 3 =====
276 cls_5
277 cls_6
278 ~~~~~~~~~这是第十个分隔线~~~~~~~~~
279 0 =====
280 cls_4
281 1 =====
282 cls_4
283 2 =====
284 cls_6
285 3 =====
286 cls_6
287 ~~~~~~~~~这是第十一个分隔线~~~~~~~~~
288 haha.js
289 haha1.js
290 haha2.js
291 ~~~~~~~~~这是第十二个分隔线~~~~~~~~~
292 text_1
293 text_2
294 text_22
295 ~~~~~~~~~这是第十三个分隔线~~~~~~~~~
296 cls_3
297 cls_3
298 ~~~~~~~~~这是第十四个分隔线~~~~~~~~~
299 span_text1
300 span_text2
301 span_text3
302 span_text4
303 ~~~~~~~~~这是第十五个分隔线~~~~~~~~~
304 a_1
305 a_2
306 a_3
307 ~~~~~~~~~这是第十六个分隔线~~~~~~~~~
308 span_text1
309 span_text2
310 ~~~~~~~~~这是第十七个分隔线~~~~~~~~~
311 a_4.html
312 a_5.html
313 a_6.html
314 ~~~~~~~~~这是第十八个分隔线~~~~~~~~~