1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*-
3 # author:Momo time:2018/6/29
4
5 import urllib.request
6 import urllib
7
8 from lxml import etree
9
10
def get_html(url):
    """Fetch *url* and return the response body decoded to text.

    The body is decoded with the charset declared in the response's
    Content-Type header. UTF-8 is used when no charset is declared or when
    the declared name is not a codec Python knows.

    Args:
        url: Address to open; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The decoded response body as ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        declared = response.headers.get_content_charset()
    try:
        return raw.decode(declared or 'utf-8')
    except LookupError:
        # The server announced a charset name Python has no codec for.
        return raw.decode('utf-8')
15
# Tutorial page whose markup is inspected below.
PAGE_URL = "http://www.runoob.com/python3/python3-reg-expressions.html"

# XPath selecting the class attribute of every <table> that is a direct
# child of the element with id="content".
TABLE_CLASS_XPATH = '//*[@id="content"]/table/@class'

html = get_html(PAGE_URL)

# Parse the downloaded markup so it can be queried with XPath.
selector = etree.HTML(html)

# XPath quick reference. The string literal below is the original note,
# kept verbatim; as a bare expression it has no effect at runtime.
#   //       locate nodes starting from the root
#   /        descend one level
#   /text()  extract text content
#   /@XXXX   extract the value of attribute XXXX
"""
// 定位根节点
/ 往下一层寻找
/text() 提取文本内容
/@XXXX 提取属性内容
"""

# Example (disabled): extract text
# content = selector.xpath('//*[@id="content"]/p/text()')  # /text()
# for each in content:
#     print(each)

# Example (disabled): extract attribute values
# link = selector.xpath('/html/body/link/@href')
# for each in link:
#     print(each)

# Print the class attribute of each matching table, one per line.
table = selector.xpath(TABLE_CLASS_XPATH)
for each in table:
    print(each)