1 #!/usr/bin/env python
2 #-*- coding:utf-8 -*-
3 """
4 @author:BanShaoHuan
5 @file: two.py
6 @time: 2018/05/07
7 @contact: banshaohuan@163.com
8 @site:
9 @software: PyCharm
10
11 # code is far away from bugs with the god animal protecting
12 I love animals. They taste delicious.
13 ┏┓ ┏┓
14 ┏┛┻━━━┛┻┓
15 ┃ ☃ ┃
16 ┃ ┳┛ ┗┳ ┃
17 ┃ ┻ ┃
18 ┗━┓ ┏━┛
19 ┃ ┗━━━┓
20 ┃ 神兽保佑 ┣┓
21 ┃ 永无BUG! ┏┛
22 ┗┓┓┏━┳┓┏┛
23 ┃┫┫ ┃┫┫
24 ┗┻┛ ┗┻┛
25 """
26 import lxml.html
27
def parse_doc(page_raw):
    """Turn raw HTML source into an lxml element.

    :param page_raw: HTML source of the page
    :return: the Element object produced by ``lxml.html.fromstring``
    """
    root_element = lxml.html.fromstring(page_raw)
    return root_element
34
# Inline sample HTML page; the XPath snippets further down in this file query it via `doc`.
# It has <meta> tags in <head>, an <h1>, two <p> elements (one wrapping an <a>), a loose text
# line, and several <a> links that are direct children of <body>.
35 html = '''
36 <!DOCTYPE html>
37 <html lang="en">
38 <head>
39 <meta name="content-type" content="text/html;charset=UTF-8" />
40 <title>友情链接查询 - 站长工具</title>
41 <meta name="Keywords" content="友情链接查询">
42 <meta name="Description" content="友情链接查询">
43 </head>
44 <body>
45 <h1 class="heading">Top News</h1>
46 <p style="font-size: 200%">World News only on this page</p>
47 Ah, and here's some more text, by the way.
48 <p>
49 <a href="http://www.4399.com/" target="_blank">4399小游戏</a>
50 </p>
51 <a href="http://www.cydf.cn/" rel="nofollow" target="_blank">青少年发展基金</a>
52 <a href="http://www.4399.com/flash/32979.html" target="_blank">洛克王国</a>
53 <a href="http://game.3533.com/game/" target="_blank">手机游戏</a>
54 <a href="http://game.3533.com/tupian/" target="_blank">手机壁纸</a>
55 <a href="http://www.91wan.com/" title="游戏">91wan游戏</a>
56 </body>
57 </html>
58 '''
59
# Parse the sample page once at module level; `doc` holds whatever parse_doc returns for it.
60 doc = parse_doc(html)
61
62 #====(1)定位html标签节点===
63 # a_xpath = "/html/body/a" # 选择body标签下面的所有a标签,注意不包括p标签下面的a标签节点
64 # a_node_list = doc.xpath(a_xpath) # 返回节点对象的列表
65 # print(a_node_list)
66
67 # a_xpath = "//a" # 选择所有的a标签
68 # a_node_list = doc.xpath(a_xpath) # 返回节点对象的列表
69 # print(a_node_list)
70
71 # a_xpath = "//body//a" # 选择body标签下面的所有a标签,包括p标签下面的a标签
72 # a_node_list = doc.xpath(a_xpath) # 返回节点对象的列表
73 # print(a_node_list)
74
75 #====(2)定位到标签的属性====
76 # h_xpath = "//h1/@class" # 选择h1标签的class属性
77 # class_value_list = doc.xpath(h_xpath)
78 # print(class_value_list)
79
80 #====(3)定位到标签里面的文本====
81 # h_text_xpath = "//h1/text()" # 选择h1标签下面的text
82 # h_text_list = doc.xpath(h_text_xpath)
83 # print(h_text_list)
84 #
85 # body_text_xpath = "//body//text()"# 选择body标签下面的所有text
86 # body_text_list = doc.xpath(body_text_xpath)
87 # print(body_text_list)
88
89 #====(4)根据特定的属性选择节点
90 # meta_keywords_xpath = "//meta[@name='Keywords']" #选择name属性为Keywords的节点
91 # meta_keywords_node_list = doc.xpath(meta_keywords_xpath)
92 # print(meta_keywords_node_list)
93
94 # meta_des_xpath = "//meta[contains(@name, 'Descri')]" # 选择name属性包含Descri的节点
95 # meta_des_list = doc.xpath(meta_des_xpath)
96 # print(meta_des_list)
97
98 # content_value_xpath = "//meta[contains(@name, 'Keywords')]//@content"# 选择符合条件的节点的content属性的值
99 # content_value_list = doc.xpath(content_value_xpath)
100 # print(content_value_list)
101
102 # text_xpath = "//body/a[contains(text(), '游戏')]/text()"# 选择a标签text包含游戏的text
103 # text_list = doc.xpath(text_xpath)
104 # print(text_list)
105
106 #====(5)获取节点的父亲节点====
107 # p_xpath = "//p/a[contains(text(),'4399')]/parent::*"# text包含4399的a标签的父节点
108 # parent_node_list = doc.xpath(p_xpath)
109 # print(parent_node_list[0])
110
111 #====(6)获取节点的后续同级节点
112 # a_follow_xpath = "//body/a[contains(@href, '4399')]/following-sibling::*" # href属性包含4399字符串的a标签的后续同级标签节点
113 # a_follow_node_list = doc.xpath(a_follow_xpath)
114 # print(a_follow_node_list)
115
116 #====(7)节点对象一些常用的方法
117 # node = doc.xpath("//a[contains(text(),'洛克')]")[0] # 得到节点对象
118 # attrib_dict = node.attrib # 此节点标签的字典形式的属性
119 # # print(attrib_dict)
120 #
121 # node_text = node.text # 此节点的text
122 # print(node_text)
123 #
124 # node_tag = node.tag # 此节点的标签名称
125 # print(node_tag)
126 #
127 # child_node_list = node.getchildren() # direct child elements of this node (getchildren() is deprecated in lxml; list(node) is the replacement)
128 # print(child_node_list)
129 #
130 # next_node = node.getnext() # the next sibling element of this node
131 # print(next_node)
132 #
133 # href_list = node.xpath("./@href") # 此节点下面写xpath,注意前面需要有./ 表明是相对路径
134 # print(href_list)