XPath入门学习

  1 #!/usr/bin/env python  
  2 #-*- coding:utf-8 -*-  
  3 """ 
  4 @author:BanShaoHuan
  5 @file: two.py 
  6 @time: 2018/05/07
  7 @contact: banshaohuan@163.com
  8 @site:  
  9 @software: PyCharm 
 10 
 11 # code is far away from bugs with the god animal protecting
 12     I love animals. They taste delicious.
 13               ┏┓      ┏┓
 14             ┏┛┻━━━┛┻┓
 15             ┃      ☃      ┃
 16             ┃  ┳┛  ┗┳  ┃
 17             ┃      ┻      ┃
 18             ┗━┓      ┏━┛
 19                 ┃      ┗━━━┓
 20                 ┃  神兽保佑    ┣┓
 21                 ┃  永无BUG!   ┏┛
 22                 ┗┓┓┏━┳┓┏┛
 23                   ┃┫┫  ┃┫┫
 24                   ┗┻┛  ┗┻┛ 
 25 """  
 26 import lxml.html
 27 
 28 def parse_doc(page_raw):
 29     '''
 30     :param page_raw:html 源码
 31     :return: Element对象
 32     '''
 33     return lxml.html.fromstring(page_raw)
 34 
 35 html = '''
 36 <!DOCTYPE html>
 37 <html lang="en">
 38 <head>
 39     <meta name="content-type" content="text/html;charset=UTF-8" />
 40     <title>友情链接查询 - 站长工具</title>
 41     <meta name="Keywords" content="友情链接查询">
 42     <meta name="Description" content="友情链接查询">
 43 </head>
 44 <body>
 45     <h1 class="heading">Top News</h1>
 46     <p style="font-size: 200%">World News only on this page</p>
 47     Ah, and here's some more text, by the way.
 48     <p>
 49         <a href="http://www.4399.com/" target="_blank">4399小游戏</a>
 50     </p>
 51     <a href="http://www.cydf.cn/" rel="nofollow" target="_blank">青少年发展基金</a>
 52     <a href="http://www.4399.com/flash/32979.html" target="_blank">洛克王国</a>
 53     <a href="http://game.3533.com/game/" target="_blank">手机游戏</a>
 54     <a href="http://game.3533.com/tupian/" target="_blank">手机壁纸</a>
 55     <a href="http://www.91wan.com/" title="游戏">91wan游戏</a>
 56 </body>
 57 </html>
 58 '''
 59 
 60 doc = parse_doc(html)
 61 
 62 #====(1)定位html标签节点===
 63 # a_xpath = "/html/body/a"    # 选择body标签的直接子节点中的a标签,不包括嵌套在p标签里面的a标签节点
 64 # a_node_list = doc.xpath(a_xpath)    # 返回节点对象的列表
 65 # print(a_node_list)
 66 
 67 # a_xpath = "//a" # 选择所有的a标签
 68 # a_node_list = doc.xpath(a_xpath)    # 返回节点对象的列表
 69 # print(a_node_list)
 70 
 71 # a_xpath = "//body//a"    # 选择body标签下面的所有a标签,包括p标签下面的a标签
 72 # a_node_list = doc.xpath(a_xpath)    # 返回节点对象的列表
 73 # print(a_node_list)
 74 
 75 #====(2)定位到标签的属性====
 76 # h_xpath = "//h1/@class" # 选择h1标签的class属性
 77 # class_value_list = doc.xpath(h_xpath)
 78 # print(class_value_list)
 79 
 80 #====(3)定位到标签里面的文本====
 81 # h_text_xpath = "//h1/text()"    # 选择h1标签下面的text
 82 # h_text_list = doc.xpath(h_text_xpath)
 83 # print(h_text_list)
 84 #
 85 # body_text_xpath = "//body//text()"# 选择body标签下面的所有text
 86 # body_text_list = doc.xpath(body_text_xpath)
 87 # print(body_text_list)
 88 
 89 #====(4)根据特定的属性选择节点
 90 # meta_keywords_xpath = "//meta[@name='Keywords']"    #选择name属性为Keywords的节点
 91 # meta_keywords_node_list = doc.xpath(meta_keywords_xpath)
 92 # print(meta_keywords_node_list)
 93 
 94 # meta_des_xpath = "//meta[contains(@name, 'Descri')]"    # 选择name属性包含Descri的节点
 95 # meta_des_list = doc.xpath(meta_des_xpath)
 96 # print(meta_des_list)
 97 
 98 # content_value_xpath = "//meta[contains(@name, 'Keywords')]//@content"# 选择符合条件的节点的content属性的值
 99 # content_value_list = doc.xpath(content_value_xpath)
100 # print(content_value_list)
101 
102 # text_xpath = "//body/a[contains(text(), '游戏')]/text()"# 选择body下文本包含“游戏”的a标签的文本
103 # text_list = doc.xpath(text_xpath)
104 # print(text_list)
105 
106 #====(5)获取节点的父亲节点====
107 # p_xpath = "//p/a[contains(text(),'4399')]/parent::*"# text包含4399的a标签的父节点
108 # parent_node_list = doc.xpath(p_xpath)
109 # print(parent_node_list[0])
110 
111 #====(6)获取节点的后续同级节点
112 # a_follow_xpath = "//body/a[contains(@href, '4399')]/following-sibling::*" # href属性包含4399字符串的a标签的后续同级标签节点
113 # a_follow_node_list = doc.xpath(a_follow_xpath)
114 # print(a_follow_node_list)
115 
116 #====(7)节点对象一些常用的方法
117 # node = doc.xpath("//a[contains(text(),'洛克')]")[0]   # 得到节点对象
118 # attrib_dict = node.attrib   # 此节点标签的字典形式的属性
119 # # print(attrib_dict)
120 #
121 # node_text = node.text   # 此节点的text
122 # print(node_text)
123 #
124 # node_tag = node.tag # 此节点的标签名称
125 # print(node_tag)
126 #
127 # child_node_list = node.getchildren()    # 获取此节点的孩子节点(getchildren()在lxml中已弃用,建议使用list(node))
128 # print(child_node_list)
129 #
130 # next_node = node.getnext()  # 此节点的下一个同级节点(没有则返回None)
131 # print(next_node)
132 #
133 # href_list = node.xpath("./@href")   # 此节点下面写xpath,注意前面需要有./ 表明是相对路径
134 # print(href_list)

 

posted @ 2018-05-07 17:17  banshaohuan  阅读(116)  评论(0)    收藏  举报