from lxml import html
etree = html.etree
# Load the HTML file: build an lxml HTMLParser and parse "b.html" into a tree.
html_parser = etree.HTMLParser()
tree = etree.parse("b.html", html_parser)

# ---------------------------------------------------------------------------
# Earlier XPath experiments, kept as reference notes. The outputs shown are
# the ones the original author recorded for their own b.html.
# ---------------------------------------------------------------------------
#
# Text of the links inside the <ul> list items:
# result = tree.xpath("/html/body/ul/li[1]/a/text()")  # recorded: ['百度', '谷歌', '搜狗'] (Baidu, Google, Sogou)
#   NOTE(review): this query is identical to the next one, yet three items
#   were recorded; the output presumably belongs to the un-indexed form
#   "/html/body/ul/li/a/text()" - confirm.
#
# XPath positions start at 1, so li[1] is the first list item:
# result = tree.xpath("/html/body/ul/li[1]/a/text()")  # recorded: ['百度'] (Baidu)
#
# Filter on an attribute value with [@attribute='value']:
# result = tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")
# print(result)
#
# Select every <li> under <ol>, then query relative to each one:
# ol_li_list = tree.xpath("/html/body/ol/li")
#
# for li in ol_li_list:
#     # Extract the text from each li; "./" means the current node.
#     result = li.xpath("./a/text()")
#     print(result)
#     # An attribute's value can be read with "@name".
#     result2 = li.xpath("./a/@href")
#     print(result2)
#
# All href values of the links in the <ul>:
# print(tree.xpath("/html/body/ul/li/a/@href"))
#
# Tip: open the page in a browser, right-click and choose "Inspect". The order
# shown on the page corresponds to the markup in the Elements panel; find the
# line you want, right-click it, and copy its XPath.

# Print the text nodes of the first <div> directly under <body>.
first_div_text = tree.xpath("/html/body/div[1]/text()")
print(first_div_text)  # recorded: ['李嘉诚'] (Li Ka-shing)