from lxml import etree
import urllib3
import requests

# Fetch a cnblogs "news" sidebar fragment, dump the element whose id is
# "profile_block", then print each of its direct text nodes next to the
# text of the <a> child at the same position.

# The request below is sent with verify=False, which makes urllib3 emit a
# warning on every call; silence it so the output stays readable.
urllib3.disable_warnings()
url = "https://www.cnblogs.com/mvc/blog/news.aspx?blogApp=xiaoyujuan"

# NOTE(review): verify=False turns off TLS certificate validation. It is kept
# because the original script relied on it -- confirm it is still required.
# The timeout prevents the script from hanging forever on a stalled server.
r = requests.get(url, verify=False, timeout=10)
# Fail loudly on 4xx/5xx instead of trying to parse an error page.
r.raise_for_status()

dom = etree.HTML(r.content.decode("utf-8"))
block = dom.xpath("//*[@id='profile_block']")
if not block:
    # Without this guard the script would die with a bare IndexError below.
    raise SystemExit("element with id 'profile_block' not found in response")

profile = block[0]
t = etree.tostring(profile, encoding='utf-8', pretty_print=True)
print(t.decode("utf-8"))

# Text nodes that are direct children of the profile element.
t1 = profile.xpath("text()")
print(t1)
# <a> elements that are direct children of the profile element.
t2 = profile.xpath('a')
# zip pairs the two lists positionally and stops at the shorter one.
# presumably each text node is the label preceding a link -- verify against
# the actual page markup.
for i, j in zip(t1, t2):
    print("%s%s" % (i, j.text))
from lxml import etree

# Sample markup used to demonstrate etree.HTML. Note that the <meta> tags
# come before <html> and that <body>/<html> are never closed.
htmldemo = '''
<meta charset="UTF-8"> <!-- for HTML5 -->
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<html><head><title>yoyo ketang</title></head><body><b><!--Hey, this in comment!--></b>
<p class="title"><b>yoyoketang</b></p><p class="yoyo">这里是我的微信公众号:yoyoketang <br>
<a href="http://www.cnblogs.com/yoyoketang/tag/fiddler/" class="sister" id="link1">fiddler教程</a><br>
<a href="http://www.cnblogs.com/yoyoketang/tag/python/" class="sister" id="link2">python笔记</a><br>
<a href="http://www.cnblogs.com/yoyoketang/tag/selenium/" class="sister" id="link3">selenium文档</a><br>
快来关注吧!</p>
<p class="story">...</p>
'''
# etree.HTML parses the HTML content into an element tree.
demo = etree.HTML(htmldemo)
# To print the parsed HTML, serialize it with etree.tostring:
#   - encoding='utf-8' lets the Chinese text in the document be output as-is;
#   - pretty_print=True emits the result in an indented, readable layout.
# tostring returns bytes here, so decode before printing.
t = etree.tostring(demo, encoding='utf-8', pretty_print=True)
print(t.decode('utf-8'))