Python xpath 无法解析网页内容

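奇怪:带 headers 请求时,xpath 解析不出内容,结果为空;改用代码里被注释掉的那一行(不带 headers 的 request.Request(url))就能解析出来。参考 https://bbs.csdn.net/topics/392248325 ,把 headers 去掉就好了,当时不知道为什么。一个可能的原因(待确认):带浏览器 User-Agent 请求时,贴吧把帖子列表放在 HTML 注释(<!-- ... -->)里返回,lxml 将其解析为注释节点而不是元素,所以 xpath 匹配不到;可以打印 response,对比两种请求返回的 HTML 来验证。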

 

使用了 headers,未解析出内容:

 1 from urllib import request
 2 import urllib.parse
 3 from lxml import etree
 4 
 5 def loadPage():
 6     header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
 7     data={'kw':'赵丽颖'}
 8     data=urllib.parse.urlencode(data)
 9     url = 'https://tieba.baidu.com/f?' + data + '&ie=utf-8&pn=50'
10 
11     # url = 'http://tieba.baidu.com/p/6118150338?red_tag=v3401180473'
12     a=request.Request(url,headers=header)
13     #a=request.Request(url)
14     response=request.urlopen(a).read().decode('utf-8')
15     # print(response)
16 
17     content=etree.HTML(response)
18     link_list=content.xpath("//div[@class='t_con cleafix']/div[2]/div/div[1]/a/@href")
19     # link_list=content.xpath("//img[@class='BDE_Image']/@src")
20     print(link_list)
21 
22 if __name__ == '__main__':
23     loadPage()

 

不使用 headers,解析出了内容:

 1 from urllib import request
 2 import urllib.parse
 3 from lxml import etree
 4 
 5 def loadPage():
 6     header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"}
 7     data={'kw':'赵丽颖'}
 8     data=urllib.parse.urlencode(data)
 9     url = 'https://tieba.baidu.com/f?' + data + '&ie=utf-8&pn=50'
10 
11     # url = 'http://tieba.baidu.com/p/6118150338?red_tag=v3401180473'
12     # a=request.Request(url,headers=header)
13     a=request.Request(url)
14     response=request.urlopen(a).read().decode('utf-8')
15     # print(response)
16 
17     content=etree.HTML(response)
18     link_list=content.xpath("//div[@class='t_con cleafix']/div[2]/div/div[1]/a/@href")
19     # link_list=content.xpath("//img[@class='BDE_Image']/@src")
20     print(link_list)
21 
22 if __name__ == '__main__':
23     loadPage()

 

 

posted on 2020-02-29 20:17  cherry_ning  阅读(831)  评论(0)    收藏  举报

导航