【Python】【小甲鱼】Crawler 3---Hiding, to Avoid Site Anti-Crawler Blocking

Two ways to hide (i.e. modify) the headers:

1. Below is the header-hiding version of the Python program that calls Youdao Translate (method one).
2. Why hide at all? If a single IP hits a site too often within a short window, the site's anti-crawler defenses will block it, and our crawler can do no further work. So we give the crawler a disguise and slip past the gate.
import urllib.request
import urllib.parse   # handles URL encoding of the form data
import json

content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

head = {}   # added: build a custom header
# Open the page, press F12, find the POST request under Network,
# and copy its User-Agent value in here:
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'

data = {}
data['type'] = 'AUTO'
data['i'] = content   # the hard-coded 'I' replaced by the user's input
data['doctype'] = 'json'
data['xmlVersion'] = '1.8'
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
data = urllib.parse.urlencode(data).encode('utf-8')

req = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

#print(html)
target = json.loads(html)
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))

Run result, plus a check that the header went in correctly:

>>> 
Enter the text to translate: I
Translation result: 我
>>> req.headers   # check the header
{'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'}
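For context (my addition, not in the original post): the point of the check above is that, without this step, urllib announces itself as Python, which is exactly what anti-crawler filters match on. A minimal sketch of how to see the default value:

import urllib.request

# A plain opener carries urllib's built-in identification in its
# default header list.
opener = urllib.request.build_opener()
print(opener.addheaders)   # e.g. [('User-agent', 'Python-urllib/3.8')]

Any site that blocklists 'Python-urllib' will reject such a request on sight, which is why both methods below overwrite User-Agent with a real browser string.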

Method two for hiding:

import urllib.request
import urllib.parse
import json

content = input("Enter the text to translate: ")
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

'''
1. Use the single add_header() call below (statement q) in place of
   this head assignment:
head={}
head['User-Agent']='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
'''

data = {}
data['type'] = 'AUTO'
data['i'] = content
data['doctype'] = 'json'
data['xmlVersion'] = '1.8'
data['keyfrom'] = 'fanyi.web'
data['ue'] = 'UTF-8'
data['action'] = 'FY_BY_CLICKBUTTON'
data['typoResult'] = 'true'
data = urllib.parse.urlencode(data).encode('utf-8')

#req=urllib.request.Request(url,data,head)   # 2. replaced by the next line; head is no longer referenced, so it is dropped
req = urllib.request.Request(url, data)
# 3. statement q
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')

response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')

#print(html)
target = json.loads(html)
print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
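Because add_header() sets the value at request time, one easy extension (my own addition, not part of the original lesson) is to rotate through several real browser strings so consecutive requests do not all look identical. A minimal sketch, assuming a hand-picked ua_list:

import random
import urllib.request

# Hypothetical pool of real browser User-Agent strings; extend as needed.
ua_list = [
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0',
]

req = urllib.request.Request('http://fanyi.youdao.com/translate')
req.add_header('User-Agent', random.choice(ua_list))   # pick one per request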

Method three: add a rest between requests:

import urllib.request
import urllib.parse
import json
import time

while True:   #+ added this loop
    content = input('Enter the text to translate (type "q!" to quit): ')   #+
    if content == 'q!':   #+
        break   #+
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=dict2.index'

    '''
    1. Use the single add_header() call below (statement q) in place of
       this head assignment:
    head={}
    head['User-Agent']='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
    '''

    data = {}
    data['type'] = 'AUTO'
    data['i'] = content
    data['doctype'] = 'json'
    data['xmlVersion'] = '1.8'
    data['keyfrom'] = 'fanyi.web'
    data['ue'] = 'UTF-8'
    data['action'] = 'FY_BY_CLICKBUTTON'
    data['typoResult'] = 'true'
    data = urllib.parse.urlencode(data).encode('utf-8')

    req = urllib.request.Request(url, data)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')

    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')

    #print(html)
    target = json.loads(html)
    #print("Translation result: %s" % (target['translateResult'][0][0]['tgt']))
    target = target['translateResult'][0][0]['tgt']   #+
    print(target)   #+
    time.sleep(5)   #+ wait between requests so the site does not flag us as a crawler
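A perfectly regular 5-second heartbeat is itself easy to fingerprint. A small variation on the same idea (my addition, not in the original) is to randomize the pause:

import random
import time

# Sleep a random 3-8 seconds instead of a fixed 5, so the request
# rhythm looks less mechanical (an assumption about what the target
# site measures, not something the original lesson requires).
time.sleep(random.uniform(3, 8))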

Method four: route through a proxy. The proxy fetches the page and hands back what it saw, so you get the same content while the target site sees the proxy's IP instead of yours.

import urllib.request
import random   # added: needed for random.choice below

url = 'http://www.whatismyip.com.tw'
iplist = ['']   #+ fill in working proxy addresses here, e.g. 'ip:port'

#proxy_support = urllib.request.ProxyHandler({'http':'196.168.0.100:808'})
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})

opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')]
urllib.request.install_opener(opener)

response = urllib.request.urlopen(url)
html = response.read().decode('utf-8')

print(html)
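Free proxies die constantly, so random.choice() alone will crash the moment it picks a dead one. A hedged sketch (my addition; open_via_proxy is a hypothetical helper, and iplist still has to be filled with real addresses) that tries each proxy until one answers:

import random
import urllib.error
import urllib.request

def open_via_proxy(url, iplist):
    # Try the proxies in random order; return the first page that loads.
    for ip in random.sample(iplist, len(iplist)):
        opener = urllib.request.build_opener(
            urllib.request.ProxyHandler({'http': ip}))
        opener.addheaders = [('User-Agent',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')]
        try:
            return opener.open(url, timeout=10).read().decode('utf-8')
        except (urllib.error.URLError, OSError):
            continue   # dead or blocked proxy; try the next one
    raise RuntimeError('no working proxy in iplist')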

 
