Web Scraping Notes
Percent-encoding Chinese characters in a URL
import urllib.parse
import string

url = "http://www.baidu.com/s?wd=百度"
# percent-encode the Chinese characters in the URL; safe=string.printable
# leaves the printable ASCII characters (including : / ? =) untouched
new_url = urllib.parse.quote(url, safe=string.printable)
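As a quick check, here is the encoding produced for a sample URL (the wd=百度 query string is just an example value, not from the code above):

import urllib.parse
import string

encoded = urllib.parse.quote("http://www.baidu.com/s?wd=百度", safe=string.printable)
print(encoded)  # http://www.baidu.com/s?wd=%E7%99%BE%E5%BA%A6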
Building a URL from a dict (passing parameters as a dict)
import urllib.parse
import string

url = "http://www.baidu.com/s?"
dic = {"wd": "百度", "key": "zhang"}
# urlencode turns each key:value pair in the dict into key=value,
# joins the pairs with &, and percent-encodes non-ASCII values
_url = urllib.parse.urlencode(dic)
new_url = url + _url
# percent-encode any non-ASCII characters still left in the URL
end_url = urllib.parse.quote(new_url, safe=string.printable)
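Note that urlencode already percent-encodes the Chinese value on its own, so the final quote call only matters if the base URL itself contains non-ASCII characters. A minimal check of the intermediate query string:

import urllib.parse

dic = {"wd": "百度", "key": "zhang"}
print(urllib.parse.urlencode(dic))
# wd=%E7%99%BE%E5%BA%A6&key=zhang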
Setting the User-Agent dynamically
import urllib.request
import random

def load_baidu():
    url = "http://www.baidu.com"
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    ]
    # pick a different browser identity for each request
    random_user_agent = random.choice(user_agent_list)

    request = urllib.request.Request(url)

    # attach the User-Agent request header
    request.add_header("User-Agent", random_user_agent)

    # send the request
    response = urllib.request.urlopen(request)
    # print the header that was set (urllib stores the name capitalized)
    print(request.get_header("User-agent"))

load_baidu()
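To confirm the header is actually sent (rather than only stored on the Request object), one option is to hit an echo service such as httpbin.org, which replies with the User-Agent it received. A minimal sketch, assuming that service is reachable from your network:

import urllib.request

req = urllib.request.Request("http://httpbin.org/user-agent")
req.add_header("User-Agent", "Mozilla/5.0 (test)")
with urllib.request.urlopen(req, timeout=5) as resp:
    # the JSON body echoes back the User-Agent header we sent
    print(resp.read().decode("utf-8"))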
Requesting data through different proxy IPs
import urllib.request

def proxy_user():
    # a list of proxy IPs to rotate through
    proxy_list = [
        {"https": "106.75.226.36:808"},
        {"https": "61.135.217.7:80"},
        {"https": "125.70.13.77:8080"},
        {"https": "118.190.95.35:9001"},
    ]
    for proxy in proxy_list:
        print(proxy)
        # build a proxy handler from the current entry
        proxy_handler = urllib.request.ProxyHandler(proxy)
        # build an opener that routes requests through the proxy
        opener = urllib.request.build_opener(proxy_handler)

        try:
            # use the opener's own open() instead of urlopen()
            data = opener.open("http://www.baidu.com", timeout=1)
            content = data.read()
            print(content)
        except Exception as e:
            print(e)

proxy_user()
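If every later urlopen() call should go through the same proxy, the opener can also be installed globally with install_opener(). A minimal sketch; the proxy address here is a placeholder, not a working proxy:

import urllib.request

# route all subsequent urllib.request.urlopen() calls through this proxy
proxy_handler = urllib.request.ProxyHandler({"http": "127.0.0.1:8080"})
opener = urllib.request.build_opener(proxy_handler)
urllib.request.install_opener(opener)

response = urllib.request.urlopen("http://www.baidu.com", timeout=1)
print(response.status)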