Python爬虫 #010 常用的反爬虫手段

11.1-随机请求头

网址:http://www.useragentstring.com/pages/useragentstring.php

里面有各种请求头,下图选择了chrome类型的请求头

  1. 代码示例:

    # Demo: rotate the User-Agent header on every request to dodge
    # trivial UA-based bot detection. httpbin echoes back the UA it saw.
    import random
    import requests

    # Pool of real browser User-Agent strings to choose from.
    USER_AGENTS = [
        'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',
        'Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
        'Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',
        'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0',
        'Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19',
        'Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
        'Mozilla/5.0 (iPod; U; CPU like Mac OS X; en) AppleWebKit/420.1 (KHTML, like Gecko) Version/3.0 Mobile/3A101a Safari/419.3'
    ]
    url = "http://httpbin.org/user-agent"
    for _ in range(3):
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        # timeout so the demo cannot hang forever on a dead connection
        response = requests.get(url=url, headers=headers, timeout=10)
        print(response.text)
    
    {
      "user-agent": "Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30"
    }
    {
      "user-agent": "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3"
    }
    {
      "user-agent": "Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0"
    }
    

11.2-ip代理池

  1. 获取代理的程序

#!/usr/bin/env python

# -*- coding:utf-8 -*-

   import requests
   from bs4 import BeautifulSoup
   import random

   ip_list = []  # (ip, port) tuples that passed the connectivity test in judge()

   def get_ip_list(url):
       """Scrape a xicidaili-style proxy table page and test every entry.

       Each row's IP and port are handed to judge(), which appends the
       working proxies to the module-level ip_list.
       """
       headers = {'User-Agent': 'Mozilla/5.0'}
       # timeout so a dead site cannot hang the scraper forever
       res = requests.get(url, headers=headers, timeout=10)
       bs = BeautifulSoup(res.text, 'html.parser')
       results = bs.select('#ip_list tr')
       # the first <tr> is the table header -- skip it
       for result in results[1:]:
           cells = result.select('td')
           # defensive: skip malformed rows instead of raising IndexError
           if len(cells) < 3:
               continue
           judge(cells[1].text, cells[2].text)

   def judge(ip, port):
       """Test one proxy against baidu and record it in ip_list if usable.

       Returns True when the proxied request succeeds with a 2xx status,
       False otherwise.
       """
       # The test URL is https://, so the proxy must be registered for the
       # 'https' scheme too: requests ignores a lone {'http': ...} entry
       # for an https URL, which made every proxy appear to "pass".
       address = ip + ':' + port
       proxy = {'http': address, 'https': address}
       print('-' * 45)
       print('正在测试 %s ' % proxy)
       print('-' * 45)
       try:
           # timeout keeps one dead proxy from stalling the whole scan
           res = requests.get('https://www.baidu.com', proxies=proxy,
                              timeout=10)
       except Exception:
           print('IP:' + ip + '无效!')
           return False
       if 200 <= res.status_code < 300:
           # announce success only after the status check, not before
           print('恭喜!%s:%s 测试通过!' % (ip, port))
           ip_list.append((ip, port))
           return True
       print('IP:' + ip + '无效!')
       return False

   def get_random_ip():
       """Return a random verified proxy as 'ip:port'.

       Re-verifies the picked proxy with judge(); dead entries are
       dropped and another one is tried. Returns None when the pool is
       empty or no proxy passes (the original raised IndexError on an
       empty pool and gave up after a single failed pick).
       """
       while ip_list:
           ip, port = random.choice(ip_list)
           if judge(ip, port):
               # judge() appends a passing proxy again -- remove the
               # original entry so the pool holds no duplicates
               ip_list.remove((ip, port))
               return ip + ':' + port
           # proxy went stale since it was collected -- evict and retry
           ip_list.remove((ip, port))
       return None

   if __name__ == '__main__':
       get_ip_list('https://www.xicidaili.com/wt/')
       print('-' * 45)
       for IP in ip_list:
           print(':'.join(IP))
       # all summary output belongs inside the guard -- the trailing
       # separator used to run at import time; widths unified to 45
       print('-' * 45)
       print('通过测试的 IP 总数:', len(ip_list))
       print('-' * 45)

  1. 代码示例:

    # Demo: route a request through a randomly chosen proxy so the
    # target (httpbin echoes the caller's IP) sees the proxy's address.
    import requests
    import random

    # Free proxies rot quickly -- replace these with live ones as needed.
    proxys = [
        "222.240.184.126:8086",
        "117.88.176.135:3000",
        "122.4.43.149:808",
        "103.10.86.203:8080"
    ]

    url = 'http://httpbin.org/ip'
    proxy = random.choice(proxys)
    # The URL is plain http, so only the 'http' scheme key is consulted;
    # timeout keeps a dead proxy from hanging the script indefinitely.
    response = requests.get(url=url, proxies={"http": proxy}, timeout=10)

    print(response.text)
    

```python
   {
     "origin": "117.88.176.135"
   }
```
posted @ 2023-06-28 22:56  枫_Null  阅读(28)  评论(0)    收藏  举报