以下是一些Python常用的反反爬策略:
1. User-Agent伪装:
import requests

# Target page to fetch.
url = 'https://www.example.com'

# Send a desktop-Chrome User-Agent string instead of the library's default
# client identifier.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

response = requests.get(url, headers=headers)
2. IP代理:
import requests

# Route requests through a forward proxy listening on 127.0.0.1:8888.
#
# The dictionary KEY is the scheme of the target URL; the scheme inside the
# VALUE is the protocol used to talk to the proxy itself. Both entries point
# at the same host and port, so both use plain HTTP to reach the proxy:
# HTTPS targets are then tunnelled through it. An 'https://' proxy URL would
# make the client attempt a TLS handshake with the proxy.
# NOTE(review): switch the value back to 'https://' only if the proxy really
# terminates TLS on this port.
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888',
}

url = 'https://www.example.com'
response = requests.get(url, proxies=proxies)
3. 随机延时:
import random
import time

import requests

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}
url = 'https://www.example.com'

response = requests.get(url, headers=headers)

# Pause for a random whole number of seconds (1, 2 or 3) after the request
# so that consecutive requests are not evenly spaced.
delay_seconds = random.randint(1, 3)
time.sleep(delay_seconds)
4. 验证码识别:
import requests
from PIL import Image
import pytesseract

# Download the captcha image.
url = 'https://www.example.com/captcha.jpg'
response = requests.get(url)
# Stop on an HTTP error status instead of saving an error page as the image.
response.raise_for_status()

# Write the raw image bytes to disk so Pillow can open them by path.
with open('captcha.jpg', 'wb') as f:
    f.write(response.content)

# Open the image in a context manager so its file handle is released once
# OCR has finished.
with Image.open('captcha.jpg') as img:
    # OCR output typically ends with whitespace/newline characters; strip
    # them so `code` holds only the recognised text.
    code = pytesseract.image_to_string(img).strip()
5. Cookie管理:
import requests

# A Session keeps one cookie jar for every request made through it, so the
# cookies set during login are sent automatically with later requests.
# Copying `response.cookies` by hand only captures cookies set by that one
# final response and can miss cookies set on a redirect hop during login.
with requests.Session() as session:
    # Log in; any cookies the server sets are stored on the session.
    url = 'https://www.example.com/login'
    data = {'username': 'user', 'password': 'pass'}
    response = session.post(url, data=data)

    # Fetch the protected page; the session attaches the stored cookies.
    url = 'https://www.example.com/data'
    response = session.get(url)

    # Plain-dict snapshot of the session cookies, kept for inspection/reuse.
    cookies = session.cookies.get_dict()
6. 模拟登录:
import requests

# Step 1: submit the credentials to the login endpoint.
url = 'https://www.example.com/login'
data = {'username': 'user', 'password': 'pass'}
response = requests.post(url, data=data)
# Raise on an HTTP error status so a rejected login is reported as such,
# rather than as a JSON/KeyError failure on the next line.
response.raise_for_status()

# The login response body is read as JSON carrying an "access_token" field.
access_token = response.json()["access_token"]

# Step 2: call the protected endpoint, presenting the token as a Bearer
# credential in the Authorization header.
url = 'https://www.example.com/data'
headers = {'Authorization': f'Bearer {access_token}'}
response = requests.get(url, headers=headers)
7. 动态页面处理:
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'https://www.example.com'

# Start a Chrome browser controlled by Selenium.
driver = webdriver.Chrome()
try:
    # Load the page in the real browser so its JavaScript runs.
    driver.get(url)

    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which was
    # removed from Selenium 4; this form also works on Selenium 3.
    element = driver.find_element(By.XPATH, '//*[@id="table"]/tbody/tr[1]/td[1]')

    # Read the text before the browser is closed; the element handle is not
    # usable afterwards.
    text = element.text
finally:
    # Always shut the browser down, even if loading or lookup failed.
    driver.quit()
8. 随机请求头:
import random

import requests

url = 'https://www.example.com'

# Pool of browser User-Agent strings to choose from.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.3',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.3',
]

# Pick one entry at random and send it as this request's User-Agent.
chosen_agent = random.choice(user_agents)
headers = {'User-Agent': chosen_agent}

response = requests.get(url, headers=headers)
浙公网安备 33010602011771号