import random
import time

import requests
from lxml import etree


class Spider:
    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42"
        }
        # Fetch an initial proxy and set retry limits.
        self.proxies = self.get_ip()
        self.retry_count = 3       # retries per proxy
        self.reacquire_count = 3   # times to fetch a fresh proxy before giving up

    def get_ip(self):
        """Fetch one fresh proxy IP from the provider and build a proxies dict for requests."""
        url = "http://route.xiongmaodaili.com/xiongmao-web/api/glip?secret=db6ef34d45474c5978015ebfb7c42a99&orderNo=GL20230524165349xRuRnGKM&count=1&isTxt=1&proxyType=1"
        response = requests.get(url).content.decode().replace("\r\n", "")
        ip = {"http": "http://" + response, "https": "http://" + response}
        print(f"Got new proxy IP: {ip}")
        return ip

    def request_get(self, url, params=None, headers=None, cookies=None):
        """
        Retry the GET request with the current proxy; after repeated failures fetch a new proxy.
        :param url: request URL
        :return: the response, or None if every proxy attempt fails
        """
        headers = headers or self.headers
        for reacquire in range(self.reacquire_count):
            for retry in range(self.retry_count):
                try:
                    response = requests.get(url, headers=headers, cookies=cookies, proxies=self.proxies,
                                            params=params, timeout=5)
                    return response
                except Exception as e:
                    print(e)
                    time.sleep(random.randint(1, 3))
            # All retries with this proxy failed: wait, then acquire a fresh proxy.
            time.sleep(5)
            self.proxies = self.get_ip()
        return None

    def request_post(self, url, params=None, data=None, headers=None):
        """
        Retry the POST request with the current proxy; after repeated failures fetch a new proxy.
        :param url: request URL
        :return: the response, or None if every proxy attempt fails
        """
        headers = headers or self.headers
        for reacquire in range(self.reacquire_count):
            for retry in range(self.retry_count):
                try:
                    response = requests.post(url, headers=headers, proxies=self.proxies, params=params,
                                             data=data, timeout=5)
                    return response
                except Exception as e:
                    print(e)
                    time.sleep(random.randint(1, 3))
            # All retries with this proxy failed: wait, then acquire a fresh proxy.
            time.sleep(5)
            self.proxies = self.get_ip()
        print("Not a proxy problem; check the url or headers!")
        return None

    def parse_html(self, response):
        """
        Parse an HTML response.
        :param response: the response object; its body must be HTML.
        :return: the parsed lxml element tree
        """
        return etree.HTML(response.content.decode())

    def xpath(self, html, path):
        """
        Extract content from the parsed HTML via an XPath expression.
        :param html: parsed lxml element
        :param path: XPath expression
        :return: list of matching nodes or values
        """
        return html.xpath(path)
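

# A minimal usage sketch (not part of the original class): the URL and XPath below
# are hypothetical placeholders, shown only to illustrate how the
# request -> parse -> xpath pipeline fits together.
if __name__ == "__main__":
    spider = Spider()
    resp = spider.request_get("https://example.com/")  # hypothetical target URL
    if resp is not None:
        html = spider.parse_html(resp)
        titles = spider.xpath(html, "//title/text()")   # hypothetical XPath
        print(titles)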