Spider helper class

A small wrapper around requests that rotates proxy IPs (via the xiongmaodaili API), retries failed requests, and parses HTML with lxml.

import random
import requests
from lxml import etree
import time


class Spider:

    def __init__(self):
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42"
        }
        self.proxies = self.get_ip()
        self.retry_count = 3       # retries per proxy before giving up on it
        self.reacquire_count = 3   # fresh proxy IPs to try before failing

    def get_ip(self):
        """Fetch a fresh proxy IP from the xiongmaodaili API and build a requests-style proxies dict."""
        url = "http://route.xiongmaodaili.com/xiongmao-web/api/glip?secret=db6ef34d45474c5978015ebfb7c42a99&orderNo=GL20230524165349xRuRnGKM&count=1&isTxt=1&proxyType=1"
        response = requests.get(url).content.decode().replace("\r\n", "")
        ip = {"http": "http://" + response, "https": "http://" + response}
        print(f"Got new proxy IP: {ip}")
        return ip

    def request_get(self, url, params=None, headers=None, cookies=None):
        """
        Retry the GET request several times with the current proxy; return the
        response on success. If every retry across several fresh proxies fails,
        return None.
        :param url: request URL
        :return: response, or None on failure
        """
        headers = headers or self.headers
        for reacquire in range(self.reacquire_count):
            for retry in range(self.retry_count):
                try:
                    response = requests.get(url, headers=headers, cookies=cookies,
                                            proxies=self.proxies, params=params, timeout=5)
                    return response
                except Exception as e:
                    print(e)
                    time.sleep(random.randint(1, 3))
            time.sleep(5)
            self.proxies = self.get_ip()  # rotate to a fresh proxy after repeated failures
        return None

    def request_post(self, url, params=None, data=None, headers=None):
        """
        Retry the POST request several times with the current proxy; return the
        response on success. If every retry across several fresh proxies fails,
        return None.
        :param url: request URL
        :return: response, or None on failure
        """
        headers = headers or self.headers
        for reacquire in range(self.reacquire_count):
            for retry in range(self.retry_count):
                try:
                    response = requests.post(url, headers=headers, proxies=self.proxies,
                                             params=params, data=data, timeout=5)
                    return response
                except Exception as e:
                    print(e)
                    time.sleep(random.randint(1, 3))
            time.sleep(5)
            self.proxies = self.get_ip()  # rotate to a fresh proxy after repeated failures
        print("Not a proxy issue; check the url or headers!")
        return None

    def parse_html(self, response):
        """
        Parse an HTML response.
        :param response: response object whose body is HTML.
        :return: parsed lxml element tree
        """
        return etree.HTML(response.content.decode())

    def xpath(self, html, path):
        """
        Extract content from parsed HTML via an XPath expression.
        :param html: parsed lxml element tree
        :param path: XPath expression
        :return: matching nodes or values
        """
        return html.xpath(path)
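

A minimal usage sketch (the target URL and XPath below are hypothetical placeholders, and the proxy API above must return a usable IP for the requests to go through):

spider = Spider()
response = spider.request_get("https://example.com/list")  # hypothetical target page
if response is not None:
    html = spider.parse_html(response)
    # hypothetical XPath: grab the link texts on the page
    titles = spider.xpath(html, "//h2/a/text()")
    print(titles)
else:
    print("Request failed after exhausting all proxies")

Note that with retry_count = 3 and reacquire_count = 3, each call makes at most 9 attempts (3 retries on each of 3 proxy IPs) before giving up and returning None.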
