携程酒店爬虫

一个基于 Selenium 的携程(Ctrip)酒店列表页爬虫:自动滚动并点击"加载更多",解析酒店卡片信息后存入 MongoDB。

import random
import time
from abc import ABC, abstractmethod
from datetime import datetime
from typing import Any

import pyquery
import selenium
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options

from ConnectionPool import Client_Pool
from cookies.CookieSaver import CookieSaver


class BaseCrawler(ABC):
    """Abstract crawler interface: fetch a page, parse it, persist the results."""

    @abstractmethod
    def crawl(self, url: str):
        """Fetch and process the page at *url*."""
        ...

    @abstractmethod
    def parse(self):
        """Extract structured data from the most recently fetched page."""
        ...

    @abstractmethod
    def save(self):
        """Persist the parsed data to storage."""
        ...



class CrawlerData:
    """Row-oriented accumulator for scraped records.

    Columns are declared up front by *css_dict* (column name -> CSS selector).
    Cells are written one at a time into a working row; ``nextRow`` finalizes
    the row (filling missing declared columns with ``None``) and appends it to
    ``dataList``.
    """

    # Finished rows, in insertion order.
    dataList: list[dict[str, Any]]
    # The row currently being filled.
    dataDict: dict[str, Any]
    # Column name -> CSS selector used to extract that column.
    cssDict: dict[str, str]

    def __init__(self, css_dict: dict[str, str]):
        self.cssDict = css_dict
        self.dataList = []
        self.dataDict = {}

    def write(self, col_name: str, value: Any):
        """Set one cell of the current working row (extra columns are allowed)."""
        self.dataDict[col_name] = value

    def css(self, name: str) -> str:
        """Return the CSS selector for column *name*, or '' if unknown."""
        return self.cssDict.get(name, "")

    def nextRow(self):
        """Finalize the current row and start a fresh one.

        Declared columns that were never written are filled with ``None`` so
        every row has a consistent schema.
        """
        for col_name in self.cssDict:
            self.dataDict.setdefault(col_name, None)
        self.dataList.append(self.dataDict)
        self.dataDict = {}

    def getColName(self) -> tuple[str, ...]:
        """Return the declared column names in declaration order."""
        return tuple(self.cssDict)


class Crawler(BaseCrawler):
    """Selenium(Edge)-backed crawler skeleton; subclasses fill in crawl/parse."""

    driver: webdriver.Edge
    data: CrawlerData
    cookieSaver: CookieSaver

    def __init__(self, url_list: list[str]):
        # Anti-bot hardening: hide the automation flag, ignore cert errors,
        # and present a realistic desktop Edge user-agent string.
        opts = Options()
        for arg in (
            '--disable-blink-features=AutomationControlled',
            '--ignore-certificate-errors',
            '--ignore-ssl-errors',
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
        ):
            opts.add_argument(arg)
        self.driver = webdriver.Edge(options=opts)
        self.urlList = url_list

    def crawl(self, url: str):
        """Placeholder; overridden by subclasses."""
        pass

    def parse(self):
        """Placeholder; overridden by subclasses."""
        pass

    def save(self):
        """Upsert every accumulated row into MongoDB (db "Hotel", coll "XIECHENG").

        The full document is used as the filter, so re-saving an identical row
        does not create a duplicate.
        """
        with MongoClient() as client:
            coll = client["Hotel"]["XIECHENG"]
            for record in self.data.dataList:
                coll.update_one(record, {"$set": record}, upsert=True)
        self.urlList = url_list


def randomWait(min_time=0.5, max_time=1.5):
    """Block for a random duration drawn uniformly from [min_time, max_time] seconds."""
    delay = random.uniform(min_time, max_time)
    time.sleep(delay)


class HotelCrawler(Crawler):
    """Crawls Ctrip hotel-list pages and stores the extracted rows via save()."""

    def __init__(self, url_list: list[str]):
        # CSS selector matching one hotel card in the results list.
        self.goods_css = "div.card-item-wrap"
        # Column name -> selector, evaluated relative to each card.
        css_dict = {
            "title": ".list-card-title span",
            "location": "span.ads",
            "price": "span.real-price",
            "tags": "div.list-card-tag",
            "comment": "div.list-card-comment p.count",
            "score": "div.score span"
        }
        self.data = CrawlerData(css_dict)
        super().__init__(url_list)
        self.cookieSaver = CookieSaver(self.driver)

    def randomScroll(self):
        """Scroll down by 70%-100% of the page height to mimic a human reader."""
        self.driver.execute_script(
            "window.scrollBy(0,(Math.random()*0.3+0.7)*document.body.scrollHeight);")

    def parse(self):
        """Extract one row per hotel card from the current page into self.data."""
        doc = pyquery.PyQuery(self.driver.page_source)
        for card in doc(self.goods_css).items():
            for col, css in self.data.cssDict.items():
                self.data.write(col, card(css).text())
            # Bookkeeping columns beyond the CSS-driven ones.
            self.data.write("domain", self.cookieSaver.cookies.domain)
            self.data.write("time", datetime.now().date().isoformat())
            self.data.nextRow()

    def findMore(self):
        """Click the "load more" button if present; return True on success."""
        try:
            target = self.driver.find_element(By.CSS_SELECTOR, "div.list-btn-more div")
            self.driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center',inline: 'center'});", target)
            target.click()
            return True
        except Exception:
            # Button absent or not yet clickable -- treat as "nothing to click".
            return False

    def crawlAllURL(self, times=10):
        """Crawl every configured URL, clicking "load more" up to *times* each."""
        for url in self.urlList:
            self.crawl(url, times)

    def crawl(self, url: str, times=10):
        """Load *url*, ensure a valid login cookie, expand the list, then persist.

        *times* bounds how many times the "load more" button is clicked before
        parsing and saving the accumulated page.
        """
        # NOTE(review): the page was loaded twice in the original code,
        # presumably to settle redirects/anti-bot checks -- kept as-is.
        self.driver.get(url)
        self.driver.get(url)
        # Block until cookies both load and validate; prompt for a manual login
        # in the browser window otherwise.
        load = self.cookieSaver.load_cookies()
        valid = self.cookieSaver.is_cookie_valid()
        while not load or not valid:
            input("请登录后按回车键继续...")
            self.cookieSaver.save_cookies()
            load = self.cookieSaver.load_cookies()
            valid = self.cookieSaver.is_cookie_valid()
        more_times = 0
        try:
            while True:
                if self.findMore():
                    more_times += 1
                    if more_times > times:
                        break
                else:
                    self.randomScroll()
                randomWait(2.5, 3)
        except Exception as e:
            print(f'遇到错误:{e}'
                  f'已经当前数据存储')
        finally:
            # Bug fix: the original called parse()/save() inside the except
            # block AND again after it, appending duplicate rows to dataList
            # on the error path. Run them exactly once for both paths.
            self.parse()
            self.save()


if __name__ == '__main__':
    # Entry point: crawl the Zhejiang province hotel listing, clicking
    # "load more" up to 100 times per URL.
    start_urls = [
        "https://hotels.ctrip.com/hotels/list?countryId=1&city=-1&optionId=16&optionType=Province&display=%E6%B5%99%E6%B1%9F%2C+%E4%B8%AD%E5%9B%BD",
    ]
    HotelCrawler(start_urls).crawlAllURL(100)

posted @ 2024-12-08 20:06  是胡某某啊  阅读(327)  评论(0)    收藏  举报