papunika - 守护式等待

import os
import ssl
import sys
import time

import pymysql
import undetected_chromedriver as uc
from selenium import webdriver

path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(path)
from spider_setting import MYSQL_HOST, MYSQL_POST, MYSQL_PASSWORD, MYSQL_USER


class Papunika(object):
    """Save cleaned HTML snapshots of papunika.com pages to local files.

    Page URLs are read from the ``papunika_url`` table of the
    ``cloud_joy_monitoring`` MySQL database.  Every URL that contains the
    site root and ends with ``/`` is opened in Chrome, a fixed set of page
    elements is removed with JavaScript, and the resulting page source is
    written under the ``html_en`` output directory.  Pages whose output file
    already exists are skipped.

    Constructing an instance connects to MySQL and immediately runs
    :meth:`main`.
    """

    # Prefix removed from a page URL to derive the key used in file names.
    _SITE_ROOT = "https://papunika.com/"

    # Output file templates; ``{}`` receives the page key.
    _HTML_CN = "E:/07-shunwangwork/33-游戏运营/papunika/html/html_cn/{}.html"
    _HTML_EN = "E:/07-shunwangwork/33-游戏运营/papunika/html/html_en/{}.html"

    # Removes the listed elements (cookie box, code blocks, two menu items,
    # ...) from the loaded document before the page source is captured.
    _CLEANUP_JS = (
        "document.querySelectorAll('.nk-gap-2, .code-block,"
        "#BorlabsCookieBoxWrap,#BorlabsCookieBox, #menu-item-9627, "
        "#menu-item-10519, #borlabs-cookie-js-after')"
        ".forEach(node=>node.remove())"
    )

    def __init__(self):
        """Open the MySQL connection and run the whole scrape."""
        self.db = pymysql.connect(
            host=MYSQL_HOST,
            port=MYSQL_POST,
            database="cloud_joy_monitoring",
            user=MYSQL_USER,
            password=MYSQL_PASSWORD,
            charset='utf8',
            autocommit=True,
        )
        self.cursor = self.db.cursor()

        self.main()

    def main(self):
        """Visit every stored URL and save a snapshot of each missing page.

        The browser session is always terminated with ``quit()``, even when
        a page load or the database query raises, so no Chrome or
        chromedriver process is left behind.
        """
        chrome_options = self._build_chrome_options()

        # Disable certificate verification for urllib-based HTTPS requests
        # made in this process.
        ssl._create_default_https_context = ssl._create_unverified_context

        # Only relevant to undetected_chromedriver (uc.Chrome); the plain
        # Selenium driver is what is actually used below.
        uc.TARGET_VERSION = 101
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get('https://papunika.com/')
            self.cursor.execute('select id, url from papunika_url order by id')
            for data in self.cursor.fetchall():
                self._snapshot_row(driver, data)
        finally:
            # quit() ends the whole WebDriver session; close() would only
            # close the current window and leave the driver process running.
            driver.quit()

    def _build_chrome_options(self):
        """Return the ChromeOptions used for the scraping session."""
        chrome_options = webdriver.ChromeOptions()
        # Headless mode is intentionally left off; add '--headless' to
        # enable it.
        # Works around the "DevToolsActivePort file doesn't exist" error.
        chrome_options.add_argument('--no-sandbox')
        # Recommended by the Chrome documentation to avoid a bug.
        chrome_options.add_argument('--disable-gpu')
        # Browser UI language / locale.
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_argument('--disable-javascript')
        chrome_options.add_argument('--disable-java')

        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,
                'javascript': 2,  # 2 means "blocked"
            }
        }
        chrome_options.add_experimental_option('prefs', prefs)

        # Hide scrollbars, for some special pages.
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument("--proxy-server=192.168.104.134:7890")
        # Do not load images.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        # Start with a maximized window.
        chrome_options.add_argument('--start-maximized')
        return chrome_options

    def _snapshot_row(self, driver, data):
        """Fetch and save the page for one ``(id, url)`` row if needed."""
        url = data[1]
        # Skip NULL/empty URLs, off-site URLs and URLs without trailing "/".
        if not url or self._SITE_ROOT not in url or not url.endswith("/"):
            return
        print(data)

        key = self._url_to_key(url)
        if os.path.exists(self.save_path(key, False)):
            return

        driver.get(url)
        time.sleep(1)
        # Work in the most recently opened window.
        handles = driver.window_handles
        driver.switch_to.window(handles[-1])
        time.sleep(1)
        driver.execute_script(self._CLEANUP_JS)

        # The browser may have been redirected; name the file after the URL
        # that was actually loaded.
        url = driver.current_url
        key = self._url_to_key(url)
        print("js执行完成:{}".format(url))
        time.sleep(1)
        page = driver.page_source
        save_path = self.save_path(key, False)
        print(url, save_path, handles)
        path = os.path.dirname(save_path)
        try:
            self.save_file(save_path, page, path)
        except Exception as e:
            # Best effort: one page that cannot be written must not stop
            # the rest of the batch.
            print("错误:{}".format(e))

    @classmethod
    def _url_to_key(cls, url):
        """Return the file key for *url*.

        The site root is removed and a single trailing ``/`` is dropped when
        present.  The site root itself maps to ``"index"``.
        """
        key = url.replace(cls._SITE_ROOT, "")
        # Only strip a slash that is really there; a redirected URL without
        # one must not lose its last character.
        if key.endswith("/"):
            key = key[:-1]
        return key or "index"

    def save_path(self, number, status):
        """Return the output file path for page key *number*.

        A truthy *status* selects the ``html_cn`` directory, a falsy one the
        ``html_en`` directory.
        """
        template = self._HTML_CN if status else self._HTML_EN
        return template.format(number)

    def save_file(self, file_name, page, path):
        """Write *page* to *file_name* as UTF-8, prefixed with a doctype line.

        *path* is the directory that must exist before writing; it is
        created (with parents) when missing.  An empty *path* means the
        current directory and is left alone.
        """
        if path:
            os.makedirs(path, exist_ok=True)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(page)

# Script entry point: constructing Papunika connects to MySQL and runs the
# whole scrape from its constructor.
if __name__ == "__main__":
    Papunika()

# NOTE(review): the string literal below is never used or executed. It holds
# an earlier attempt at stripping the #BorlabsCookieBox element out of the
# page source with etree/XPath and regex replacement (presumably lxml, which
# this file does not import). Kept for reference only.
'''
html = etree.HTML(page)
            for i in range(5):
                content = html.xpath('//*[@id="BorlabsCookieBox"]')
                if content:
                    data = etree.tostring(content[0], encoding="utf-8").decode("utf-8")
                    for i in re.findall(r'(<.*?>)', data):
                        page = page.replace(i, "")
                    print(data)
                text = html.xpath('//*[@id="BorlabsCookieBox"]//text')
                if text:
                    for i in text:
                        page = page.replace(i, "")
                    print(data)
'''
发表于 2022-05-06 13:55 守护式等待阅读(428) 评论(0) 收藏举报