import os
import ssl
import sys
import time
import pymysql
import undetected_chromedriver as uc
from selenium import webdriver
path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(path)
from spider_setting import MYSQL_HOST, MYSQL_POST, MYSQL_PASSWORD, MYSQL_USER
class Papunika(object):
    """Mirror papunika.com pages listed in MySQL into local HTML files.

    Constructing an instance opens the MySQL connection and immediately runs
    the whole crawl (see ``main``); the constructor returns when it is done.
    """

    # URLs must contain this prefix to be crawled; it is also stripped from a
    # URL to derive the relative output file name.
    BASE_URL = "https://papunika.com/"
    # Folder that holds the html_en / html_cn output trees.
    OUTPUT_ROOT = "E:/07-shunwangwork/33-游戏运营/papunika/html"
    # Proxy handed to Chrome via --proxy-server.
    PROXY_SERVER = "192.168.104.134:7890"
    # Removes every node matching the listed selectors from the DOM before the
    # page source is captured.
    _CLEANUP_SCRIPT = "document.querySelectorAll('.nk-gap-2, .code-block,#BorlabsCookieBoxWrap,#BorlabsCookieBox, #menu-item-9627, #menu-item-10519, #borlabs-cookie-js-after').forEach(node=>node.remove())"

    def __init__(self):
        self.db = pymysql.connect(host=MYSQL_HOST, port=MYSQL_POST, database="cloud_joy_monitoring", user=MYSQL_USER, password=MYSQL_PASSWORD, charset='utf8', autocommit=True)
        self.cursor = self.db.cursor()
        self.main()

    def main(self):
        """Crawl every eligible URL from ``papunika_url`` and save its HTML.

        A row is eligible when its URL contains ``BASE_URL`` and ends with
        ``/``. Pages whose English output file already exists are skipped.
        The browser is always shut down, even if the crawl raises.
        """
        chrome_options = self._build_options()
        # NOTE(review): this disables HTTPS certificate verification for the
        # whole process, not just for this crawl.
        ssl._create_default_https_context = ssl._create_unverified_context
        # Presumably only consulted by undetected_chromedriver; the stock
        # Selenium driver below is what actually gets launched.
        uc.TARGET_VERSION = 101
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(self.BASE_URL)
            self.cursor.execute('select id, url from papunika_url order by id')
            for data in self.cursor.fetchall():
                url = data[1]
                # Skip NULL/empty URLs and anything that is not a
                # slash-terminated papunika.com address.
                if not url or self.BASE_URL not in url or not url.endswith("/"):
                    continue
                print(data)
                if os.path.exists(self.save_path(self._url_to_key(url), False)):
                    continue
                self._fetch_and_save(driver, url)
        finally:
            # quit() ends the whole session (all windows plus the driver
            # process); close() would only close the current window.
            driver.quit()

    def _build_options(self):
        """Return the ChromeOptions used for the crawl."""
        chrome_options = webdriver.ChromeOptions()
        # Works around the "DevToolsActivePort file doesn't exist" error.
        chrome_options.add_argument('--no-sandbox')
        # Google's documentation mentions this flag to avoid a bug.
        chrome_options.add_argument('--disable-gpu')
        # Browser language.
        chrome_options.add_argument('--lang=zh-CN')
        chrome_options.add_argument('--disable-javascript')
        chrome_options.add_argument('--disable-java')
        prefs = {
            'profile.default_content_setting_values': {
                'images': 2,
                'javascript': 2,  # 2 means "disabled"
            }
        }
        chrome_options.add_experimental_option('prefs', prefs)
        # Hide scrollbars, for some special pages.
        chrome_options.add_argument('--hide-scrollbars')
        chrome_options.add_argument("--proxy-server={}".format(self.PROXY_SERVER))
        # Do not load images.
        chrome_options.add_argument('blink-settings=imagesEnabled=false')
        # Start with a maximized window.
        chrome_options.add_argument('--start-maximized')
        return chrome_options

    @staticmethod
    def _url_to_key(url):
        """Return the output file key for *url* (``"index"`` for the root).

        The site prefix is removed and a single trailing ``/`` is dropped
        only when one is actually present, so a URL without a trailing slash
        does not lose its last character.
        """
        key = url.replace(Papunika.BASE_URL, "")
        if key.endswith("/"):
            key = key[:-1]
        return key or "index"

    def _fetch_and_save(self, driver, url):
        """Load *url*, strip unwanted nodes and write the page to disk."""
        driver.get(url)
        time.sleep(1)
        handles = driver.window_handles
        # Work on the most recently opened window.
        driver.switch_to.window(handles[-1])
        time.sleep(1)
        driver.execute_script(self._CLEANUP_SCRIPT)
        # The file name follows the URL the browser ended up on, which may
        # differ from the requested one.
        url = driver.current_url
        key = self._url_to_key(url)
        print("js执行完成:{}".format(url))
        time.sleep(1)
        page = driver.page_source
        save_path = self.save_path(key, False)
        print(url, save_path, handles)
        try:
            self.save_file(save_path, page, os.path.dirname(save_path))
        except (OSError, ValueError) as e:
            # Best effort: report the failed write and carry on with the
            # next URL. ValueError also covers encoding errors.
            print("错误:{}".format(e))

    def save_path(self, number, status):
        """Return the output file path for page key *number*.

        A truthy *status* selects the ``html_cn`` tree, otherwise ``html_en``.
        """
        tree = "html_cn" if status else "html_en"
        return "{}/{}/{}.html".format(self.OUTPUT_ROOT, tree, number)

    def save_file(self, file_name, page, path):
        """Write *page* to *file_name*, prefixed with an HTML5 doctype line.

        *path* is the directory to create beforehand; an empty *path* means
        there is no directory to create.
        """
        if path:
            os.makedirs(path, exist_ok=True)
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write("<!DOCTYPE html>\n")
            f.write(page)
# Script entry point: constructing Papunika opens the MySQL connection and runs
# the whole crawl from inside its constructor.
if __name__ == "__main__":
    Papunika()
'''
html = etree.HTML(page)
for i in range(5):
content = html.xpath('//*[@id="BorlabsCookieBox"]')
if content:
data = etree.tostring(content[0], encoding="utf-8").decode("utf-8")
for i in re.findall(r'(<.*?>)', data):
page = page.replace(i, "")
print(data)
text = html.xpath('//*[@id="BorlabsCookieBox"]//text')
if text:
for i in text:
page = page.replace(i, "")
print(data)
'''