Web Scraping

Getting the cookie (ck)

Log in automatically with Selenium and capture the cookie (ck)

import time
import openpyxl
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

'''
# Start the local browser with the remote-debugging port open
cmd:
    start chrome  --flag-switches-begin --flag-switches-end --remote-debugging-port=9887
    netstat -ano |findstr 9887   # check that the port is listening
    tasklist |findstr 98616
cmd = 'start chrome --flag-switches-begin --flag-switches-end --remote-debugging-port=9887'
subprocess.Popen(cmd, shell=True)
'''

time.sleep(1)  # safety pause
chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9887")

# driver is the already-open browser window we attached to
driver = webdriver.Chrome(options=chrome_options)
time.sleep(2)

driver.get('链接')  # replace with the URL of the site you want to log in to
time.sleep(5)

# Check whether a fresh login is needed; the literal below is the site's
# "login session timed out, please log in again" banner, matched verbatim
if "登录会话超时,请重新登录" in driver.page_source:
    # Re-run the login flow
    driver.get('链接')  # replace with the URL of the site you want to log in to

    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.ID, "formsubmitButton")))

    # If the login must be done manually, pause here until the user finishes, e.g.:
    # input('Finish logging in, then press Enter to continue...')

    # Locate the username and password inputs and fill in the credentials
    # username_input = driver.find_element(By.XPATH, '//*[@id="username"]')
    password_input = driver.find_element(By.XPATH, '//*[@id="password"]')

    # username_input.send_keys('')
    password_input.send_keys('')

    # input('Finish logging in, then press Enter to continue...')

    # Pause two seconds before submitting
    time.sleep(2)
    # Find the login button and click it
    login_button = driver.find_element(By.XPATH, '//*[@id="formsubmitButton"]')
    login_button.click()

    # Sleep two seconds; the login flow is finished
    time.sleep(2)

driver.get('链接')  # replace with the URL of the site you want to log in to
time.sleep(5)

# Wait for the page to finish rendering
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.ID, "aside-wrap-menu-top")))
time.sleep(5)

# Reload once more as a safeguard
driver.get('链接')  # replace with the URL of the site you want to log in to
time.sleep(5)

cookies = driver.get_cookies()

# I like to save the cookie locally for reuse; adapt the part below as needed
loginCookies = {}
for cookie in cookies:
    name = cookie['name']
    value = cookie['value']
    if value is not None:
        loginCookies[name] = value
# print(loginCookies)

# Serialize the cookies into a single "'key':'value','key2':'value2'" string
loginCookiesStr = ','.join(f"'{key}':'{value}'" for key, value in loginCookies.items())

# Append the timestamp and cookie string to the spreadsheet
workbook = openpyxl.load_workbook('ck.xlsx')
sheet = workbook['dmsc']
last_row = sheet.max_row + 1
sheet.cell(last_row, 1).value = pd.Timestamp.now()
sheet.cell(last_row, 2).value = loginCookiesStr
workbook.save('ck.xlsx')
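
To reuse a saved cookie later, here is a minimal sketch of my own (the helper name is made up, and it assumes the ck.xlsx layout above: timestamp in column 1, cookie string in column 2 of the 'dmsc' sheet, with no commas inside cookie values) that reads the newest row back into a dict for requests:

import openpyxl

def load_latest_cookie(path='ck.xlsx', sheet_name='dmsc'):
    # Grab the cookie string from the most recently appended row
    sheet = openpyxl.load_workbook(path)[sheet_name]
    cookie_str = sheet.cell(sheet.max_row, 2).value  # "'k':'v','k2':'v2'"
    cookie = {}
    for pair in cookie_str.split(','):       # assumes no commas inside values
        key, value = pair.split(':', 1)
        cookie[key.strip("'")] = value.strip("'")
    return cookie

# cookie = load_latest_cookie()  # pass as cookies=cookie in the requests calls below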

When bot logins are blocked

Start a local remote-debugging listener from cmd first

# Start the local browser with the remote-debugging port open
cmd:
    start chrome  --flag-switches-begin --flag-switches-end --remote-debugging-port=9887
    netstat -ano |findstr 9887   # check that the port is listening
    tasklist |findstr 98616
Python side (early on this just stayed running; wrapping and hiding it came later with the rest of the automation):
import subprocess
cmd = 'start chrome --flag-switches-begin --flag-switches-end --remote-debugging-port=9887'
subprocess.Popen(cmd, shell=True)
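
Before attaching with Selenium, a quick check (plain sockets, nothing site-specific; port and retry count match the setup above) that the debugging port actually came up:

import socket, time

# Poll 127.0.0.1:9887 until the remote-debugging port accepts connections
for _ in range(10):
    try:
        socket.create_connection(('127.0.0.1', 9887), timeout=1).close()
        break
    except OSError:
        time.sleep(1)
else:
    raise RuntimeError('remote debugging port 9887 never opened')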

chrome_options = Options()
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9887")

# driver is the already-open browser window we attached to
driver = webdriver.Chrome(options=chrome_options)
time.sleep(2)
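
Once attached, a one-line sanity check (assuming the tab you opened by hand is the active one) confirms the driver controls the right window:

# Should print the URL and title of the manually opened tab
print(driver.current_url, driver.title)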

Requesting and downloading files

Downloading a Doc file

import requests

cookie = {}    # the cookie dict captured above
url = ""
header = {
}
form_data = {
}
filePath = ""  # local path to save the download
response = requests.post(url, headers=header, cookies=cookie, data=form_data)
with open(filePath, "wb") as file:
    file.write(response.content)
print("Download finished")

Download variant 2: CSV

import io
import requests
import pandas as pd

cookie = {}    # the cookie dict captured above
url = ""
header = {
}
form_data = {
}
filePath = ""  # local path to save the CSV
response = requests.post(url, headers=header, cookies=cookie, data=form_data)
response.encoding = 'GBK'
if response.status_code == 200:
    # Parse the CSV payload
    data = pd.read_csv(io.StringIO(response.text))
    # Save it locally
    data.to_csv(filePath, index=False, encoding='GBK')
else:
    print(f'Request failed with status code {response.status_code}')

Anti-scraping countermeasures

posted @ 2023-07-18 13:18  Teriteri