一、简介
当要爬取的网站需要登录,尤其是登录时带有复杂验证码的情况,一般需要借助第三方解码平台来解析验证码,实现起来比较困难。但如果程序不是放在服务器上一直运行,而只是在需要的时候手动跑一下,就可以先手动登录获取cookie,然后在爬取网页时带上这个cookie,从而绕过登录这一步骤。
二、不需要验证码的登录
这种不需要验证码的登录比较简单,直接从登录接口模拟登录即可
import requests
# Simulated login. This approach only suits simple logins that need just a
# username and password; it does not suit logins that require a captcha.
session = requests.session()

# Login endpoint and its request headers (fill in for the target site).
login_url = ''
login_headers = {}
# The form fields required for login must be looked up in the site's login request.
login_data = {'username': 'your_username', 'password': 'your_password'}
login_response = session.post(login_url, data=login_data, headers=login_headers)

# After logging in, fetch the data through the same session object.
target_url = ''
target_headers = {}
target_params = {}
target_response = session.get(target_url, params=target_params, headers=target_headers)
三、需要验证码的登录
如果登录时还需要验证码或图片滑动验证,模拟登录就会非常复杂。此时最好绕过登录过程:先手动登录并保存cookie,之后爬取网页时带上该cookie,就可以跳过登录直接访问目标url了。
3.1、selenium爬虫绕过登录
import time
from selenium.webdriver.common.action_chains import ActionChains
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.edge.options import Options
import json
import pandas as pd
# Module-level Edge WebDriver instance; the browser is started as soon as this
# line runs, and the script's entry point passes this driver to the scraper.
edge_driver = webdriver.Edge()
def save_cookie(driver, wait_seconds=20):
    """Log in to 51job by hand and save the browser's cookies to disk.

    Opens the login page, pauses so the login (captcha included) can be
    completed manually in the browser window, then writes every cookie
    returned by ``driver.get_cookies()`` to ``../file/job51/cookie.txt``
    as a JSON list.

    Args:
        driver: Selenium WebDriver used to open the login page.
        wait_seconds: Seconds to pause for the manual login before the
            cookies are read.
    """
    login_url = 'https://login.51job.com/login.php?loginway=0&isjump=0&lang=c&from_domain=i&url=https%3A%2F%2Fwe.51job.com%2Fpc%2Fmy%2Fmyjob&zhidinginfo='
    driver.get(login_url)
    driver.maximize_window()
    # The login itself is performed manually during this pause.
    time.sleep(wait_seconds)
    # driver.get_cookies() returns a list of dicts with keys such as 'domain',
    # 'expiry', 'httpOnly', 'name', 'path', 'sameSite', 'secure' and 'value'.
    # The whole list is stored; a single cookie is usually not enough to
    # restore a logged-in session.
    cookies = driver.get_cookies()
    with open('../file/job51/cookie.txt', 'w', encoding='utf-8') as fp:
        json.dump(cookies, fp)


def _element_texts(driver, xpath):
    """Return the text of every element matched by *xpath*."""
    return [element.text for element in driver.find_elements(By.XPATH, xpath)]


def selenium_load_cookie(driver, max_pages=3):
    """Restore the saved 51job cookies and scrape "python" job listings to CSV.

    Runs ``save_cookie`` on *driver*, reopens the login page, adds the saved
    cookies, refreshes, opens the "my jobs" page, searches for ``python``,
    switches to the newly opened result window and appends up to *max_pages*
    result pages to ``../file/job51.csv``.

    Args:
        driver: Selenium WebDriver to drive.
        max_pages: Upper bound on the number of result pages to scrape.

    Raises:
        selenium.common.exceptions.TimeoutException: If the search box, the
            search link, the result window or the job list does not appear
            within the wait timeout.
    """
    # Local import so this block does not depend on a change to the file's import list.
    from selenium.common.exceptions import InvalidCookieDomainException

    save_cookie(driver)
    wait = WebDriverWait(driver, 10)

    # 1. Visit the login page again so the browser is on the site before
    #    cookies are added.
    login_url = 'https://login.51job.com/login.php?loginway=0&isjump=0&lang=c&from_domain=i&url=https%3A%2F%2Fwe.51job.com%2Fpc%2Fmy%2Fmyjob&zhidinginfo='
    driver.get(login_url)
    driver.maximize_window()
    time.sleep(2)

    # 2. Read the saved cookies and load them into the driver.  Older cookie
    #    files hold a single cookie dict, newer ones a list; accept both.
    with open('../file/job51/cookie.txt', 'r', encoding='utf-8') as fp:
        saved = json.load(fp)
    cookies = saved if isinstance(saved, list) else [saved]
    for cookie in cookies:
        try:
            driver.add_cookie(cookie)
        except InvalidCookieDomainException:
            # A cookie scoped to a different host than the current page is
            # rejected by the browser; skip it and keep the rest.
            print(f"skipped cookie {cookie.get('name')!r} for domain {cookie.get('domain')!r}")

    # 3. Refresh the browser so the cookies take effect.
    driver.refresh()

    # 4. Open the target page and search for "python".
    target_url = 'https://we.51job.com/pc/my/myjob'
    driver.get(target_url)
    search_box = wait.until(EC.visibility_of_element_located(
        (By.XPATH, '//input[@placeholder="搜索职位名称/公司名称"]')))
    search_box.send_keys('python')
    wait.until(EC.element_to_be_clickable((By.LINK_TEXT, '搜索'))).click()

    # The search opens a new window; wait for it and switch to it.
    wait.until(lambda d: len(d.window_handles) > 1)
    driver.switch_to.window(driver.window_handles[-1])
    wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, '//div[@class="joblist-item"]')))

    # Total number of pages, read from the pager.  With no pager entries
    # (e.g. a single page of results) fall back to one page.
    page_numbers = [
        int(text)
        for text in _element_texts(driver, '//ul[@class="el-pager"]/li[@class="number"]')
        if text.isdigit()
    ]
    total_page = max(page_numbers, default=1)
    last_page = min(max_pages, total_page)

    for page in range(1, last_page + 1):
        # Pause one second before scraping each page.
        time.sleep(1)
        job_name = _element_texts(
            driver,
            '//div[@class="joblist-item"]//div[@class="joblist-item-top"]/span[@class="jname text-cut"]')
        company_salary = _element_texts(
            driver,
            '//div[@class="joblist-item"]//div[@class="joblist-item-top"]/span[@class="sal shrink-0"]')
        company_name = _element_texts(
            driver,
            '//div[@class="joblist-item"]//div[@class="joblist-item-bot"]//a[@class="cname text-cut"]')

        # pandas raises ValueError if the three lists differ in length.
        df = pd.DataFrame({
            '工作职位': job_name,
            '公司名称': company_name,
            '职位薪水': company_salary
        })
        # Append to the CSV; only the first page writes the header row.
        df.to_csv('../file/job51.csv', index=False, header=(page == 1), mode='a', encoding='GBK')

        # Go to the next page, unless this was the last page to scrape.
        if page < last_page:
            driver.find_element(By.XPATH, '//button[@class="btn-next"]').click()
if __name__ == '__main__':
    try:
        selenium_load_cookie(edge_driver)
        print('程序结束')
    finally:
        # Always close the browser and stop the driver process, even when
        # scraping fails part-way through.
        edge_driver.quit()
3.2、requests爬虫绕过登录
使用 requests 爬虫绕过登录时,需要用 RequestsCookieJar 来携带 selenium 保存下来的 cookie。
import requests
from requests.cookies import RequestsCookieJar
import time
import json
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.chrome.service import Service
# Chrome browser (alternative setup, left disabled)
# chrome_driver_path = ''
# service = Service(chrome_driver_path)
# chrome_driver = webdriver.Chrome(service=service)

# Edge browser.  The file imports ``Service`` from both the edge and the
# chrome packages, and the later (chrome) import is what the bare name
# ``Service`` refers to.  Import the Edge class under its own name so the Edge
# driver is started with the Edge service.
from selenium.webdriver.edge.service import Service as EdgeService

edge_driver_path = './edge_chrome_driver/msedgedriver.exe'
service = EdgeService(edge_driver_path)
edge_driver = webdriver.Edge(service=service)
def save_cookie(driver):
    """Open the maimai login page, wait for a manual login, then save the cookies.

    Args:
        driver: Selenium WebDriver used to open the login page and read the
            cookies from.
    """
    driver.get('https://maimai.cn/platform/login')
    driver.maximize_window()
    # 30-second pause: the login is completed by hand in the browser window.
    time.sleep(30)
    # driver.get_cookies() returns a list of cookie dicts (keys such as
    # 'domain', 'expiry', 'httpOnly', 'name', 'path', 'sameSite', 'secure',
    # 'value'); the whole list is written to the file as JSON.
    with open('../file/maimai/cookie.txt', 'w', encoding='utf-8') as cookie_file:
        cookie_file.write(json.dumps(driver.get_cookies()))
def load_cookie():
    """Build a RequestsCookieJar from the cookies saved in ../file/maimai/cookie.txt.

    Returns:
        A RequestsCookieJar holding the name, value, domain and path of every
        cookie found in the file.
    """
    with open('../file/maimai/cookie.txt', 'r', encoding='utf-8') as cookie_file:
        saved_cookies = json.load(cookie_file)

    jar = RequestsCookieJar()
    for entry in saved_cookies:
        jar.set(entry['name'], entry['value'], domain=entry['domain'], path=entry['path'])
    return jar
def spider_page():
    """Fetch the maimai gossip list with the saved login cookies and print the result.

    Loads the cookie jar via ``load_cookie``, attaches it to a requests
    session, requests ``https://maimai.cn/gossip_list`` and prints the final
    response URL followed by the response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            the timeout.
    """
    target_url = 'https://maimai.cn/gossip_list'
    target_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }
    # Query-string parameters for the target page; empty by default.
    target_params = {}

    # The context manager closes the session when the request is done.
    with requests.session() as session:
        session.cookies = load_cookie()
        # Without a timeout the request could block indefinitely.
        target_response = session.get(url=target_url, headers=target_headers,
                                      params=target_params, timeout=10)

    # The final URL differs from target_url if the request was redirected.
    print(target_response.url)
    print(target_response.text)
if __name__ == '__main__':
    try:
        # Step 1: log in manually in the browser and save the cookies.
        save_cookie(edge_driver)
    finally:
        # Always close the browser and stop the driver process once the
        # cookies are saved, or if saving fails.
        edge_driver.quit()
    # Step 2 (left disabled): load the saved cookies and fetch the page.
    # cookie = load_cookie()
    # print(cookie)
    # spider_page()