# coding:utf-8
import time
import random
from time import sleep
from csv import writer
from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
# Shared Chrome WebDriver session used by every function in this script.
# NOTE: this is a top-level statement, so a browser is started as soon as the
# module is executed or imported.
driver = webdriver.Chrome()
# Open the search page for one person and authenticate with stored cookies.
def open_web(search_name):
    """Load the search-result page for ``search_name`` in the shared browser.

    The page is opened once, the hard-coded session cookies are added to the
    browser, and the page is refreshed so the request is repeated with the
    cookies attached. Fixed and randomised pauses are inserted between steps.

    Args:
        search_name: Person name substituted into the ``psnname`` query
            parameter of the URL.
    """
    search_url = "https://www.izaiwen.cn/pro/sonE-stwE08?psnname={}".format(search_name)
    driver.get(search_url)
    time.sleep(6)

    # Session cookies (placeholder values). They could instead be loaded from
    # a "cookies.json" file with json.load, as an earlier revision did.
    cookie_values = {
        'HMACCOUNT': 'xxxxxxxxx',
        'heiheihei': 'xxxxxxxxx',
        'hahaha': 'xxxxxxx',
        '_': 'xxxxxxxxxxx',
        'acw': 'xxxxxxxxxxxx',
        'cer': 'xxxxxxxxxx',
        'ession': 'xxxxxxxxxx',
        'userId': 'xxxxx',
        'uuid': 'xxxxxxxxxxxxxxxxxxx',
    }
    for cookie_name, cookie_value in cookie_values.items():
        driver.add_cookie({'name': cookie_name, 'value': cookie_value})

    time.sleep(5)
    # Reload so the page is requested again with the cookies set.
    driver.refresh()
    # Random 5-9 second pause after the reload.
    time.sleep(random.randrange(5, 10))
# Detect whether the browser is currently on the captcha page.
def check_condition():
    """Tell whether the current page is the security-verification page.

    The page title is printed and compared with the captcha page's title.

    Returns:
        'f' if the title equals '请完成安全验证' (captcha page), otherwise 't'.
    """
    # driver.title is used on purpose: the author noted that reading
    # '/html/head/title' through find_element(...).text only gave an empty
    # string.
    page_title = driver.title
    print(page_title)
    return 'f' if page_title == '请完成安全验证' else 't'
# Locate the result cards by XPath and save their text.
def get_information():
    """Scrape every result card on the current page and append it to the CSV.

    For each card, the text of all of its "layui-col-xs4" cells is
    concatenated, each followed by a comma, and one row ``[name, info]`` is
    appended to "信息.csv".

    NOTE: ``name`` is not a parameter; it is read from module scope (the main
    loop's current person name), so this function only works while that
    global is set.
    """
    cards = driver.find_elements(By.XPATH, './/div[@class="item-box layui-card "]')
    for card in cards:
        cells = card.find_elements(By.XPATH, './/div[@class="layui-col-xs4"]')
        print(name)
        # Every cell's text is followed by a comma, including the last one.
        info = "".join(cell.text + ',' for cell in cells)
        print(info)
        # Append one row per card; the file is reopened for each row.
        with open("信息.csv", "a", newline="") as csv_file:
            writer(csv_file).writerow([name, info])
    time.sleep(5)
# Solve the captcha automatically by submitting it to the Chaojiying
# captcha-recognition platform and clicking the returned position.
def anti_anti_spider():
    """Screenshot the captcha, have Chaojiying locate the click point, click it.

    The captcha element is saved to 'D:/SeleniumX/yzm.png', uploaded with
    captcha type 9101, and the returned "x,y" position (relative to the
    screenshot's top-left corner) is clicked through an action chain.

    Raises:
        ValueError: If the platform's 'pic_str' does not contain a usable
            "x,y" coordinate pair (for example when recognition failed and
            the string is empty).
    """
    screenshot_path = 'D:/SeleniumX/yzm.png'

    # Find the element that contains the captcha and screenshot just that
    # element.
    img = driver.find_element(By.XPATH, './/div[@id="aliyunCaptcha-window-embed"]')
    img.screenshot(screenshot_path)

    # Newer Selenium versions measure the click offset from the element's
    # centre, so half the element size is subtracted below to make the
    # returned coordinates count from the top-left corner.
    img_half_width = float(img.rect['width']) / 2
    img_half_height = float(img.rect['height']) / 2

    # The Chaojiying client module must be downloaded from its website and
    # placed next to this file. Arguments: account, password, software ID.
    chaojiying = Chaojiying_Client('', '', '')

    # Read the screenshot back with a context manager so the file handle is
    # closed promptly.
    with open(screenshot_path, 'rb') as image_file:
        im = image_file.read()
    yzm_result = chaojiying.PostPic(im, 9101)['pic_str']
    time.sleep(10)
    print(yzm_result)

    # Only one click point is expected; if several are returned separated by
    # "|", the first one is used.
    first_point = yzm_result.split('|')[0].split(',')
    if len(first_point) < 2:
        raise ValueError(
            "captcha platform returned no usable coordinates: {!r}".format(yzm_result)
        )
    try:
        x = float(first_point[0])  # x coordinate
        y = float(first_point[1])  # y coordinate
    except ValueError as err:
        raise ValueError(
            "captcha platform returned no usable coordinates: {!r}".format(yzm_result)
        ) from err

    # Simulate the click with an action chain.
    action = ActionChains(driver)
    action.move_to_element_with_offset(
        img, x - img_half_width, y - img_half_height
    ).click().perform()
    time.sleep(10)
# Main program: visit each person's page, solve the captcha if one appears,
# then scrape and save the results.
list_name = []  # names of the people to scrape; each is used to build the page URL
try:
    # The loop variable must remain the module-level name `name`:
    # get_information() reads it as a global.
    for name in list_name:
        open_web(name)  # open this person's page
        flag = check_condition()  # 'f' means the captcha page was triggered
        print(flag)
        if flag == 'f':
            # Captcha triggered: wait, recognise and click it, then wait for
            # the real page before scraping.
            time.sleep(30)
            anti_anti_spider()
            time.sleep(15)
        else:
            time.sleep(5)
        get_information()
        # Random 10-29 second pause between people.
        time.sleep(random.randrange(10, 30))
        print(name)
finally:
    # Always shut down the browser and its driver process, even when a step
    # above raises; otherwise they are left running after the script ends.
    driver.quit()