from selenium import webdriver
import time
import re
import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
import datetime
import random
# Pause for 5 seconds as soon as the module is loaded, before any function
# is defined or the browser is started.
# NOTE(review): nothing in this file shows why the delay is needed — confirm
# whether it can be removed.
time.sleep(5)
def login(user_data):
    """Log in through the Baidu CAS page and open the ad preview tool.

    Drives the module-level ``driver``: loads the CAS login page, switches
    to the account/password form, types the credentials, submits the form
    and finally navigates to the ``adpreviewAndDiagnose`` page on
    fengchao.baidu.com for the user id found in the post-login URL.

    Args:
        user_data: Table-like object with '用户名' (user name) and '密码'
            (password) columns; only row 0 of each column is used.

    Raises:
        RuntimeError: If the URL reached after submitting the form contains
            no ``userid=`` part, i.e. the user id cannot be determined.
    """
    user = user_data['用户名'][0]
    password = user_data['密码'][0]

    # Open the login page.
    driver.get(
        "https://cas.baidu.com/?tpl=www2&fromu=http%3A%2F%2Fwww2.baidu.com%2Fcommon%2Fappinit.ajax")

    # Switch to account/password login.
    loginheader = driver.find_element_by_xpath('//a[@id="choose-uc-login"]')
    time.sleep(random.uniform(2, 3))
    loginheader.click()

    # Type the account name.
    entered_login = driver.find_element_by_xpath(
        '//form[@id="uc-login"]//input[@id="uc-common-account"]')
    time.sleep(random.uniform(2, 3))
    entered_login.send_keys(user)

    # Type the password.
    password_input = driver.find_element_by_xpath(
        '//form[@id="uc-login"]//input[@id="ucsl-password-edit"]')
    time.sleep(random.uniform(2, 3))
    password_input.send_keys(password)

    # Click "log in". After every click the verification-code dialog is
    # closed and the click is repeated; the loop ends as soon as any step
    # fails. NOTE(review): presumably that happens because the page has
    # navigated away after a successful login — confirm.
    while True:
        try:
            login_click = driver.find_element_by_xpath(
                '//form[@id="uc-login"]//input[@id="submit-form"]')
            login_click.click()
            time.sleep(random.uniform(5, 7))
            driver.find_element_by_xpath('//div[@class="vcode-close"]').click()
        except Exception:
            # ``Exception`` rather than ``BaseException`` so that Ctrl-C and
            # interpreter shutdown are not swallowed by the retry loop.
            time.sleep(random.uniform(3, 5))
            break

    landing_url = driver.current_url
    url_parts = landing_url.split('userid=')
    if len(url_parts) < 2:
        raise RuntimeError(
            "login did not reach a URL containing 'userid=': {}".format(landing_url))
    driver.get(
        "https://fengchao.baidu.com/fc/toolscenter/optimize/adpreviewAndDiagnose/user/{}/type/adpreview".format(url_parts[1]))
# String sem() waits for in the mobile preview frame before scraping it.
_MOBILE_READY_MARKER = 'searchboxtop-bg-fade'
# Seconds slept between two polls while waiting for the preview frames.
_POLL_INTERVAL = 0.05
# Number of polls for the second preview iframe (0.05 steps up to 20).
_IFRAME_POLL_ATTEMPTS = 400
# Upper bound, in seconds, on the wait for the mobile preview marker.
_MOBILE_READY_TIMEOUT = 60
# Matches HTML tags and stray '>' characters left over in captured text.
_TAG_PATTERN = '<.*?>|>'


def _locate_search_controls():
    """Return the keyword input box and the search button of the tool page."""
    keyword_box = driver.find_element_by_xpath(
        '//input[@class="one-search-box one-search-box-medium"]')
    search_button = driver.find_elements_by_xpath(
        '//button[@class="one-button one-search-box-icon-search-btn one-button-primary one-button-medium"]')[0]
    return keyword_box, search_button


def _select_region(region_name):
    """Open the region picker, type *region_name* and click the first menu item."""
    driver.find_elements_by_xpath('//div[@class="one-input-detail"]')[0].click()
    region_box = driver.find_elements_by_xpath(
        '//input[@class="one-input one-input-medium"]')[0]
    region_box.send_keys(region_name)
    driver.find_element_by_xpath('//li[@class="one-cascader-menu-item"]').click()


def _frame_source(index):
    """Return the page source of the *index*-th iframe, then leave the frame."""
    frame = driver.find_elements_by_tag_name("iframe")[index]
    driver.switch_to.frame(frame)
    try:
        return driver.page_source
    finally:
        driver.switch_to.default_content()


def _wait_for_previews():
    """Wait for the second iframe to exist and the mobile frame to show its marker."""
    for _ in range(_IFRAME_POLL_ATTEMPTS):
        try:
            driver.find_elements_by_tag_name("iframe")[1]
        except Exception:
            time.sleep(_POLL_INTERVAL)
        else:
            break

    # Bounded wait: without a deadline a page that never shows the marker
    # would block the whole run forever.
    deadline = time.monotonic() + _MOBILE_READY_TIMEOUT
    while _MOBILE_READY_MARKER not in _frame_source(0):
        if time.monotonic() >= deadline:
            print('Timed out waiting for the mobile preview; scraping it as-is')
            break
        time.sleep(_POLL_INTERVAL)


def _resolve_landing_url(ad_url):
    """Open *ad_url* in a new window and return the URL it ends up on.

    Returns None when no new window appeared, so that the main window is
    never closed by mistake.
    """
    origin = driver.current_window_handle
    known_handles = set(driver.window_handles)
    # Pass the URL as a script argument instead of splicing it into the
    # JavaScript source, so quotes or backslashes in it cannot break the call.
    driver.execute_script('window.open(arguments[0]);', ad_url)
    time.sleep(3)
    opened = [h for h in driver.window_handles if h not in known_handles]
    if not opened:
        return None
    driver.switch_to.window(opened[-1])
    try:
        return driver.current_url
    finally:
        driver.close()
        driver.switch_to.window(origin)


def _parse_mobile_ads(html):
    """Extract (competitor fragment, url, title, description) tuples from the mobile frame."""
    fragments = re.findall(
        r'c-color-source c-showurl c-flexbox(.*?)_3anzn1e', html, re.DOTALL)
    urls = re.findall(
        r'<a class="c-blocka ec_title _2gavtw1" ctid="20108" href="(.*?)"',
        html, re.DOTALL)
    titles = re.findall(r'class="c-line-clamp2"(.*?)</div>', html, re.DOTALL)
    descriptions = re.findall(
        r'ec_desc _63stz74".*?<span>(.*?)</span>', html, re.DOTALL)
    return list(zip(fragments, urls, titles, descriptions))


def _parse_pc_ads(html):
    """Extract (competitor fragment, url, title, description) tuples from the PC frame."""
    # The first 'ec-showurl-line' match is skipped, as in the original code.
    fragments = re.findall(
        r'ec-showurl-line(.*?)_3wnyfua', html, re.DOTALL)[1:]
    urls = re.findall(
        r'><div class="wbrjf67"><a href="(.*?)"', html, re.DOTALL)
    titles = re.findall(
        r'class="wbrjf67".*?draggable="false"(.*?)</a', html, re.DOTALL)
    descriptions = re.findall(r'class="ec_desc"(.*?)</span>', html, re.DOTALL)
    return list(zip(fragments, urls, titles, descriptions))


def _collect_ads(ads, region, keyword, device):
    """Turn parsed ad tuples into result rows, resolving each landing URL."""
    rows = []
    for rank, (competitor, ad_url, title, description) in enumerate(ads, start=1):
        title = re.sub(_TAG_PATTERN, '', title)
        description = re.sub(_TAG_PATTERN, '', description)
        landing_url = _resolve_landing_url(ad_url)
        # Long fragments still contain markup: keep only the texts of the
        # <span> elements that consist purely of Chinese characters.
        if len(competitor) > 20:
            competitor = re.findall('<span>([\u4e00-\u9fa5]*?)</span>', competitor)
        row = {
            "地域": region,
            "关键词": keyword,
            "竞品": competitor,
            "排名": rank,
            "设备": device,
            "时间段": datetime.datetime.now().hour,
            "url": landing_url,
            "标题": title,
            "描述": description
        }
        rows.append(row)
        print(row)
    return rows


def sem(data, i, user_data):
    """Search every keyword for one region and scrape the previewed ads.

    Uses the module-level ``driver``, which must already show the ad
    preview tool. For each keyword the mobile (first iframe) and PC
    (second iframe) previews are scraped with regular expressions, and
    every ad link is opened in a temporary window to read its final URL.

    Args:
        data: Table-like object whose '关键词' column lists the keywords.
        i: Region name typed into the region picker; stored in each row
            under '地域'.
        user_data: Credentials table passed to ``login`` when the keyword
            box has gone stale and the session has to be re-established.

    Returns:
        pandas.DataFrame with one row per scraped ad (columns 地域, 关键词,
        竞品, 排名, 设备, 时间段, url, 标题, 描述); empty if nothing was found.
    """
    lis = []
    time.sleep(3)
    try:
        js = "var aa=document.getElementById('hm-circular');aa.parentNode.removeChild(aa)"
        driver.execute_script(js)
    except Exception:
        # Best effort: if the script fails (e.g. the element is not on the
        # page) the search can still proceed.
        pass

    input_keyword, search = _locate_search_controls()
    _select_region(i)
    time.sleep(1)

    for keyword in data['关键词']:
        while True:
            try:
                # Clearing the box is the first interaction with the element,
                # so it must sit inside the stale-element guard as well.
                input_keyword.send_keys(Keys.CONTROL + 'a')
                input_keyword.send_keys(Keys.BACKSPACE)
                time.sleep(random.uniform(1, 3))
                input_keyword.send_keys(keyword)
                break
            except StaleElementReferenceException:
                # The page was replaced: log in again, re-locate the controls
                # and re-select the region; the retry then clears the box and
                # types the keyword exactly once.
                time.sleep(10)
                login(user_data)
                time.sleep(1)
                input_keyword, search = _locate_search_controls()
                _select_region(i)
        time.sleep(1)
        search.click()

        _wait_for_previews()
        # Read the PC preview (second iframe), then the mobile one (first).
        pc_keywords = _frame_source(1)
        yd_keywords = _frame_source(0)

        lis.extend(_collect_ads(_parse_mobile_ads(yd_keywords), i, keyword, "yd"))
        lis.extend(_collect_ads(_parse_pc_ads(pc_keywords), i, keyword, "pc"))
        driver.switch_to.default_content()

    return pd.DataFrame(lis)
if __name__ == '__main__':
    # Script entry point: read the parameter workbook, log in, query every
    # region and write all scraped rows to one Excel file.
    driver_path = r'geckodriver.exe'
    # ``driver`` has to stay a module-level name: login() and sem() read it
    # as a global.
    driver = webdriver.Firefox(executable_path=driver_path)
    try:
        # Third sheet: credentials; second sheet: regions; first sheet: keywords.
        user_data = pd.read_excel('参数.xlsx', sheet_name=2)
        citys = pd.read_excel('参数.xlsx', sheet_name=1)
        data = pd.read_excel('参数.xlsx')
        login(user_data)
        dfs = []
        for i in citys['地域']:
            print('正在查询{}地区关键词'.format(i))
            dfs.append(sem(data, i, user_data))
            # Random pause between two regions.
            time.sleep(random.uniform(10, 15))
        # pd.concat() raises ValueError on an empty list, which happens when
        # the region sheet has no rows.
        result = pd.concat(dfs) if dfs else pd.DataFrame()
        result.to_excel('生成.xlsx')
        input('运行完毕')
    finally:
        # Always shut the browser and its driver process down, including
        # when a step above raised.
        driver.quit()