通过selenium获取百度推广后台的竞价广告排名

from selenium import webdriver
import time
import re
import pandas as pd
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
import datetime
import random


# Import-time pause: blocks for five seconds before the definitions below are
# processed. NOTE(review): nothing in this file shows why the delay is
# needed -- confirm it is still required before keeping it.
time.sleep(5)


def login(user_data):
    """Log in to the Baidu promotion back end and open the ad-preview tool.

    Drives the module-level ``driver``: loads the CAS login page, switches to
    account/password login, types the credentials, submits the form, and
    finally navigates to the ad preview/diagnose page of the logged-in user.

    Args:
        user_data: Table-like object; row ``0`` of its ``'用户名'`` column is
            used as the account name and row ``0`` of ``'密码'`` as the
            password.

    Raises:
        IndexError: If the URL reached after submitting contains no
            ``userid=`` part, so the preview-tool address cannot be built.
    """
    user = user_data['用户名'][0]
    password = user_data['密码'][0]

    # Open the login page.
    driver.get(
        "https://cas.baidu.com/?tpl=www2&fromu=http%3A%2F%2Fwww2.baidu.com%2Fcommon%2Fappinit.ajax")
    # Switch to account/password login.
    loginheader = driver.find_element_by_xpath('//a[@id="choose-uc-login"]')
    time.sleep(random.uniform(2, 3))
    loginheader.click()
    # Type the account name.
    entered_login = driver.find_element_by_xpath(
        '//form[@id="uc-login"]//input[@id="uc-common-account"]')
    time.sleep(random.uniform(2, 3))
    entered_login.send_keys(user)
    # Type the password.
    password_input = driver.find_element_by_xpath(
        '//form[@id="uc-login"]//input[@id="ucsl-password-edit"]')
    time.sleep(random.uniform(2, 3))
    password_input.send_keys(password)
    # Submit the form. Each pass clicks submit, waits, then clicks the
    # "vcode-close" element (presumably the verification-code popup --
    # confirm) and submits again. The loop ends as soon as any lookup or
    # click in the pass fails.
    while True:
        try:
            login_click = driver.find_element_by_xpath(
                '//form[@id="uc-login"]//input[@id="submit-form"]')
            login_click.click()
            time.sleep(random.uniform(5, 7))
            driver.find_element_by_xpath('//div[@class="vcode-close"]').click()
        except Exception:
            # Deliberately not BaseException: Ctrl-C / SystemExit must still
            # abort the script instead of being read as "login finished".
            time.sleep(random.uniform(3, 5))
            break

    current_url = driver.current_url
    url_parts = current_url.split('userid=')
    if len(url_parts) < 2:
        # Same exception type the bare index lookup would raise, but with
        # enough context to see where the browser actually ended up.
        raise IndexError(
            'no "userid=" in the URL reached after login: {}'.format(
                current_url))
    driver.get(
        "https://fengchao.baidu.com/fc/toolscenter/optimize/adpreviewAndDiagnose/user/{}/type/adpreview".format(url_parts[1]))


def _locate_search_controls(i):
    """Pick region *i* in the region selector and return the search controls.

    Returns:
        tuple: ``(input_keyword, search)`` -- the keyword input element and
        the search button element, both freshly looked up on the current page.
    """
    input_keyword = driver.find_element_by_xpath(
        '//input[@class="one-search-box one-search-box-medium"]')
    search = driver.find_elements_by_xpath(
        '//button[@class="one-button one-search-box-icon-search-btn one-button-primary one-button-medium"]')[0]
    diyu = driver.find_elements_by_xpath('//div[@class="one-input-detail"]')[0]
    diyu.click()
    region = driver.find_elements_by_xpath(
        '//input[@class="one-input one-input-medium"]')[0]
    region.send_keys(i)
    # Click the first cascader entry offered for the typed region text.
    region_sjz = driver.find_element_by_xpath(
        '//li[@class="one-cascader-menu-item"]')
    region_sjz.click()
    return input_keyword, search


def _iframe_source(index):
    """Return the page source of the iframe at *index*, then leave the frame."""
    frame = driver.find_elements_by_tag_name("iframe")[index]
    driver.switch_to.frame(frame)
    try:
        return driver.page_source
    finally:
        driver.switch_to.default_content()


def _resolve_final_url(url):
    """Open *url* in a new window and return the address it ends up on."""
    # Pass the URL as a script argument instead of formatting it into the
    # JavaScript source, so quotes or backslashes in it cannot break the call.
    driver.execute_script('window.open(arguments[0]);', url)
    time.sleep(3)
    windows = driver.window_handles
    driver.switch_to.window(windows[-1])
    final_url = driver.current_url
    driver.close()
    driver.switch_to.window(windows[0])
    return final_url


def _collect_ad_rows(device, names, urls, titles, descriptions, i, keyword):
    """Build one result row per ad, ranked by position in the inputs.

    The four sequences are zipped, so the shortest one decides how many rows
    are produced. Every row is printed as it is created.
    """
    msjx = '<.*?>|>'  # strips tags and any stray '>' left by the capture
    rows = []
    ads = zip(names, urls, titles, descriptions)
    for rank, (name, url, title, description) in enumerate(ads, start=1):
        title = re.sub(msjx, '', title)
        description = re.sub(msjx, '', description)
        final_url = _resolve_final_url(url)
        if len(name) > 20:
            # Long captures still contain markup: keep only the spans made of
            # Chinese characters (this turns the value into a list).
            name = re.findall('<span>([\u4e00-\u9fa5]*?)</span>', name)
        row = {
            "地域": i,
            "关键词": keyword,
            "竞品": name,
            "排名": rank,
            "设备": device,
            "时间段": datetime.datetime.now().hour,
            "url": final_url,
            "标题": title,
            "描述": description
        }
        rows.append(row)
        print(row)
    return rows


def sem(data, i, user_data):
    """Collect the ads shown for every keyword in one region.

    Uses the module-level ``driver``, which must already be on the ad-preview
    page. For each keyword the function runs a search, reads the mobile
    preview (first iframe) and the PC preview (second iframe), and extracts
    advertiser, landing URL, title and description of each ad with regular
    expressions.

    Args:
        data: Table-like object whose ``'关键词'`` column lists the keywords.
        i: Region text typed into the region selector; also stored in the
            ``'地域'`` column of the result.
        user_data: Credentials table handed to ``login`` when the keyword
            input has gone stale and a fresh login is needed.

    Returns:
        pandas.DataFrame: One row per ad, with columns ``地域``, ``关键词``,
        ``竞品``, ``排名``, ``设备`` (``"yd"`` for mobile, ``"pc"`` for
        desktop), ``时间段``, ``url``, ``标题`` and ``描述``.
    """
    lis = []
    time.sleep(3)
    try:
        js = "var aa=document.getElementById('hm-circular');aa.parentNode.removeChild(aa)"
        driver.execute_script(js)
    except Exception:
        # Best effort: the script fails when the element is not on the page.
        pass
    input_keyword, search = _locate_search_controls(i)
    time.sleep(1)
    for keyword in data['关键词']:
        # Clear whatever the previous search left in the box.
        input_keyword.send_keys(Keys.CONTROL + 'a')
        input_keyword.send_keys(Keys.BACKSPACE)
        time.sleep(random.uniform(1, 3))
        while True:
            try:
                input_keyword.send_keys(keyword)
                break
            except StaleElementReferenceException:
                # The page was replaced under us: log in again and look the
                # controls up afresh. The keyword is typed by the next pass
                # of this loop only; typing it here as well would enter it
                # twice.
                time.sleep(10)
                login(user_data)
                time.sleep(1)
                input_keyword, search = _locate_search_controls(i)
        time.sleep(1)
        search.click()

        # Give the second (PC) iframe roughly 400 polls to appear, then carry
        # on regardless.
        for _ in range(400):
            try:
                driver.find_elements_by_tag_name("iframe")[1]
            except Exception:
                time.sleep(0.05)
            else:
                break

        # Wait until the mobile preview contains its marker class.
        # NOTE(review): this wait has no timeout and blocks for as long as
        # the marker is missing -- confirm that is intended.
        while 'searchboxtop-bg-fade' not in _iframe_source(0):
            time.sleep(0.05)

        # PC preview first, then the mobile preview.
        pc_keywords = _iframe_source(1)
        yd_keywords = _iframe_source(0)

        yd_list = re.findall(
            r'c-color-source c-showurl c-flexbox(.*?)_3anzn1e',
            yd_keywords,
            re.DOTALL)
        ydurl = re.findall(
            r'<a class="c-blocka ec_title _2gavtw1" ctid="20108" href="(.*?)"',
            yd_keywords,
            re.DOTALL)
        ydbiaoti = re.findall(
            r'class="c-line-clamp2"(.*?)</div>',
            yd_keywords,
            re.DOTALL)
        yd_miaoshu = re.findall(
            r'ec_desc _63stz74".*?<span>(.*?)</span>',
            yd_keywords,
            re.DOTALL)

        # The first PC match is dropped before ranking.
        pc_list = re.findall(
            r'ec-showurl-line(.*?)_3wnyfua',
            pc_keywords,
            re.DOTALL)[1:]
        pcurl = re.findall(
            r'><div class="wbrjf67"><a href="(.*?)"',
            pc_keywords,
            re.DOTALL)
        pc_biaoti = re.findall(
            r'class="wbrjf67".*?draggable="false"(.*?)</a',
            pc_keywords,
            re.DOTALL)
        pc_miaoshu = re.findall(
            r'class="ec_desc"(.*?)</span>',
            pc_keywords,
            re.DOTALL)

        lis.extend(_collect_ad_rows(
            "yd", yd_list, ydurl, ydbiaoti, yd_miaoshu, i, keyword))
        lis.extend(_collect_ad_rows(
            "pc", pc_list, pcurl, pc_biaoti, pc_miaoshu, i, keyword))

        driver.switch_to.default_content()
    df = pd.DataFrame(lis)
    return df


if __name__ == '__main__':
    # Script entry point: read the parameter workbook, log in, query every
    # region, and write all collected rows to one spreadsheet.
    driver_path = r'geckodriver.exe'
    # `driver` must stay a module-level name: login() and sem() read it as a
    # global.
    driver = webdriver.Firefox(executable_path=driver_path)
    try:
        # Sheet index 2 supplies the credentials, index 1 the regions, and
        # the first sheet the keywords.
        user_data = pd.read_excel('参数.xlsx', sheet_name=2)
        citys = pd.read_excel('参数.xlsx', sheet_name=1)
        data = pd.read_excel('参数.xlsx')
        login(user_data)

        dfs = []
        for i in citys['地域']:
            print('正在查询{}地区关键词'.format(i))
            dfs.append(sem(data, i, user_data))
            # Random pause between regions.
            time.sleep(random.uniform(10, 15))
        # pd.concat raises ValueError on an empty list, so an empty region
        # sheet produces an empty output file instead of a crash.
        result = pd.concat(dfs) if dfs else pd.DataFrame()
        result.to_excel('生成.xlsx')
        input('运行完毕')
    finally:
        # Always end the WebDriver session so the browser and the geckodriver
        # process are not left running, even when a step above fails.
        driver.quit()
posted @ 2022-01-04 16:08  伟茂  阅读(252)  评论(0)    收藏  举报