selenium模拟登陆百度指数,爬取相应关键词百度指数
百度指数总的来说，反爬机制还是比较厉害的。首先，需要登录才能搜索关键词；而模拟登录时，如果输入账号密码太快，会触发反爬机制的图形验证码；短时间内登录次数太多，账号及 IP 会被放入灰名单，再次登录需要手机短信验证。这样做的好处是：对于非爬虫的普通用户，体验不会太差；而对于疑似爬虫的访问，反爬等级会不断提高，难以大规模爬取数据。代码如下，如有优化建议，感激不尽：
# coding=utf-8
"""
Log in to Baidu Index with Selenium and scrape the overall daily average
shown for each keyword in a list.

author = jamon
"""
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


def sleep(num):
    """Pause for ``num`` seconds (thin wrapper around ``time.sleep``)."""
    time.sleep(num)


def login(url, user, passwd, headless=False):
    """Open Chrome, load ``url`` and submit the Baidu login form.

    :param url: page to open; the login dialog is triggered from this page
    :param user: Baidu account name
    :param passwd: Baidu account password
    :param headless: when true, start Chrome with ``--headless`` and
        ``--disable-gpu`` (defaults to a visible browser window)
    :return: the Chrome WebDriver instance, left open for the caller, who is
        responsible for calling ``quit()`` on it
    :raises WebDriverException: if a required login element cannot be found
        or driven; the browser is shut down before the error propagates
    """
    options = ChromeOptions()
    # Presumably makes the session look less like an automated browser.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    if headless:
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')

    # NOTE(review): ``executable_path`` is kept from the original script;
    # recent Selenium releases expect a ``Service`` object instead -- confirm
    # against the installed Selenium version.
    bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)

    try:
        bro.get(url)
        bro.find_element(By.CLASS_NAME, 'username-text').click()
        sleep(3)  # give the login dialog time to render
        bro.find_element(By.CLASS_NAME, 'pass-text-input-userName').send_keys(user)
        bro.find_element(By.CLASS_NAME, 'pass-text-input-password').send_keys(passwd)
        bro.find_element(By.ID, 'TANGRAM__PSP_4__submit').click()
        sleep(3)

        # Best effort: if a verification-code popup appeared, close it and
        # submit again. The popup normally is absent, so a failed lookup or
        # click here is expected and deliberately ignored.
        try:
            bro.find_element(By.CLASS_NAME, 'vcode-close').click()
            sleep(2)
            bro.find_element(By.ID, 'TANGRAM__PSP_4__submit').click()
        except WebDriverException:
            pass
    except Exception:
        # Do not leave an orphaned browser/chromedriver behind on failure.
        bro.quit()
        raise
    return bro


def baidu_main(url, ci_list, user, passwd):
    """Scrape the overall daily average of each keyword from Baidu Index.

    :param url: Baidu Index address
    :param ci_list: iterable of keywords to search for
    :param user: Baidu account name
    :param passwd: Baidu account password
    :return: dict mapping each keyword to the text of its overall daily
        average cell
    """
    bro = login(url, user, passwd)
    averages = dict()
    try:
        sleep(5)
        for ci in ci_list:
            # Reload the start page so every keyword begins from a clean
            # search box.
            bro.get(url)
            sleep(3)

            bro.find_element(By.CLASS_NAME, 'search-input').send_keys(ci)
            bro.find_element(By.CLASS_NAME, 'search-input-cancle').click()
            sleep(3)
            average = bro.find_element(
                By.XPATH, "//tr/td[2]/div[@class='veui-table-cell']"
            ).text
            print("{}关键词的整体日均值:{}".format(ci, average))
            averages[ci] = average
    finally:
        # quit() ends the whole WebDriver session (all windows plus the
        # chromedriver process); close() would only close the current window.
        bro.quit()
    return averages


if __name__ == '__main__':
    ci_list = ["复旦大学", "清华大学", "南昌大学", "北京大学"]
    baidu_url = "http://index.baidu.com/v2/index.html#/"
    user = input("请输入百度账号:")
    passwd = input("请输入百度密码:")
    age = baidu_main(baidu_url, ci_list, user, passwd)
    print(age)
浙公网安备 33010602011771号