Selenium 模拟登录百度指数，爬取相应关键词的百度指数

总的来说，百度指数的反爬机制还是比较厉害的。首先，需要登录才能搜索关键词；而模拟登录时，如果输入账号密码太快，会触发反爬机制的图形验证码；短时间内登录太多次，会把账号及 IP 放入灰名单，再次登录需要验证手机短信。这样做的好处是：如果不是爬虫，用户体验不会太差；而如果有可能是爬虫，反爬等级会不断提高，难以大规模爬取数据。代码如下，如有优化建议，感激不尽：

 1 # coding=utf-8
 2 """
 3 author = jamon
 4 """
 5 from selenium import webdriver
 6 from selenium.webdriver import ChromeOptions
 7 from selenium.webdriver.chrome.options import Options
 8 import time
 9 
10 
def sleep(num):
    """Block the calling thread for ``num`` seconds.

    Thin convenience wrapper around :func:`time.sleep`, used to pace the
    browser automation steps.
    """
    pause = time.sleep
    pause(num)
14 
def login(url, user, passwd):
    """Start Chrome, sign in with the given credentials and return the driver.

    The function loads ``url``, clicks the element that opens the login
    form, types the user name and password and submits the form. After
    submitting it makes one best-effort attempt to close a captcha dialog
    and submit again; WebDriver errors raised during that attempt are
    ignored.

    :param url:     page to load before logging in
    :param user:    account name typed into the user-name field
    :param passwd:  password typed into the password field
    :return:        the ``webdriver.Chrome`` instance; the caller owns it and
                    is responsible for quitting it
    :raises:        any exception raised by the login steps is re-raised
                    after the browser has been quit
    """
    # Function-scope imports keep this block self-contained; both modules
    # ship with the selenium package the file already depends on.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    # NOTE: for a headless run, add '--headless' and '--disable-gpu' with
    # options.add_argument(...) before creating the driver.
    options = ChromeOptions()
    # Exclude Chrome's "enable-automation" switch (presumably to make the
    # session look less like an automated one -- confirm against the site).
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # NOTE(review): `executable_path` is kept from the original; recent
    # Selenium releases expect a Service object instead -- confirm the
    # installed Selenium version.
    bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=options)

    try:
        bro.get(url)
        bro.find_element(By.CLASS_NAME, 'username-text').click()
        sleep(3)  # give the login form time to appear
        bro.find_element(By.CLASS_NAME, 'pass-text-input-userName').send_keys(user)
        bro.find_element(By.CLASS_NAME, 'pass-text-input-password').send_keys(passwd)
        bro.find_element(By.ID, 'TANGRAM__PSP_4__submit').click()
        sleep(3)

        # Best effort: if a captcha dialog showed up, close it and submit
        # again. Only WebDriver errors are ignored, so Ctrl-C and
        # programming errors are no longer silently swallowed.
        try:
            bro.find_element(By.CLASS_NAME, 'vcode-close').click()
            sleep(2)
            bro.find_element(By.ID, 'TANGRAM__PSP_4__submit').click()
        except WebDriverException:
            pass
    except BaseException:
        # Do not leave an orphaned browser behind when login fails or is
        # interrupted; the original error is re-raised unchanged.
        bro.quit()
        raise
    return bro
41 
42 
def baidu_main(url, ci_list, user, passwd):
    """Log in and collect the overall daily average for each keyword.

    For every keyword the page at ``url`` is reloaded, the keyword is typed
    into the search box, the search control is clicked and the text of the
    first matching result-table cell is read.

    :param url:      page to load for login and for every search
    :param ci_list:  iterable of keywords to look up
    :param user:     account name passed to ``login``
    :param passwd:   password passed to ``login``
    :return:         dict mapping each keyword to the text of its
                     daily-average cell
    :raises:         exceptions raised by ``login`` or by a failed element
                     lookup propagate; the browser is quit first
    """
    # Function-scope import keeps this block self-contained; the module
    # ships with the selenium package the file already depends on.
    from selenium.webdriver.common.by import By

    bro = login(url, user, passwd)
    averages = {}
    try:
        sleep(5)  # let the post-login page settle before searching
        for ci in ci_list:
            bro.get(url)
            sleep(3)

            bro.find_element(By.CLASS_NAME, 'search-input').send_keys(ci)
            # Class name kept exactly as in the original locator
            # (presumably the control that starts the search -- verify
            # against the page).
            bro.find_element(By.CLASS_NAME, 'search-input-cancle').click()
            sleep(3)
            average = bro.find_element(
                By.XPATH, "//tr/td[2]/div[@class='veui-table-cell']"
            ).text
            print("{}关键词的整体日均值:{}".format(ci, average))
            averages[ci] = average
    finally:
        # quit() ends the whole driver session, whereas close() only closes
        # the current window; running it in `finally` releases the browser
        # even when a lookup fails.
        bro.quit()
    return averages
69 
70 
if __name__ == '__main__':
    # Script entry point: ask for the credentials, look up a fixed list of
    # keywords and print the resulting keyword -> daily-average mapping.
    from getpass import getpass  # local import: only the script path needs it

    ci_list = ["复旦大学", "清华大学", "南昌大学", "北京大学"]
    baidu_url = "http://index.baidu.com/v2/index.html#/"
    user = input("请输入百度账号:")
    # Read the password without echoing it to the terminal.
    passwd = getpass("请输入百度密码:")
    age = baidu_main(baidu_url, ci_list, user, passwd)
    print(age)

 

posted @ 2020-04-23 10:59  楸枰  阅读(26)  评论(0)    收藏  举报