Using Selenium with PhantomJS to Batch-Collect URLs from Baidu Keyword Searches
- Python 2.7; to run it under Python 3, adapt the print statements accordingly
- PhantomJS: add phantomjs.exe to the PATH environment variable (PhantomJS download link)
- Install the Selenium package:
pip install selenium
pip show selenium
- If something still fails, reboot, or pass the phantomjs.exe path explicitly:
browser = webdriver.PhantomJS(executable_path="phantomjs.exe")
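Before running the full script, it may help to confirm that Selenium can actually drive PhantomJS. The snippet below is only a minimal smoke test and is not part of the original script; if it prints the Baidu page title, the environment is set up correctly:

# Minimal smoke test: Selenium driving PhantomJS (illustrative only)
from selenium import webdriver

driver = webdriver.PhantomJS()  # or webdriver.PhantomJS(executable_path="phantomjs.exe")
driver.get("https://www.baidu.com/")
print driver.title              # the Baidu page title should appear here
driver.quit()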
Detailed code and comments
"""
@author:随时静听
@file: baiduSpider2.py
@time: 2018/09/21
"""
from selenium import webdriver
import time
pages=10
keyword=u"inurl:?.asp学校"
url="https://www.baidu.com/"
def getUrls(driver):
urls=[]
current_window=driver.current_window_handle
for a in driver.find_elements_by_xpath("//div[@id='content_left']//div/h3/a"):
a.click()
all_windows=driver.window_handles
all_windows.remove(current_window)
for window in all_windows:
driver.switch_to_window(window)
urls.append(driver.current_url)
print "[-] "+driver.current_url
print "[-] The number of pages parsed to URL is :"+str(len(all_windows))
for window in all_windows[:]:
driver.switch_to_window(window)
driver.close()
driver.switch_to_window(current_window)
return urls,driver
def doSearch(url,keyword,driver):
driver.get(url)
driver.find_element_by_id('kw').send_keys(keyword)
driver.find_element_by_id('su').submit()
time.sleep(3)
return driver
def toNextpage(driver):
pages = driver.find_elements_by_xpath("//div[@id='page']//a")
if pages:
pages[-1].click()
return driver
def run():
pages=11
urls=[]
try:
driver = webdriver.PhantomJS()
driver.implicitly_wait(6)
driver=doSearch(url,keyword,driver)
for i in range(pages+1):
url_lst,driver=getUrls(driver)
urls.extend(url_lst)
driver=toNextpage(driver)
time.sleep(2)
except Exception as e:
print "[!] There seems to be a mistake! The error message is as follows: "
print "--"*6+"Error Message Info"+"--"*6
print e
print "--" * 6 + "Error Message Info END" + "--" * 6
finally:
driver.quit()
run()
if __name__ == '__main__':
pass
Basic usage
- Out of laziness there is no argument parsing and the results are not written to a file; add that yourself if you need it. Just change keyword and run the script.
- Change pages to set how many result pages to crawl.
- Fetching is fairly slow; not sure whether that is my machine, but Chrome is much faster (a minimal headless-Chrome sketch follows below).
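Since Chrome is noticeably faster, one option is to swap the PhantomJS driver for headless Chrome. The sketch below is only an assumption-laden example: it presumes chromedriver is installed and on the PATH and a Selenium version that still accepts the chrome_options argument; apart from the driver construction in run(), the rest of the script stays the same.

# Hypothetical alternative to webdriver.PhantomJS(): headless Chrome
# Assumes chromedriver is on the PATH; only the driver construction changes.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')      # no visible browser window
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(chrome_options=options)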
Update: saving results to a file
"""
@author:随时静听
@file: baiduSpider2.py
@time: 2018/09/21
@email:wang_di@topsec.com.cn
"""
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
pages=10
keyword=u"inurl:?.asp学校"
url="https://www.baidu.com/"
def getUrls(driver):
urls=[]
current_window=driver.current_window_handle
for a in driver.find_elements_by_xpath("//div[@id='content_left']//div/h3/a"):
a.click()
all_windows=driver.window_handles
all_windows.remove(current_window)
for window in all_windows:
driver.switch_to_window(window)
urls.append(driver.current_url)
print "\t[-] "+driver.current_url
print "[*] The number of pages parsed to URL is :"+str(len(all_windows))
for window in all_windows[:]:
driver.switch_to_window(window)
driver.close()
driver.switch_to_window(current_window)
return urls,driver
def doSearch(url,keyword,driver):
driver.get(url)
driver.find_element_by_id('kw').send_keys(keyword)
driver.find_element_by_id('su').submit()
time.sleep(3)
return driver
def toNextpage(driver):
pages = driver.find_elements_by_xpath("//div[@id='page']//a")
if pages:
pages[-1].click()
return driver
def getargs():
import sys
usage='''
Usage: BaiduSpider keywords [nums] [filename]
keywords: your Search keywords
nums: default value is 6,seach max pages
filename: result saved file name,default value is result.txt
'''
if len(sys.argv)<2:
print usage
exit()
else:
try:
keywords= sys.argv[1]
if len(sys.argv)==2:
nums= 6
filename="result.txt"
return (keywords,nums,filename)
if len(sys.argv)==3:
nums= int(sys.argv[2])
filename = "result.txt"
return (keywords, nums, filename)
if len(sys.argv)==4:
nums = int(sys.argv[2])
filename=sys.argv[3]
return (keywords, nums, filename)
except:
print "[!] Parameter error !"
print usage
exit()
def saveData(urls,filename):
if urls:
with open(filename,'a+') as f:
for url in urls:
f.write(url+'\n')
def run():
keyword,pages,filename=getargs()
print "[*] Keywords:"+keyword
print "[*] nums:"+str(pages)
print "[*] save file name:"+filename
print "\n"
print "--"*30
urls=[]
try:
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap[
"phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
dcap["phantomjs.page.settings.loadImages"] = False
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.set_page_load_timeout(20)
driver.implicitly_wait(6)
driver=doSearch(url,keyword,driver)
for i in range(pages+1):
print "[*] Page data loading: "+str(i+1)
url_lst,driver=getUrls(driver)
urls.extend(url_lst)
driver=toNextpage(driver)
time.sleep(5)
saveData(url_lst,filename)
print "[*] The urls total num is:" + str(len(urls))
print "[*] The spider urls save as file:" + filename
except Exception as e:
print "[!] There seems to be a mistake! The error message is as follows: "
print "--"*10+"Error Message Info"+"--"*10
print e
print "--" * 10 + "Error Message Info END" + "--" * 10
finally:
driver.quit()
if __name__ == '__main__':
run()
pass
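A typical invocation of the updated script, following the usage string above (the page count and output file name here are just placeholder values):

python baiduSpider2.py "inurl:?.asp学校" 6 result.txt

Note that saveData opens the file in 'a+' (append) mode, so repeated runs accumulate URLs in the same file.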