基于 python-selenium 实现的简易下载器,以及常见错误的解决方法

简易下载器的实现

支持代理、失败重试、确保包含指定ID元素(可根据需求自定义修改)

# coding: utf-8
from Utils import logging
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

class HtmlDownloader:
	"""Minimal page downloader driven by a PhantomJS WebDriver.

	Supports an optional HTTP proxy, retry on failure, and waiting until an
	element with a given ID is present before the page is returned.
	"""

	def __init__(self):
		self.driver = webdriver.PhantomJS()

	def _restartSession(self, proxy):
		"""Start a new driver session whose capabilities carry `proxy`."""
		# add_to_capabilities() writes into the dict it receives, and
		# DesiredCapabilities.PHANTOMJS is a shared class-level dict, so work
		# on a copy to keep proxy settings from leaking into other drivers.
		capabilities = webdriver.DesiredCapabilities.PHANTOMJS.copy()
		proxy.add_to_capabilities(capabilities)
		self.driver.start_session(capabilities)

	def setProxy(self, proxyStr):
		"""Start a new session that routes HTTP traffic through `proxyStr`.

		Args:
			proxyStr: value assigned to the proxy's `http_proxy` setting
				(presumably "host:port" -- confirm against callers).
		"""
		proxy = webdriver.Proxy()
		proxy.proxy_type = ProxyType.MANUAL
		proxy.http_proxy = proxyStr
		self._restartSession(proxy)

	def rmProxy(self):
		"""Start a new session with the proxy type set to DIRECT."""
		proxy = webdriver.Proxy()
		proxy.proxy_type = ProxyType.DIRECT
		# Use self.driver here: the bare name `browser` is not defined
		# anywhere in this class and raised NameError.
		self._restartSession(proxy)

	def download(self, returnType, url, ensureId, proxyStr=None, timeout=30):
		"""Load `url` once and return the page when `ensureId` is present.

		Args:
			returnType: "html" for the raw page source, "bs" for a
				BeautifulSoup object parsed with lxml.
			url: address to load.
			ensureId: ID of an element that must be present in the page.
			proxyStr: optional proxy; when falsy the DIRECT proxy type is used.
			timeout: seconds to wait for the `ensureId` element.

		Returns:
			The page in the requested form, or None when `returnType` is
			unsupported or waiting/reading the page failed (the error is
			logged). Errors raised while starting the session or loading
			`url` propagate to the caller.
		"""
		# Reject an unknown return type before doing any browser work; the
		# result is None either way, but this avoids a wasted page load.
		if returnType not in ("html", "bs"):
			logging("e", "unsupported returnType: %s" % returnType)
			return None
		if proxyStr:
			self.setProxy(proxyStr)
		else:
			self.rmProxy()
		self.driver.get(url)
		# special for xxx.com
		# your code here
		# ensure for some element
		try:
			WebDriverWait(self.driver, timeout).until(
				EC.presence_of_element_located((By.ID, ensureId)))
			# Read the source once; every access asks the driver again.
			pageSource = self.driver.page_source
			if returnType == "bs":
				downloadResult = bs(pageSource, 'lxml')
			else:
				downloadResult = pageSource
			logging("i", "download %s bytes" % len(pageSource))
			return downloadResult
		except Exception as e:
			logging("e", str(e))
			return None
		finally:
			# The next download() starts a fresh session, so the current
			# window is always closed here.
			self.driver.close()

	def safeDownload(self, returnType, url, ensureId, proxyStr=None, maxRetries=5):
		"""Call download() until it yields a result or attempts run out.

		Args:
			returnType, url, ensureId, proxyStr: forwarded to download().
			maxRetries: maximum number of attempts; at least one attempt is
				always made.

		Returns:
			The first truthy download() result, or the last falsy result
			when every attempt failed.
		"""
		downloadResult = None
		attempts = max(1, maxRetries)
		for failTimes in range(1, attempts + 1):
			downloadResult = self.download(returnType, url, ensureId, proxyStr)
			if downloadResult:
				break
			if failTimes == attempts:
				logging("w", "failed %s times, will abort" % failTimes)
			else:
				logging("w", "failed %s times, will retry" % failTimes)
		return downloadResult

元素不可见导致不能操作的错误

# ElementNotVisibleException: Message: {"errorMessage":"Element is not currently visible and may not be manipulated"
# Screenshot: available via screen

首先尝试设定窗口大小

# Give the browser window an explicit size (1024x768) before interacting with elements.
self.driver.set_window_size(1024, 768)

不行的话再尝试滚动页面,如滚动到底部:

# Scroll to the bottom of the page so elements further down come into view.
self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

posted on 2016-12-27 12:50  忧伤的南瓜  阅读(1127)  评论(1)  编辑  收藏  举报

导航