Python模拟浏览器实现用户响应

最近工作中遇到一个问题,在集群上运行的任务有时候无法正常结束,或者无法正常启动。这会造成这批运行的任务无法正常结束运行,处于pending的状态,导致后面的任务无法正常启动。

该问题困扰我们项目已经有半年左右了,一直没有想到很好的解决办法。主要原因就是任务的状态只能在浏览器中看出,无法通过后台的日志或者数据库查询得到。在浏览器中,如果我们看到某个任务长时间没有运行时间和状态的变化,就可以把这个任务当做是“僵尸”任务,从而可以将该任务手动结束掉(kill)。

春节之后在网上看到一些有关爬虫的文章,里面提到过有一种爬虫就是模拟浏览器的行为(包括登录、点击等)去得到网页的数据,进而进行网页抓取,有用信息提取。于是我思考,我们项目的问题和浏览器的交互,只有几种情况,完全可以通过这种方式解决“僵尸”任务。经过一周左右的研究和一周断断续续的coding,终于将这个问题解决了,现在把解决问题的主要思路和关键技术难点写下来,希望一来可以加深自己的印象,二来可以帮助到需要的人。因为实现的任务比较单一,且实现过程比较仓促,code主要就是实现了一些功能,没有进行优化,也没有太参考什么编码规范,设计模式之类的。以后遇到更大的问题,再考虑这些吧。

技术要点:

(1)Python的package:selenium,用这个package,可以和浏览器进行交互,如打开某个浏览器(Chrome,FireFox等),登录需要验证的网站(输入用户名&密码),点击某个特定图标等等,下面是两个有关selenium的链接:

      https://www.baidu.com/link?url=tTeJRPOMKX8noXyTa2YPgpaD6vVlGQ2-RVAfwRg4Yvm&wd=&eqid=acd0879a0043c2e9000000045741cd39

      http://www.cnblogs.com/fnng/archive/2013/05/29/3106515.html

(2)PhantomJS(通过selenium驱动),这是一个无界面的虚拟浏览器,可以把它看成一个在后台运行的浏览器,用户看不到浏览器的页面,但其他的功能和普通浏览器基本一样,比如可以截图,点击某个图标,抓取网页信息等。之所以用它来模拟浏览器,是因为我们的server无法安装普通的浏览器,只能运行终端模式下的程序;

      http://phantomjs.org/

(3)xpath,这个是我编程中耗时最多的模块,主要原因有几个,一是元素定位有问题,网站是一秒钟刷新一次,上一秒获取到的元素下一秒就找不到了;二是相似元素太多,层级关系太复杂,用一般的相对路径去寻找,有可能找到一些不想要的元素,所以就造成了寻找元素过程的费时费力。下面是两个有关xpath的介绍,比较实用,特别是在网页爬虫方面(后面我还要专门介绍爬虫):

     http://www.cnblogs.com/fdszlzl/archive/2009/06/02/1494836.html

     http://www.ruanyifeng.com/blog/2009/07/xpath_path_expressions.html

 

以下是核心code,因为项目隐私的原因,把一些敏感的内容用*******代替。如果有什么问题,可以给我留言。

  1 '''
  2 command: 
  3 
  4 python KillJobs.py -url=172.20.9.42:1100 -screenShotPath=*****
  5 
  6 '''
  7 
  8 from selenium import webdriver
  9 from selenium.common.exceptions import NoSuchElementException
 10 from selenium.webdriver.common.keys import Keys
 11 import re
 12 import time
 13 import argparse
 14 import sys
 15 import os
 16 
 17 
 18 mailReceiver = [   
 19                     "xxxxxxxxxxxxxxxx@xx"
 20                 ]
 21 
 22 ZOMBIE_JOB_LIST = {"list1": [], "list2": [], "list3":[]}
 23 
 24 def get_mail_receiver():
 25     receiver = ' '
 26     for recv in mailReceiver:
 27         receiver = receiver + recv + ' '
 28 
 29     return receiver
 30 
 31 def kill_zombie_jobs(screenShotPath, url):
 32     browser = webdriver.PhantomJS() # Get local session of PhantomJS
 33     # browser = webdriver.Firefox() # Get local session of Firefox
 34     browser.set_window_size(2500, 2000)
 35     
 36     targetUrl = "http://%s/#JOBS" %url
 37     print "url: ", targetUrl
 38     
 39     job_to_be_kill_indicate = 0
 40     
 41     browser.get(targetUrl) # Load page
 42     userName = browser.find_elements_by_class_name("gwt-TextBox")
 43     password = browser.find_elements_by_class_name("gwt-PasswordTextBox")
 44     submitButton = browser.find_elements_by_class_name("gwt-Button")
 45     
 46     if len(userName) == 0 or len(password) == 0 or len(submitButton) == 0:
 47         print "error in open url: %s" %targetUrl
 48         browser.quit()
 49         return
 50         
 51     userName[0].send_keys("root")
 52     password[0].send_keys("changeit")
 53     time.sleep(1)
 54     submitButton[0].click()
 55     
 56     time.sleep(2)
 57     
 58     sceen_shot_name = screenShotPath + "/Before_kill_jobs_screen_shot.png"
 59     browser.save_screenshot(sceen_shot_name)
 60 
 61     jobs_name_pattern_0 = "//body/div[2]/div[2]/div/div[4]/div/div[3]/div/div[4]/div/div[2]/div/div[2]/div/div/div/div[3]/table[2]/tbody/tr[1]/td/fieldset/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[1]"
 62     jobs_name_pattern = "//body/div[2]/div[2]/div/div[4]/div/div[3]/div/div[4]/div/div[2]/div/div[2]/div/div/div/div[3]/table[2]/tbody/tr[1]/td/fieldset/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[Order]/td[1]"
 63     jobs_duration_pattern = "//body/div[2]/div[2]/div/div[4]/div/div[3]/div/div[4]/div/div[2]/div/div[2]/div/div/div/div[3]/table[2]/tbody/tr[1]/td/fieldset/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[Order]/td[5]"
 64     
 65     for i in range(1, 4):
 66         tmp_list = "list"+str(i)
 67         job_name_elements_list = browser.find_elements_by_xpath(jobs_name_pattern_0)
 68         job_length = len(job_name_elements_list) 
 69         
 70         for index in range(1, job_length+1):
 71             job_name_pattern = jobs_name_pattern.replace("Order", str(index))
 72             job_duration_pattern = jobs_duration_pattern.replace("Order", str(index))
 73             job_name = get_element_name(browser, job_name_pattern)
 74             job_duration_time = get_duration_time(get_element_name(browser, job_duration_pattern))
 75             
 76             if len(job_name) > 10 and job_duration_time == 0:
 77                 ZOMBIE_JOB_LIST[tmp_list].append(job_name)
 78         time.sleep(60)
 79         
 80     zombie_job_list = get_zombie_job_list(ZOMBIE_JOB_LIST)
 81     print "\n ---------To be killed job list: ", zombie_job_list
 82     len1 = len(zombie_job_list)
 83     print "\n ---------To be killed job list length: ", len1
 84     kill_jobs_in_list(browser, zombie_job_list)
 85     print "\n ---------After killed job list: ", zombie_job_list
 86     len2 = len(zombie_job_list)
 87     print "\n ---------After killed job list length: ", len2
 88     
 89     time.sleep(2)
 90     sceen_shot_name = screenShotPath + "/After_kill_jobs_screen_shot.png"
 91     browser.save_screenshot(sceen_shot_name)
 92     browser.quit()
 93     
 94     if len2 < len1:
 95         job_to_be_kill_indicate = 1
 96     return job_to_be_kill_indicate
 97     
 98 
 99 def get_element_name(browser, element_pattern):
100     element_name = ""
101     try:
102         element = browser.find_element_by_xpath(element_pattern)
103         element_name = element.text
104     except Exception, e:
105         print "element not exist any more!!!!!"
106         element_name = ""
107 
108     return element_name
109 
def get_duration_time(timeStr):
    """Convert a duration string of the form ``H..H:MM:SS`` into seconds.

    The hours field may have any number of digits, so a duration of 100 hours
    or more is still parsed instead of being reported as 0.  Only the start
    of the string has to match; trailing text is ignored.

    Args:
        timeStr: Duration text, e.g. ``"01:02:03"``; may be ``None`` or empty.

    Returns:
        The duration in seconds, or 0 when ``timeStr`` is empty, ``None`` or
        does not start with a duration in the expected format.
    """
    if not timeStr:
        return 0

    matched = re.match(r"(\d+):(\d{2}):(\d{2})", timeStr)
    if matched is None:
        return 0

    hours, minutes, seconds = (int(part) for part in matched.groups())
    return hours * 3600 + minutes * 60 + seconds
119 
def send_kill_jobs_mail(mailer, screenShotPath, url, indicator):
    """Mail the before/after screenshots with the hang log as the body.

    Runs the ``mail`` program with both screenshots attached (``-a``), the
    subject ``Jobs_on_<url>_Hanging`` and ``nodes_hanging.log`` from
    ``screenShotPath`` fed to its standard input.  The command is executed
    from an argument list, without a shell, so spaces or shell
    metacharacters in the path or URL cannot alter it.

    Args:
        mailer: Whitespace-separated recipient addresses.
        screenShotPath: Directory holding the screenshots and the log file.
        url: Address of the monitored server; used in the mail subject.
        indicator: Unused; kept for interface compatibility.

    Returns:
        Always 0.  A missing log file or a failure to start ``mail`` is
        reported on stdout and does not raise.
    """
    import subprocess  # local import keeps this block self-contained

    # jobs screen before and after kill
    mailTitle = "Jobs_on_%s_Hanging" % url
    screenShotFile1 = screenShotPath + "/Before_kill_jobs_screen_shot.png"
    screenShotFile2 = screenShotPath + "/After_kill_jobs_screen_shot.png"
    logFile = screenShotPath + "/nodes_hanging.log"

    command = ["mail", "-a", screenShotFile1, "-a", screenShotFile2,
               "-s", mailTitle] + mailer.split()
    print("command:  %s < %s" % (" ".join(command), logFile))

    try:
        with open(logFile, "rb") as log_stream:
            subprocess.call(command, stdin=log_stream)
    except (IOError, OSError) as error:
        # Best effort, as before: a failed notification must not abort the
        # monitor run.
        print("failed to send mail: %s" % error)

    return 0
131 
def get_zombie_job_list(job_name_list):
    """Return the jobs that were recorded in all three sampling rounds.

    Args:
        job_name_list: Mapping with the keys ``'list1'``, ``'list2'`` and
            ``'list3'``, each holding a list of job names.  May be empty or
            ``None``.

    Returns:
        A new list with the entries of ``'list1'`` (original order, duplicates
        kept) that also occur in ``'list2'`` and ``'list3'``.  The list is
        empty when the mapping is empty/``None`` or any of the three lists is
        missing or empty.
    """
    # Check the argument before indexing it, so an empty mapping or None
    # yields [] instead of raising.
    if not job_name_list:
        return []

    first = job_name_list.get('list1') or []
    second = job_name_list.get('list2') or []
    third = job_name_list.get('list3') or []
    print("list1:  %s" % (first,))
    print("list2:  %s" % (second,))
    print("list3:  %s" % (third,))

    if not (first and second and third):
        return []

    # Sets make each membership test O(1).
    in_second = set(second)
    in_third = set(third)
    return [job for job in first if job in in_second and job in in_third]
145 
def kill_jobs_in_list(browser, zombie_job_list):
    """Click the kill button of every listed job that still shows duration 0.

    The job table is scanned up to five times.  For each row whose name is in
    ``zombie_job_list`` and whose duration parses to 0, the row's kill button
    is clicked and, if the confirmation button reads ``"Yes"``, that button
    is clicked too and the name is removed from ``zombie_job_list``.

    Args:
        browser: Object providing ``find_elements_by_xpath`` and
            ``find_element_by_xpath`` (a selenium WebDriver in this script).
        zombie_job_list: Names of the jobs to kill.  Modified in place:
            callers compare its length before and after the call.

    Returns:
        The number of jobs that were confirmed killed; 0 when ``browser`` or
        ``zombie_job_list`` is empty/``None``.
    """
    if (not browser) or (not zombie_job_list):
        return 0

    # XPath matching every row of the job table; a row index and a column
    # suffix are appended to address one cell.
    rows_xpath = (
        "//body/div[2]/div[2]/div/div[4]/div/div[3]/div/div[4]/div/div[2]"
        "/div/div[2]/div/div/div/div[3]/table[2]/tbody/tr[1]/td/fieldset"
        "/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr"
    )
    first_column_xpath = rows_xpath + "/td[1]"
    confirm_kill_button_pattern = "//table/tbody/tr/td/table/tbody/tr/td[1]/button"
    killed = 0

    for _ in range(5):
        if not zombie_job_list:
            # Nothing left to kill; further scans could not match any row.
            break

        row_count = len(browser.find_elements_by_xpath(first_column_xpath))

        for row in range(1, row_count + 1):
            row_xpath = "%s[%d]" % (rows_xpath, row)
            job_name = get_element_name(browser, row_xpath + "/td[1]")
            job_duration_time = get_duration_time(
                get_element_name(browser, row_xpath + "/td[5]"))

            if job_name not in zombie_job_list or job_duration_time != 0:
                continue

            print("This job should be killed:  %s" % job_name)
            try:
                browser.find_element_by_xpath(
                    row_xpath + "/td[3]/div/button").click()
                confirm_button = browser.find_element_by_xpath(
                    confirm_kill_button_pattern)
                if confirm_button.text == "Yes":
                    print("press button:  %s" % confirm_button.text)
                    confirm_button.click()
                    time.sleep(1)
                    zombie_job_list.remove(job_name)
                    killed += 1
            except Exception:
                # Best effort: a failed click is skipped here and may be
                # retried by a later scan.
                print("Confirm Yes Button does not exist any more!!!!!")
            time.sleep(0.5)

    return killed
183     
184 
def monitor():
    """Command-line entry point: kill zombie jobs and mail a report.

    Parses ``-url`` and ``-screenShotPath`` (both required), kills any
    running ``phantomjs`` processes, runs ``kill_zombie_jobs`` and, when it
    reports that jobs were killed, sends the notification mail through
    ``send_kill_jobs_mail``.

    The arguments are parsed first, so a usage error exits before any
    process is killed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-url', action='store', dest='url',
                        help='data url', required=True)
    parser.add_argument('-screenShotPath', action='store',
                        dest='screenShotPath', help='the screen shot path',
                        required=True)
    results = parser.parse_args()

    # kill exist PhantomJS
    command = "killall phantomjs"
    print("kill all existing phantomjs:  %s" % command)
    os.system(command)

    url = results.url
    print("DataRush URL =  %s" % url)
    screenShotPath = results.screenShotPath
    print("Screen Shot Path  =  %s" % screenShotPath)

    mailer = get_mail_receiver()

    print("START: Monitor DataRush Starting..............................")

    job_killed_indicator = kill_zombie_jobs(screenShotPath, url)

    if job_killed_indicator == 1:
        print("zombie jobs has been killed!!!!!!!")
        send_kill_jobs_mail(mailer, screenShotPath, url, 1)

    print("End: Monitor Finished.....................................")
214 
# Run the monitor only when this file is executed as a script.
if __name__ == "__main__":
    monitor()
217     
218    
219    

 

posted @ 2016-05-22 23:40  zhangchao3322218  阅读(5143)  评论(0)  编辑  收藏