Python Web Crawler (version 1)
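Version 1 of a small multithreaded spider, written for Python 2 (old urllib, Queue, and print-statement APIs). Per the header comment, the goal is to pass a start URL with -u, a crawl depth with -d, and a thread-pool cap with -thread; each page is fetched with urllib and every double-quoted http:// link it contains is pushed back onto a work queue one level deeper.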
#!/usr/bin/python
# spider version 1
# goal: pass -u url -d deep -thread number
import urllib
import argparse
import threading
import Queue
import bs4
import sys
import time
import re
#import this

#-----------------------------------------------
# bs4 test: pass
#-----------------------------------------------
# soup = bs4.BeautifulSoup("<p>some<b>bad<i>HTML")
# print soup.prettify()

#-----------------------------------------------
# argparse test: pass
#-----------------------------------------------
parser = argparse.ArgumentParser(description="spider command line test!!!")
parser.add_argument("filename", help="spider filename")
parser.add_argument("-u", "--url", help="input an origin url", default="http://www.sina.com")
parser.add_argument("-d", "--deep", help="search deep (a single integer)", type=int, default=2)
parser.add_argument("-thread", "--thread_number", help="thread number in threadpool", type=int, default=300)

class CMD_ARG():
    pass

args = CMD_ARG()
# argparse parses sys.argv[1:] by default; passing sys.argv whole would
# feed the script name into the "filename" positional
parser.parse_args(namespace=args)
#print args.filename, args.url, args.deep, args.thread_number
#time.sleep(10)

#-----------------------------------------------
# threading test: pass
#-----------------------------------------------
# class x(threading.Thread):
#     def __init__(self):
#         threading.Thread.__init__(self)
#     def run(self):
#         pass
# th = x()
# th.acquire()
# th.release()
# th.wait()
# th.notify()
# th.start()
# th.join()
# th.stop()

#-----------------------------------------------
# Queue test: pass
#-----------------------------------------------
# queue = Queue.Queue()
# queue.put(x)
# queue.get()

#-----------------------------------------------
# main code
#-----------------------------------------------
# input: args.url, args.deep

# url queue: work still to be done
queue = Queue.Queue()
# visited queue: every url already handed to a Spider
visited = Queue.Queue()
# thread pool
thpool = []
# thread number cap
thnum = args.thread_number

# each element in queue is a dict:
# {
#     "url": "http://www.xxx.com",
#     "deep": x
# }
# first element for queue
queue.put({'url': args.url, 'deep': 0})

# re pattern
# attention: the non-greedy match stops at the closing quote, so a
# captured url never contains a " character itself
pat = re.compile(r'"(http://.+?)"')
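# A quick sanity check of the pattern (hypothetical sample markup, kept in
# comments so the script stays runnable as-is): only absolute http:// links
# wrapped in double quotes are captured.
#   pat.findall('<a href="http://example.com/a">')  -> ['http://example.com/a']
#   pat.findall("<a href='http://example.com/b'>")  -> []   (single quotes: missed)
#   pat.findall('"https://example.com/c"')          -> []   (https: missed too)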
# get an RLock -- unused, because Queue does its own locking
mylock = threading.RLock()

flag_done = False

# class Spider: one thread per fetched page
class Spider(threading.Thread):
    def __init__(self, url_dict):
        try:
            threading.Thread.__init__(self)
            self.url = url_dict['url']
            self.deep = url_dict['deep']
        except:
            print "init error\n"

    def run(self):
        print "%s\n" % (self.getName())
        try:
            self.search()
        except:
            print "search error\n"

    # search urls in page content
    def search(self):
        global pat
        global queue
        # get url content; bail out so the later steps never see an
        # undefined `content`
        try:
            content = urllib.urlopen(self.url).read()
        except:
            print "open content error\n"
            return
        # match all urls in content
        try:
            matched = pat.findall(content)
        except:
            print "match error\n"
            return
        # put all matched urls back on the queue, one level deeper
        for link in matched:
            queue.put({'url': link, 'deep': self.deep + 1})


def work():
    global flag_done
    global queue
    global visited
    global thpool
    global thnum
    # flag_done turns True once a dequeued url reaches the required deep
    while not flag_done:
        # thread number should not exceed the cap
        if threading.active_count() >= thnum:
            time.sleep(0.1)        # yield instead of busy-spinning
            continue
        if not queue.empty():
            # take the next url off the queue and record it as visited
            val = queue.get()
            visited.put(val)
            if val['deep'] == args.deep:
                # required deep reached: wait for the remaining threads
                flag_done = True
                for t in thpool:
                    print "waiting %s stop\n" % (t.getName())
                    t.join(20)
                print "Spider done in deep: %d, qsize: %d, visited: %d\n" \
                      % (args.deep, queue.qsize(), visited.qsize())
                break
            else:
                # spawn a thread for this url and add it to the pool
                th = Spider(val)
                thpool.append(th)
                th.start()
        else:
            time.sleep(0.1)        # queue momentarily empty
            continue
    return

# main
work()
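A sketch of how the script might be invoked, assuming it is saved as spider.py (the script name, the log filename, and the numbers here are placeholders; the filename positional is required by the parser but never actually used in this version):

$ python spider.py out.log -u http://www.sina.com -d 2 -thread 100

Crawling stops as soon as the first URL at the requested deep is pulled off the queue; work() then gives each live Spider thread up to 20 seconds to finish before printing the final queue size and visited count.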