Python Web Crawler (version 1)

#!/usr/bin/python

# spider version 1
# goal: accept -u url, -d depth and -thread thread-count on the command line

import urllib
import argparse
import threading
import Queue
import bs4
import sys
import time
import re
#import this

# #-----------------------------------------------
# #					bs4 test pass
# #-----------------------------------------------
# soup = bs4.BeautifulSoup("<p>some<b>bad<i>HTML")
# print soup.prettify()

# #-----------------------------------------------
# # 				argparse test pass
# #-----------------------------------------------
parser = argparse.ArgumentParser(description = "spider command line test!!!")
parser.add_argument("filename", help="spider filename")
parser.add_argument("-u", "--url", help="input a org rul", default="http://www.sina.com")
parser.add_argument("-d", "--deep", help="search deep, only one parameter", type=int, default="2")
parser.add_argument("-thread", "--thread_number", help="thread number in threadpool", type=int, default="300")

class CMD_ARG():
	pass

args = CMD_ARG()

# skip argv[0] (the script name), otherwise it is parsed as the positional filename
parser.parse_args(sys.argv[1:], namespace=args)

#print args.filename, args.url, args.deep, args.thread_number
#time.sleep(10)
# #-----------------------------------------------
# # 				threading test pass
# #-----------------------------------------------
# class x(threading.Thread):
# 	def __init__(self):
# 		threading.Thread.__init__(self)
# 	def run(self):
# 		pass

# th = x()
# th.start()
# th.join()
# note: acquire/release/wait/notify belong to Lock/Condition objects,
# not to Thread, and Thread has no stop() method



# #-----------------------------------------------
# # 				Queue test pass
# #-----------------------------------------------
# queue = Queue.Queue()
# queue.put(x)
# queue.get()




# #-----------------------------------------------
# # 				main code
# #-----------------------------------------------


# input args.url, args.deep

#url queue
queue = Queue.Queue()

#visited queue (only its size is reported at the end; items are never re-read)
visited = Queue.Queue()

#threading pool
thpool = []

#threading num
thnum = args.thread_number


#element in queue is a dict
# {	
# 	"url": "http://www.xxx.com",
# 	"deep": x
# }

#first element for queue
queue.put({'url':args.url, 'deep':0})
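
# depth bookkeeping (sketch): the seed enters at deep=0; every link found on
# a deep=d page is enqueued at deep=d+1, so the first dequeued item with
# deep == args.deep ends the crawl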

#re pattern
#note: the capture group keeps just the url, without the surrounding double quotes
pat = re.compile(r'"(http://.+?)"')
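
# quick sanity check (the input markup below is made up):
#   pat.findall('<a href="http://example.com/a">x</a>')
#   -> ['http://example.com/a']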


#create an RLock; unused here, because Queue.Queue is already thread-safe
mylock = threading.RLock()
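
# if shared state beyond the Queues were ever added, the lock would be used
# roughly like this (sketch only):
#   with mylock:
#       pass  # touch the shared state here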

flag_done = False



# class Spider
class Spider(threading.Thread):

	def __init__(self, url_dict):
		try:
			threading.Thread.__init__(self)
			self.url = url_dict['url']
			self.deep = url_dict['deep']
		except Exception:
			print "init error\n"

	def run(self):
		print "%s\n" %(self.getName())

		try:
			self.search()
		except Exception:
			print "search error\n"

		

	# search for urls in the page content
	def search(self):
		global pat
		global queue

		#get url content; bail out if the page cannot be opened,
		#otherwise `content` would be undefined below
		try:
			content = urllib.urlopen(self.url).read()
		except Exception:
			print "open content error\n"
			return

		#match all urls in content
		try:
			matched = pat.findall(content)
		except Exception:
			print "match error\n"
			return

		#put all matched urls into the queue, one level deeper
		for link in matched:
			queue.put({'url':link, 'deep':self.deep+1})
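
# minimal manual check of the Spider class (url below is made up):
#   s = Spider({'url': 'http://example.com', 'deep': 0})
#   s.start()
#   s.join()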
		


def work():
	#global thnum		
	global flag_done
	global queue
	global visited
	global thpool
	global thnum

	#flag_done becomes True once a dequeued item reaches the required depth
	while not flag_done:

		# keep the number of live threads below the requested limit;
		# sleep briefly instead of spinning at full speed
		if threading.active_count() >= thnum:
			time.sleep(0.1)
			continue
		
		#queue not empty
		if not queue.empty():
		
			#get a value from queue
			val = queue.get()
		
			#put this url to visited queue
			visited.put(val)
		
			#reached the required depth
			if val['deep'] == args.deep:
				flag_done = True

				#wait for all spider threads to finish (20s timeout each)
				for t in thpool:
					print "waiting for %s to stop\n" %(t.getName())
					t.join(20)
				print "Spider done at depth: %d, qsize: %d, visited: %d\n" %(args.deep, queue.qsize(), visited.qsize())
				break

			else:
				#gen thread
				th = Spider(val)
		
				#add thread to threadPool
				thpool.append(th)
				th.start()
				
		else:
			#queue momentarily empty; yield briefly before polling again
			time.sleep(0.1)
			continue
	return 

#main
work()
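
# example invocation (file name below is made up; flags as defined above):
#   python spider.py out.log -u http://www.sina.com -d 2 -thread 10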
 	

  

posted @ 2012-08-19 10:19  semiok