# coding: utf-8
import sys
import time
import threading
import multiprocessing

from html_downLoader import HtmlDownLoader
from MongoQueue import MongoQueue
import ParseAlexa

# Python 2 workaround so non-ASCII page content can be handled as str
# without UnicodeDecodeError.
if sys.getdefaultencoding() != "utf-8":
    reload(sys)
    sys.setdefaultencoding("utf-8")

SLEEP_TIME = 1
max_threads = 5
result = {}

# Seed the MongoDB-backed crawl queue with the Alexa top-1m URL list.
alexaCallback = ParseAlexa.AlexaCallback()
crawl_queue = alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
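# Assumed interfaces (defined elsewhere in this project; the signatures
# below are inferred from how this file uses them, not guaranteed):
#   crawl_queue.pop()              -> next pending URL; raises when empty
#   crawl_queue.complete(url)      -> mark a popped URL as processed
#   bool(crawl_queue)              -> True while URLs are pending or in flight
#   HtmlDownLoader().downLoad(url) -> page HTML as a string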

def threaded_crawler():
    """Crawl URLs from the shared queue with a pool of worker threads."""
    threads = []
    downloader = HtmlDownLoader()

    def process_queue():
        # Each worker keeps popping URLs until the queue is exhausted.
        while True:
            try:
                url = crawl_queue.pop()
            except Exception as e:
                # pop() raises when no URL is pending; this worker is done.
                print e
                break
            else:
                print "Crawling %s" % url
                html = downloader.downLoad(url)
                result[url] = html
                # Mark the URL as processed only after the download finishes.
                crawl_queue.complete(url)

    while threads or crawl_queue:
        # Top up the pool while there is still work to hand out.
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.daemon = True  # do not block interpreter exit
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
        # Keep only live workers; removing items from the list while
        # iterating over it (as before) skips elements.
        threads = [thread for thread in threads if thread.is_alive()]
    print result

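# For a quick local test without MongoDB, a hypothetical in-memory stand-in
# with the same pop()/complete()/truthiness contract could be swapped in for
# ``crawl_queue`` (a sketch built on the assumed interface above, not part
# of this project's MongoQueue):
class InMemoryQueue(object):
    def __init__(self, urls):
        self._pending = list(urls)  # URLs not yet handed to a worker
        self._processing = set()    # popped but not yet completed

    def pop(self):
        if not self._pending:
            raise KeyError("queue is empty")
        url = self._pending.pop()
        self._processing.add(url)
        return url

    def complete(self, url):
        self._processing.discard(url)

    def __nonzero__(self):  # Python 2 truthiness hook
        return bool(self._pending or self._processing)
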
def process_crawler():
    """Run one threaded_crawler per CPU core in separate processes."""
    num_cpus = multiprocessing.cpu_count()
    print "Starting {} processes".format(num_cpus)
    processes = []
    for i in range(num_cpus):
        p = multiprocessing.Process(target=threaded_crawler)
        p.daemon = True
        p.start()
        processes.append(p)
    # Join only after starting every worker; joining inside the loop above
    # would run the processes one at a time.
    for p in processes:
        p.join()
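# Note: ``result`` is a plain dict, so each worker process fills in its own
# copy and nothing is aggregated back into the parent. If shared results were
# needed, one option (an assumption, not implemented here; threaded_crawler
# would have to accept the dict as a parameter) is a managed dict:
#
#   manager = multiprocessing.Manager()
#   shared_result = manager.dict()
#   p = multiprocessing.Process(target=threaded_crawler, args=(shared_result,))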

if __name__ == '__main__':
    process_crawler()