Using Celery
1. Task scheduling with Celery
The example below is a toy distributed crawler: schedule() enqueues URLs as Celery tasks, workers fetch each page with Tornado's HTTPClient, and newly discovered links are scheduled recursively. Redis serves as both the broker and the result backend.
# -*- coding: utf-8 -*-
import threading

from bs4 import BeautifulSoup
from celery import Celery
from tornado import httpclient
from tornado.httpclient import HTTPClient

broker = 'redis://localhost:6379'
backend = 'redis://localhost:6379'
app = Celery('tasks', broker=broker, backend=backend)

# Note: every worker process gets its own copy of this dict, so
# deduplication is only per-process; a shared store (e.g. Redis)
# would be needed for global deduplication.
visited = {}

@app.task
def get_html(url):
    """Fetch a page synchronously; return its body, or None on error."""
    http_client = HTTPClient()
    try:
        response = http_client.fetch(url, follow_redirects=True)
        # Decode so the return value works with Celery's default
        # JSON serializer (bytes are not JSON-serializable).
        return response.body.decode('utf-8', errors='ignore')
    except httpclient.HTTPError:
        return None
    finally:
        http_client.close()

def start(url):
    # Spawn 20 scheduler threads that all enqueue the seed URL.
    threads = []
    for i in range(20):
        t = threading.Thread(target=schedule, args=(url,))
        t.daemon = True
        t.start()
        threads.append(t)
    for thread in threads:
        thread.join()

def process_html(url, html):
    print(url + ": " + html)
    _add_links_to_queue(url, html)

def schedule(url):
    print("before call _worker " + url)
    _worker.delay(url)
    print("after call _worker " + url)

def _add_links_to_queue(url, html):
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        try:
            _url = link['href']
        except KeyError:
            continue  # skip anchors without an href attribute
        if not _url.startswith('http'):
            _url = 'http://' + _url
        print(url + "==>" + _url)
        schedule(_url)

@app.task
def _worker(url):
    print(str(threading.current_thread()) + " running " + url)
    if url in visited:
        return
    # Waiting on a subtask inside a task can deadlock and is
    # discouraged; disable_sync_subtasks=False keeps the demo's
    # original synchronous wait.
    result = get_html.delay(url)
    html = None
    try:
        html = result.get(timeout=5, disable_sync_subtasks=False)
    except Exception as e:
        print(url)
        print(e)
    visited[url] = True
    if html:
        process_html(url, html)

if __name__ == '__main__':
    start("http://www.hao123.com/")
2. How to design load balancing with Celery
Celery provides send_task for dispatching tasks by name, so for load balancing you can apply your own algorithm to distribute tasks across queues; see also: http://blog.csdn.net/vintage_1/article/details/47664187
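A minimal sketch of this idea, assuming two workers that each consume a dedicated queue; the queue names (worker1, worker2), the hash-on-URL policy, and the dispatch() helper are illustrative assumptions, not part of the original post:

# -*- coding: utf-8 -*-
# Sketch: route tasks to per-worker queues with a custom algorithm.
from celery import Celery

app = Celery('dispatcher', broker='redis://localhost:6379')

QUEUES = ['worker1', 'worker2']  # each worker consumes one queue,
                                 # e.g. celery -A tasks worker -Q worker1

def dispatch(url):
    # Simple deterministic balancing: hash the URL onto a queue.
    queue = QUEUES[hash(url) % len(QUEUES)]
    # send_task dispatches by task name, so the dispatcher does not
    # need to import the task function itself.
    app.send_task('tasks.get_html', args=[url], queue=queue)

Because send_task only needs the task's registered name, the dispatcher can live in a separate process from the workers, and the balancing policy (hashing, round-robin, load-aware, etc.) is entirely up to you.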
