Crawler ==> synchronous call

import requests

def parse_page(res):
    # Runs only after the download has fully completed
    print('PARSE %s' % len(res))

def get_page(url):
    print('GET %s' % url)
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return ''  # fall back to an empty page so parse_page never sees None

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # Each iteration blocks until the previous download and parse finish
    for url in urls:
        res = get_page(url)
        parse_page(res)
Synchronous call: each request blocks the whole program, so the pages are downloaded strictly one after another.
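
A minimal timing sketch, assuming the same get_page as above, that makes the cost of blocking visible: the total elapsed time is roughly the sum of the three network round trips (the exact figure depends on your connection).

import time
import requests

def get_page(url):
    response = requests.get(url)
    return response.text if response.status_code == 200 else ''

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    start = time.time()
    for url in urls:
        get_page(url)  # blocks; nothing else runs while waiting on the network
    # Total time is roughly the sum of the three round trips
    print('serial fetch took %.2fs' % (time.time() - start))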

 

import requests
from threading import Thread, current_thread

def parse_page(res):
    # Executed in the same worker thread that downloaded the page
    print('%s PARSE %s' % (current_thread().name, len(res)))

def get_page(url, callback=parse_page):
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    threads = []
    # One thread per URL: the network waits now overlap instead of running serially
    for url in urls:
        t = Thread(target=get_page, args=(url,))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()  # wait for all downloads before the main thread exits
Multithreading and multiprocessing: each URL gets its own thread, so the waits overlap; but one thread per URL does not scale to a large crawl. A process-based sketch of the same idea follows.
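
The caption also mentions multiprocessing; here is a minimal sketch of the same one-worker-per-URL pattern using processes instead of threads (same get_page logic, with current_process standing in for current_thread).

import requests
from multiprocessing import Process, current_process

def parse_page(res):
    print('%s PARSE %s' % (current_process().name, len(res)))

def get_page(url, callback=parse_page):
    print('%s GET %s' % (current_process().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # One process per URL; start them all, then wait for them all
    procs = [Process(target=get_page, args=(url,)) for url in urls]
    for p in procs:
        p.start()
    for p in procs:
        p.join()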

 

import requests
from threading import current_thread
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def parse_page(future):
    # add_done_callback passes the Future; .result() retrieves the return value
    res = future.result()
    print('%s PARSE %s' % (current_thread().name, len(res)))

def get_page(url):
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return ''  # keep the callback from crashing on len(None)

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # A pool caps concurrency at 50 threads; ProcessPoolExecutor is a drop-in
    # replacement if you want processes instead
    pool = ThreadPoolExecutor(50)

    for url in urls:
        pool.submit(get_page, url).add_done_callback(parse_page)

    pool.shutdown(wait=True)  # block until every submitted task has finished
Thread pool and process pool: the pool caps the number of workers and reuses them, so thousands of URLs can be submitted without creating thousands of threads; add_done_callback hands each finished result to parse_page.
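
An equivalent sketch using the pool's context-manager form and as_completed instead of add_done_callback, assuming the same get_page; here the results are handled in the main thread, in whatever order the downloads finish.

import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_page(url):
    response = requests.get(url)
    return response.text if response.status_code == 200 else ''

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    # The with-block shuts the pool down automatically on exit
    with ThreadPoolExecutor(50) as pool:
        futures = {pool.submit(get_page, url): url for url in urls}
        for future in as_completed(futures):
            # Results arrive in completion order, handled in the main thread
            print('PARSE %s %s' % (futures[future], len(future.result())))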

 

# monkey.patch_all() must run before requests is imported so that the
# underlying socket calls become cooperative (non-blocking)
from gevent import joinall, spawn, monkey; monkey.patch_all()
import requests
from threading import current_thread

def parse_page(res):
    print('%s PARSE %s' % (current_thread().name, len(res)))

def get_page(url, callback=parse_page):
    print('%s GET %s' % (current_thread().name, url))
    response = requests.get(url)
    if response.status_code == 200:
        callback(response.text)

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]

    # spawn() schedules one greenlet per URL; all three downloads run
    # concurrently inside a single OS thread
    tasks = []
    for url in urls:
        tasks.append(spawn(get_page, url))

    joinall(tasks)  # wait for every greenlet to finish
The gevent module: monkey-patched greenlets yield to each other on network IO, so all three downloads proceed concurrently inside one thread.
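
When the URL list grows large, a gevent pool caps how many greenlets run at once. A minimal sketch, assuming the same download logic and an illustrative limit of 10 concurrent downloads:

from gevent import monkey; monkey.patch_all()
from gevent.pool import Pool
import requests

def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        print('PARSE %s' % len(response.text))

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.taobao.com',
        'https://www.openstack.org',
    ]
    pool = Pool(10)  # at most 10 greenlets in flight at any moment
    for url in urls:
        pool.spawn(get_page, url)
    pool.join()  # wait for every greenlet in the pool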

 

posted @ 2018-01-22 19:07  Aray007