ray分布式

import os
os.environ["RAY_DEDUP_LOGS"] = "0"
import time
import ray

# Toy "database": a plain list that the tasks index by position.
database = [
    "Learning",
    "ray",
    "a",
    "b",
    "c",
]
# Put the list into Ray once; tasks are handed the returned object reference.
db_obeject_ref = ray.put(database)


@ray.remote
def retrieve_task(item, db):
    """Look up one entry of ``db`` by index as a Ray remote task.

    Prints the process ID of the executing process, sleeps ``item / 10``
    seconds, and then performs the lookup.

    Args:
        item: Index to look up; also determines the sleep duration.
        db: Indexable collection to read from.

    Returns:
        The tuple ``(item, db[item])``.
    """
    print(f"Task {item} 当前进程ID(PID): {os.getpid()}", flush=True)
    pause_seconds = item / 10.
    time.sleep(pause_seconds)
    value = db[item]
    return item, value

def print_runtime(input_data, start_time):
    """Print the seconds elapsed since ``start_time`` and then each data item.

    Args:
        input_data: Iterable of results; each element goes on its own line.
        start_time: Reference timestamp as returned by ``time.time()``.
    """
    print(f'runtime:{time.time() - start_time:.2f} seconds, data:')
    # One element per line; an empty iterable still yields one blank line.
    rendered = "\n".join(str(entry) for entry in input_data)
    print(rendered)

# Start the clock, then submit one remote lookup per index 0..4.
start = time.time()
object_references = [
    retrieve_task.remote(index, db_obeject_ref)
    for index in range(5)
]

# ray.get blocks until every submitted task has produced its result.
data = ray.get(object_references)
print_runtime(data, start)

使用ray.wait获取已经完成的任务结果:

import os
os.environ["RAY_DEDUP_LOGS"] = "0"
import time
import ray

# Toy "database": a plain list that the tasks index by position.
database = [
    "Learning",
    "ray",
    "a",
    "b",
    "c",
]
# Put the list into Ray once; tasks are handed the returned object reference.
db_obeject_ref = ray.put(database)


@ray.remote
def retrieve_task(item, db):
    """Look up one entry of ``db`` by index as a Ray remote task.

    Prints the process ID of the executing process, sleeps ``item / 10``
    seconds, and then performs the lookup.

    Args:
        item: Index to look up; also determines the sleep duration.
        db: Indexable collection to read from.

    Returns:
        The tuple ``(item, db[item])``.
    """
    print(f"Task {item} 当前进程ID(PID): {os.getpid()}", flush=True)
    pause_seconds = item / 10.
    time.sleep(pause_seconds)
    value = db[item]
    return item, value

def print_runtime(input_data, start_time):
    """Print the seconds elapsed since ``start_time`` and then each data item.

    Args:
        input_data: Iterable of results; each element goes on its own line.
        start_time: Reference timestamp as returned by ``time.time()``.
    """
    print(f'runtime:{time.time() - start_time:.2f} seconds, data:')
    # One element per line; an empty iterable still yields one blank line.
    rendered = "\n".join(str(entry) for entry in input_data)
    print(rendered)

# Start the clock, then submit one remote lookup per index 0..4.
start = time.time()
object_references = [
    retrieve_task.remote(index, db_obeject_ref)
    for index in range(5)
]

all_data = []
while object_references:
    # Ask ray.wait for at most two results per round, waiting at most 7 seconds.
    # It hands back the finished refs and the refs that are still pending; the
    # pending ones become the input of the next round.
    finished, object_references = ray.wait(
        object_references,
        num_returns=min(2, len(object_references)),
        timeout=7.0,
    )
    data = ray.get(finished)
    print_runtime(data, start)
    all_data.extend(data)

处理任务的依赖

import os
os.environ["RAY_DEDUP_LOGS"] = "0"
import time
import ray

# Toy "database": a plain list that the tasks index by position.
database = [
    "Learning",
    "ray",
    "a",
    "b",
    "c",
    "d",
]
# Put the list into Ray once; tasks are handed the returned object reference.
db_obeject_ref = ray.put(database)

def retrieve(item, db_obeject_ref):
    """Return ``db_obeject_ref[item]`` after printing the PID and sleeping.

    This is a plain (non-remote) function: it runs in whatever process calls it.

    Args:
        item: Key or index to look up; the function sleeps ``item / 10`` seconds.
        db_obeject_ref: Indexable collection to read from.

    Returns:
        The element stored at ``item``.
    """
    print(f"retrieve {item} 当前进程ID(PID): {os.getpid()}", flush=True)
    pause_seconds = item / 10.
    time.sleep(pause_seconds)
    found = db_obeject_ref[item]
    return found

@ray.remote
def follow_up_task(retrieve_result, db_obeject_ref):
    """Fetch the entry that follows the one an earlier lookup returned.

    Args:
        retrieve_result: Two-element pair whose first element is the index
            that was looked up previously.
        db_obeject_ref: Indexable collection to read from.

    Returns:
        The tuple ``(retrieve_result, value_at_next_index)``.
    """
    previous_index, _ = retrieve_result
    next_index = previous_index + 1
    # The follow-up lookup runs synchronously inside this task's process.
    return retrieve_result, retrieve(next_index, db_obeject_ref)

@ray.remote
def retrieve_task(item, db):
    """Look up one entry of ``db`` by index as a Ray remote task.

    Prints the process ID of the executing process, sleeps ``item / 10``
    seconds, and then performs the lookup.

    Args:
        item: Index to look up; also determines the sleep duration.
        db: Indexable collection to read from.

    Returns:
        The tuple ``(item, db[item])``.
    """
    print(f"Task {item} 当前进程ID(PID): {os.getpid()}", flush=True)
    pause_seconds = item / 10.
    time.sleep(pause_seconds)
    value = db[item]
    return item, value

# Submit three independent lookups.
retrieve_refs = [retrieve_task.remote(item, db_obeject_ref) for item in [0, 2, 4]]
# Passing the futures (object refs) straight into the next task expresses the
# dependency between tasks; follow_up_task receives the resolved result tuple.
follow_up_refs = [follow_up_task.remote(ref, db_obeject_ref) for ref in retrieve_refs]
# Printing is a side effect, so use a plain loop instead of building a
# throwaway list of None; `result` now holds the fetched data itself.
result = ray.get(follow_up_refs)
for data in result:
    print(data)

上面的代码,当脚本启动后,通过以下命令可以查看 Ray 启动的 worker 进程:

ps -ef|grep ray 

xzc      3550114 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550115 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550116 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550117 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550118 3550014  5 23:12 pts/14   00:00:00 ray::follow_up_task
xzc      3550119 3550014  6 23:12 pts/14   00:00:00 ray::follow_up_task
xzc      3550120 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550121 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550122 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550123 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550124 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550125 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550126 3550014  5 23:12 pts/14   00:00:00 ray::follow_up_task
xzc      3550127 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550128 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550129 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550130 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550131 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550132 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550143 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550156 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550162 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550164 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550167 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550168 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550171 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550172 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550173 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550174 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550175 3550014  6 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550176 3550014  5 23:12 pts/14   00:00:00 ray::IDLE
xzc      3550177 3550014  5 23:12 pts/14   00:00:00 ray::IDLE

如果没有显式调用 ray.init,Ray 会在首次使用其 API 时自动初始化,默认初始化大致如下:

# Illustrative call: the initialization the article describes for the case
# where ray.init is not invoked explicitly.
ray.init(
num_cpus=os.cpu_count(), # use all CPU cores by default
ignore_reinit_error=True
)

follow_up_task 的 PID 与对应的 retrieve_task 相同

这是因为 Ray 会尽量将有依赖关系的任务调度到同一个 worker

DataTracker 执行器(Actor)

当看到"执行器"时,可以自动脑补成:

"这是一个分布式服务实例,就像微服务中的一个服务节点,有自己的状态和专属资源"

import os
os.environ["RAY_DEDUP_LOGS"] = "0"
import time
import ray

@ray.remote
class DataTracker:
    """Ray actor that keeps a running count of ``increment`` calls."""

    def __init__(self):
        # Number of increments applied so far.
        self._counts = 0

    def increment(self):
        """Print this process's PID, sleep 10 seconds, then add one to the counter."""
        print(f"increment当前进程ID(PID): {os.getpid()}", flush=True)
        time.sleep(10)
        self._counts = self._counts + 1

    def counts(self):
        """Print this process's PID and return the current counter value."""
        print(f"counts当前进程ID(PID): {os.getpid()}", flush=True)
        current = self._counts
        return current

# Toy "database": a plain list that the tasks index by position.
database = [
    "Learning",
    "ray",
    "a",
    "b",
    "c",
    "d",
]
# Put the list into Ray once; tasks are handed the returned object reference.
db_obeject_ref = ray.put(database)

@ray.remote
def retrieve_tracker_task(item, tracker, db):
    """Look up ``db[item]`` and ask the tracker actor to record the access.

    Args:
        item: Index to look up; the task sleeps ``item / 10`` seconds first.
        tracker: Actor handle exposing a remote ``increment`` method.
        db: Indexable collection to read from.

    Returns:
        The tuple ``(item, db[item])``.
    """
    print(f"Task {item} 当前进程ID(PID): {os.getpid()}", flush=True)
    pause_seconds = item / 10.
    time.sleep(pause_seconds)
    # Fire-and-forget: the returned object ref is discarded, so this task
    # does not wait for the increment to complete.
    tracker.increment.remote()
    value = db[item]
    return item, value

# Create the actor, then pass its handle to six remote lookups.
tracker = DataTracker.remote()
retrieve_refs = [
    retrieve_tracker_task.remote(index, tracker, db_obeject_ref)
    for index in range(6)
]
# Wait for all lookups and show their results.
data = ray.get(retrieve_refs)
print(data)
# Query the actor for the counter value and print it.
print(
    ray.get(tracker.counts.remote())
)

输出结果:

2025-05-22 23:40:01,734 INFO worker.py:1852 -- Started a local Ray instance.
(retrieve_tracker_task pid=3583820) Task 5 当前进程ID(PID): 3583820
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
(retrieve_tracker_task pid=3583823) Task 1 当前进程ID(PID): 3583823
(retrieve_tracker_task pid=3583826) Task 2 当前进程ID(PID): 3583826
(retrieve_tracker_task pid=3583822) Task 4 当前进程ID(PID): 3583822
(retrieve_tracker_task pid=3583833) Task 0 当前进程ID(PID): 3583833
(retrieve_tracker_task pid=3583830) Task 3 当前进程ID(PID): 3583830
[(0, 'Learning'), (1, 'ray'), (2, 'a'), (3, 'b'), (4, 'c'), (5, 'd')]
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
(DataTracker pid=3583819) increment当前进程ID(PID): 3583819
6
(DataTracker pid=3583819) counts当前进程ID(PID): 3583819
posted @ 2025-05-22 22:20  xiezhengcai  阅读(49)  评论(0)    收藏  举报