GPU CUDA 设备图启动(Device Graph Launch)

 

上图所示的即发即弃(fire-and-forget)启动流程可通过以下示例代码生成:

// Kernel that launches an executable graph from device code using the
// cudaStreamGraphFireAndForget launch stream.
//
// There is no thread guard: every thread that executes this kernel issues
// its own cudaGraphLaunch call, so the launch configuration determines how
// many launches are attempted.
//
// graph: executable graph handle forwarded unchanged to cudaGraphLaunch.
__global__ void launchFireAndForgetGraph(cudaGraphExec_t graph)
{
    // The status returned by cudaGraphLaunch is not inspected.
    (void)cudaGraphLaunch(graph, cudaStreamGraphFireAndForget);
}
// Host-side setup for the fire-and-forget example: prepares a graph that can
// be launched from the device, then captures and launches a second graph
// whose single kernel performs that device-side launch.
//
// Depends on two names that are not declared in this snippet: `stream`
// (used for upload, capture and launch) and `create_graph` (receives the
// address of the device graph; presumably builds it -- confirm against its
// definition). Return codes of the CUDA calls are not checked.
void graphSetup()
{
    // Device graph: create, instantiate with the device-launch flag, upload.
    cudaGraph_t deviceGraph;
    create_graph(&deviceGraph);

    cudaGraphExec_t deviceGraphExec;
    cudaGraphInstantiate(&deviceGraphExec, deviceGraph,
                         cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphUpload(deviceGraphExec, stream);

    // Launching graph: record the launcher kernel via stream capture.
    cudaGraph_t launcherGraph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    launchFireAndForgetGraph<<<1, 1, 0, stream>>>(deviceGraphExec);
    cudaStreamEndCapture(stream, &launcherGraph);

    cudaGraphExec_t launcherGraphExec;
    cudaGraphInstantiate(&launcherGraphExec, launcherGraph);

    // Run the host-launched graph; its kernel node then launches the
    // device graph.
    cudaGraphLaunch(launcherGraphExec, stream);
}

 

 

 

上述尾部启动(tail launch)的执行流程可通过以下代码生成:

// Kernel that enqueues an executable graph from device code using the
// cudaStreamGraphTailLaunch launch stream.
//
// There is no thread guard: every thread that executes this kernel issues
// its own cudaGraphLaunch call, so the launch configuration determines how
// many launches are attempted.
//
// graph: executable graph handle forwarded unchanged to cudaGraphLaunch.
__global__ void launchTailGraph(cudaGraphExec_t graph)
{
    // The status returned by cudaGraphLaunch is not inspected.
    (void)cudaGraphLaunch(graph, cudaStreamGraphTailLaunch);
}
// Host-side setup for the tail-launch example: prepares a graph that can be
// launched from the device, then captures and launches a second graph whose
// single kernel enqueues it as a tail launch.
//
// Depends on two names that are not declared in this snippet: `stream`
// (used for upload, capture and launch) and `create_graph` (receives the
// address of the device graph; presumably builds it -- confirm against its
// definition). Return codes of the CUDA calls are not checked.
void graphSetup()
{
    // Device graph: create, instantiate with the device-launch flag, upload.
    cudaGraph_t deviceGraph;
    create_graph(&deviceGraph);

    cudaGraphExec_t deviceGraphExec;
    cudaGraphInstantiate(&deviceGraphExec, deviceGraph,
                         cudaGraphInstantiateFlagDeviceLaunch);
    cudaGraphUpload(deviceGraphExec, stream);

    // Launching graph: record the launcher kernel via stream capture.
    cudaGraph_t launcherGraph;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    launchTailGraph<<<1, 1, 0, stream>>>(deviceGraphExec);
    cudaStreamEndCapture(stream, &launcherGraph);

    cudaGraphExec_t launcherGraphExec;
    cudaGraphInstantiate(&launcherGraphExec, launcherGraph);

    // Run the host-launched graph; its kernel node then launches the
    // device graph.
    cudaGraphLaunch(launcherGraphExec, stream);
}

由尾部图排入队列的尾部启动任务,将先于尾部启动列表中此前各图排入队列的尾部启动任务执行。这些新增的尾部启动任务按其入队顺序依次执行。

 

单个图最多可保留 255 个待执行的尾部启动任务。
3.2.8.6.6.2.1.3.1 尾部自启动
设备图可将自身加入尾部启动队列,但同一时刻每个图仅能注册一个自启动任务。为查询当前运行的设备图句柄以实现重启功能,新增设备端函数:

函数原型(C/C++):
// Device-side query: returns the handle of the currently running device
// graph, or NULL when the calling kernel is not a node of a device graph.
cudaGraphExec_t cudaGetCurrentGraphExec();

该函数返回当前运行的设备图句柄(若内核执行环境位于设备图内)。若当前执行的内核不隶属于任何设备图节点,则返回 NULL。以下示例代码演示了用于重启循环的典型用法:

(译注:以下为术语处理说明,示例代码见本注释之后。

  1. "pending tail launches" 译为"待执行的尾部启动任务",通过"待执行"明确任务状态

  2. "self-launch"译为"自启动",保持与计算机体系结构术语的一致性

  3. 保留"device graph"标准译法"设备图",与CUDA文档体系一致

  4. 函数原型维持代码原文格式,通过注释说明增强技术文档的可移植性)

// Counts how many times relaunchSelf has executed its counting path. It is
// read and written without atomics, so only one thread may touch it per run.
__device__ int relaunchCount = 0;

// Kernel node that re-enqueues the device graph it belongs to as a tail
// launch, for as long as relaunchCount is below relaunchMax.
//
// Exactly one thread of the whole grid does the work. A graph may have only
// one self-launch pending at a time, and relaunchCount is updated with a
// plain increment, so letting thread 0 of every block (or every thread with
// threadIdx.x == 0 in a multi-dimensional block) proceed would produce
// redundant failing launches and a racy counter. With a <<<1, 1>>> launch
// the behavior is the same as guarding on threadIdx.x alone.
__global__ void relaunchSelf()
{
    const int relaunchMax = 100;

    const bool firstThreadInBlock =
        threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
    const bool firstBlockInGrid =
        blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
    if (!(firstThreadInBlock && firstBlockInGrid)) {
        return;
    }

    if (relaunchCount < relaunchMax) {
        // cudaGetCurrentGraphExec() yields NULL when this kernel is not
        // running as a node of a device graph; there is nothing to relaunch
        // in that case, so skip the call instead of passing a null handle.
        const cudaGraphExec_t self = cudaGetCurrentGraphExec();
        if (self != nullptr) {
            // The status returned by cudaGraphLaunch is not inspected.
            (void)cudaGraphLaunch(self, cudaStreamGraphTailLaunch);
        }
    }

    // Incremented on every run, including the final run that does not
    // relaunch.
    relaunchCount++;
}

 

posted @ 2025-04-13 04:42  吴建明wujianming  阅读(49)  评论(0)    收藏  举报