Measuring cross-GPU ping latency over XGMI on AMD GPUs

GPU 1 writes the signal buffer, then polls its own local buffer, and finally writes into GPU 0's buffer.

GPU 0 polls the signal buffer; once it reads the expected value the poll exits, which means GPU 1 is already sitting in its own polling loop. Timing then starts: GPU 0 writes into GPU 1's buffer and polls its own local buffer; when that read returns the expected value, GPU 1 has received GPU 0's write and has written the completion value back into GPU 0's buffer.

/*
 * MPI-based cross-device ping-pong latency test
 * Two processes control device 0 and device 1 respectively
 * Use IPC handles for cross-device memory access
 */
#include <hip/hip_runtime.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define HIP_CALL(cmd)                                                          \
  do {                                                                         \
    hipError_t error = (cmd);                                                  \
    if (error != hipSuccess) {                                                 \
      fprintf(stderr, "HIP error: '%s'(%d) at %s:%d\n",                        \
              hipGetErrorString(error), error, __FILE__, __LINE__);            \
      exit(EXIT_FAILURE);                                                      \
    }                                                                          \
  } while (0)

/*
 * GPU 1's kernel, launched first: clear the local mailbox, signal GPU 0 that we
 * are about to poll, wait for GPU 0's ping, then reply into GPU 0's mailbox.
 */
__global__ void d1_loop_load_and_store(int* d1_addr, int* signal_addr, int* d0_addr)
{
    d1_addr[0] = 0;
    __scoped_atomic_store_n(signal_addr, 1223, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    while (__scoped_atomic_load_n(d1_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 1);
    __scoped_atomic_store_n(d0_addr, 2, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
}

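/* GPU 0, step 1: spin until GPU 1 signals (1223) that it has entered its polling loop. */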
__global__ void d0_start_sync(int* signal_addr)
{
    while (__scoped_atomic_load_n(signal_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 1223);
}

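/* GPU 0, step 2 (timed): clear the local mailbox, ping GPU 1, then spin until GPU 1's
 * reply (2) lands, i.e. one full round trip over the link. */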
__global__ void d0_store_and_loop_load(int* d0_addr, int* d1_addr)
{
    d0_addr[0] = 0;
    __scoped_atomic_store_n(d1_addr, 1, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    while (__scoped_atomic_load_n(d0_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 2);
}

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    
    if (size != 2) {
        if (rank == 0) printf("Error: This test requires exactly 2 MPI processes\n");
        MPI_Finalize();
        return -1;
    }
    
    // Set device based on rank
    HIP_CALL(hipSetDevice(rank));
    
    // Enable P2P
    int peer = 1 - rank;
    hipError_t err = hipDeviceEnablePeerAccess(peer, 0);
    if (err != hipSuccess && err != hipErrorPeerAccessAlreadyEnabled) {
        HIP_CALL(err);
    }
    
    // Allocate local buffers with IPC-compatible memory
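    // (hipDeviceMallocUncached requests uncached, fine-grained device memory, so the
    //  polling loops in the kernels can observe writes arriving from the peer GPU.)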
    int *d1_addr, *signal_addr, *d0_addr;
    HIP_CALL(hipExtMallocWithFlags((void**)&d1_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipExtMallocWithFlags((void**)&signal_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipExtMallocWithFlags((void**)&d0_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipMemset(d1_addr, 0, sizeof(int)));
    HIP_CALL(hipMemset(signal_addr, 0, sizeof(int)));
    HIP_CALL(hipMemset(d0_addr, 0, sizeof(int)));
    
    // Get IPC handles for local buffers
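    // (hipIpcGetMemHandle refers to the base of an allocation, so record each pointer's
    //  offset from its range start; the offsets are zero here because every buffer is its
    //  own allocation, but the code stays correct for sub-allocated pointers.)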
    hipIpcMemHandle_t my_handles[3];
    void* base_ptrs[3];
    int64_t offsets[3] = {0, 0, 0};
    
    void* bufs[3] = {d1_addr, signal_addr, d0_addr};
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipPointerGetAttribute(&base_ptrs[i], HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
                                        (hipDeviceptr_t)bufs[i]));
        HIP_CALL(hipIpcGetMemHandle(&my_handles[i], base_ptrs[i]));
        offsets[i] = ((char*)bufs[i]) - ((char*)base_ptrs[i]);
    }
    
    // Allgather IPC handles and offsets
    hipIpcMemHandle_t all_handles[6];  // 2 ranks * 3 buffers
    int64_t all_offsets[6];
    
    MPI_Allgather(my_handles, 3 * sizeof(hipIpcMemHandle_t), MPI_BYTE,
                  all_handles, 3 * sizeof(hipIpcMemHandle_t), MPI_BYTE, MPI_COMM_WORLD);
    MPI_Allgather(offsets, 3, MPI_INT64_T, all_offsets, 3, MPI_INT64_T, MPI_COMM_WORLD);
    
    // Open peer's IPC handles
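    // (After opening, this rank's GPU can load/store the peer GPU's buffers directly;
    //  on a multi-GPU AMD node that traffic goes over the XGMI link being measured.)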
    int *peer_d1_addr, *peer_signal_addr, *peer_d0_addr;
    char* ipc_ptrs[3];
    
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipIpcOpenMemHandle((void**)&ipc_ptrs[i], all_handles[peer * 3 + i], 
                                     hipIpcMemLazyEnablePeerAccess));
        ipc_ptrs[i] += all_offsets[peer * 3 + i];
    }
    peer_d1_addr = (int*)ipc_ptrs[0];
    peer_signal_addr = (int*)ipc_ptrs[1];
    peer_d0_addr = (int*)ipc_ptrs[2];
    
    MPI_Barrier(MPI_COMM_WORLD);
    
    if (rank == 1) {
        // Device 1: launch consumer kernel first
        d1_loop_load_and_store<<<1, 1>>>(d1_addr, peer_signal_addr, peer_d0_addr);
    }
    
    MPI_Barrier(MPI_COMM_WORLD);
    
    if (rank == 0) {
        // Device 0: sync then launch producer kernel with timing
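        // d0_start_sync returns only after GPU 1 has signalled, so the timed kernel
        // below measures the ping-pong itself rather than GPU 1's launch latency.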
        d0_start_sync<<<1, 1>>>(signal_addr);
        HIP_CALL(hipDeviceSynchronize());
        
        hipEvent_t start, stop;
        HIP_CALL(hipEventCreate(&start));
        HIP_CALL(hipEventCreate(&stop));
        HIP_CALL(hipEventRecord(start, 0));
        
        d0_store_and_loop_load<<<1, 1>>>(d0_addr, peer_d1_addr);
        
        HIP_CALL(hipEventRecord(stop, 0));
        HIP_CALL(hipEventSynchronize(stop));
        
        float ms;
        HIP_CALL(hipEventElapsedTime(&ms, start, stop));
        printf("Ping-pong latency: %.3f us\n", ms * 1000.0f);
        
        HIP_CALL(hipEventDestroy(start));
        HIP_CALL(hipEventDestroy(stop));
    }
    
    HIP_CALL(hipDeviceSynchronize());
    MPI_Barrier(MPI_COMM_WORLD);
    
    // Cleanup
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipIpcCloseMemHandle((void*)(ipc_ptrs[i] - all_offsets[peer * 3 + i])));
    }
    HIP_CALL(hipFree(d1_addr));
    HIP_CALL(hipFree(signal_addr));
    HIP_CALL(hipFree(d0_addr));
    
    MPI_Finalize();
    return 0;
}

Compile

hipcc -I/usr/lib/x86_64-linux-gnu/openmpi/include \
      -L/usr/lib/x86_64-linux-gnu/openmpi/lib \
      -lmpi \
      ping_latency.cu -o ping_latency

Run

mpirun -np 2 ./ping_latency

The latency printed at the end is the round trip between the two GPUs, so divide by 2 for a one-way ping estimate. Only a single handshake is timed, so treat the number as a rough figure rather than a precise measurement.
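To smooth out that noise, one option (a sketch, not part of the original test) is to bounce a monotonically increasing counter many times inside a single pair of kernels and divide the elapsed time by 2 * NITER. The kernel names d1_pingpong_iters / d0_pingpong_iters and NITER are made up for illustration; the host side (IPC setup, d0_start_sync, event timing) is assumed to stay exactly as above, with the final print changed to ms * 1000.0f / (2.0f * NITER).

#define NITER 1000

// GPU 1: answer each ping with the same iteration number.
__global__ void d1_pingpong_iters(int* d1_addr, int* signal_addr, int* d0_addr)
{
    d1_addr[0] = 0;
    __scoped_atomic_store_n(signal_addr, 1223, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    for (int i = 1; i <= NITER; i++) {
        while (__scoped_atomic_load_n(d1_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != i);
        __scoped_atomic_store_n(d0_addr, i, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    }
}

// GPU 0 (timed): send ping i, wait for reply i; NITER round trips per launch.
__global__ void d0_pingpong_iters(int* d0_addr, int* d1_addr)
{
    d0_addr[0] = 0;
    for (int i = 1; i <= NITER; i++) {
        __scoped_atomic_store_n(d1_addr, i, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
        while (__scoped_atomic_load_n(d0_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != i);
    }
}

Because the counter only increases, no buffer ever needs to be reset between iterations, so no extra synchronization is required inside the loop.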
