Measuring XGMI cross-GPU ping latency on AMD GPUs
The protocol: GPU 1 writes a signal buffer, then polls its own local buffer, and finally writes GPU 0's buffer.
GPU 0 polls the signal buffer; once it reads the expected value it exits the loop, which means GPU 1 is already sitting in its polling loop. Timing then starts: GPU 0 writes GPU 1's buffer and polls its own local buffer; when the expected value shows up, GPU 1 has received GPU 0's write and has written the completion value back into GPU 0's buffer.
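As a timeline (reconstructed from the code below; arrows are cross-device writes over XGMI):

GPU 1 (d1_loop_load_and_store)        GPU 0 (d0_start_sync, then d0_store_and_loop_load)
d1_addr[0] = 0
signal_addr = 1223   ──────────►      spin until signal_addr == 1223
spin until d1_addr == 1               [timer starts]
                     ◄──────────      d1_addr = 1               (ping)
d0_addr = 2          ──────────►      spin until d0_addr == 2   (pong)
                                      [timer stops]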
/*
 * MPI-based cross-device ping-pong latency test
 * Two processes control device 0 and device 1 respectively
 * Use IPC handles for cross-device memory access
 */
#include <hip/hip_runtime.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

#define HIP_CALL(cmd)                                                     \
    do {                                                                  \
        hipError_t error = (cmd);                                         \
        if (error != hipSuccess) {                                        \
            fprintf(stderr, "HIP error: '%s'(%d) at %s:%d\n",             \
                    hipGetErrorString(error), error, __FILE__, __LINE__); \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

/*
 * Launched first, on GPU 1.
 */
__global__ void d1_loop_load_and_store(int* d1_addr, int* signal_addr, int* d0_addr) {
    d1_addr[0] = 0;
    // Tell GPU 0 we are about to enter the polling loop
    __scoped_atomic_store_n(signal_addr, 1223, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    // Wait for GPU 0's ping ...
    while (__scoped_atomic_load_n(d1_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 1);
    // ... then pong back into GPU 0's buffer
    __scoped_atomic_store_n(d0_addr, 2, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
}

__global__ void d0_start_sync(int* signal_addr) {
    // Wait until GPU 1's kernel is up and polling
    while (__scoped_atomic_load_n(signal_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 1223);
}

__global__ void d0_store_and_loop_load(int* d0_addr, int* d1_addr) {
    d0_addr[0] = 0;
    // Ping GPU 1 ...
    __scoped_atomic_store_n(d1_addr, 1, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    // ... and wait for its pong
    while (__scoped_atomic_load_n(d0_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != 2);
}

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    if (size != 2) {
        if (rank == 0) printf("Error: This test requires exactly 2 MPI processes\n");
        MPI_Finalize();
        return -1;
    }

    // Set device based on rank
    HIP_CALL(hipSetDevice(rank));

    // Enable P2P
    int peer = 1 - rank;
    hipError_t err = hipDeviceEnablePeerAccess(peer, 0);
    if (err != hipSuccess && err != hipErrorPeerAccessAlreadyEnabled) {
        HIP_CALL(err);
    }

    // Allocate local buffers with IPC-compatible memory
    int *d1_addr, *signal_addr, *d0_addr;
    HIP_CALL(hipExtMallocWithFlags((void**)&d1_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipExtMallocWithFlags((void**)&signal_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipExtMallocWithFlags((void**)&d0_addr, sizeof(int), hipDeviceMallocUncached));
    HIP_CALL(hipMemset(d1_addr, 0, sizeof(int)));
    HIP_CALL(hipMemset(signal_addr, 0, sizeof(int)));
    HIP_CALL(hipMemset(d0_addr, 0, sizeof(int)));

    // Get IPC handles for local buffers; handles refer to the base of the
    // allocation, so record each pointer's offset from that base
    hipIpcMemHandle_t my_handles[3];
    void* base_ptrs[3];
    int64_t offsets[3] = {0, 0, 0};
    void* bufs[3] = {d1_addr, signal_addr, d0_addr};
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipPointerGetAttribute(&base_ptrs[i], HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR,
                                        (hipDeviceptr_t)bufs[i]));
        HIP_CALL(hipIpcGetMemHandle(&my_handles[i], base_ptrs[i]));
        offsets[i] = ((char*)bufs[i]) - ((char*)base_ptrs[i]);
    }

    // Allgather IPC handles and offsets
    hipIpcMemHandle_t all_handles[6]; // 2 ranks * 3 buffers
    int64_t all_offsets[6];
    MPI_Allgather(my_handles, 3 * sizeof(hipIpcMemHandle_t), MPI_BYTE,
                  all_handles, 3 * sizeof(hipIpcMemHandle_t), MPI_BYTE, MPI_COMM_WORLD);
    MPI_Allgather(offsets, 3, MPI_INT64_T, all_offsets, 3, MPI_INT64_T, MPI_COMM_WORLD);

    // Open peer's IPC handles
    int *peer_d1_addr, *peer_signal_addr, *peer_d0_addr;
    char* ipc_ptrs[3];
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipIpcOpenMemHandle((void**)&ipc_ptrs[i], all_handles[peer * 3 + i],
                                     hipIpcMemLazyEnablePeerAccess));
        ipc_ptrs[i] += all_offsets[peer * 3 + i];
    }
    peer_d1_addr = (int*)ipc_ptrs[0];
    peer_signal_addr = (int*)ipc_ptrs[1];
    peer_d0_addr = (int*)ipc_ptrs[2];

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 1) {
        // Device 1: launch consumer kernel first
        d1_loop_load_and_store<<<1, 1>>>(d1_addr, peer_signal_addr, peer_d0_addr);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    if (rank == 0) {
        // Device 0: sync then launch producer kernel with timing
        d0_start_sync<<<1, 1>>>(signal_addr);
        HIP_CALL(hipDeviceSynchronize());

        hipEvent_t start, stop;
        HIP_CALL(hipEventCreate(&start));
        HIP_CALL(hipEventCreate(&stop));
        HIP_CALL(hipEventRecord(start, 0));
        d0_store_and_loop_load<<<1, 1>>>(d0_addr, peer_d1_addr);
        HIP_CALL(hipEventRecord(stop, 0));
        HIP_CALL(hipEventSynchronize(stop));

        float ms;
        HIP_CALL(hipEventElapsedTime(&ms, start, stop));
        printf("Ping-pong latency: %.3f us\n", ms * 1000.0f);

        HIP_CALL(hipEventDestroy(start));
        HIP_CALL(hipEventDestroy(stop));
    }

    HIP_CALL(hipDeviceSynchronize());
    MPI_Barrier(MPI_COMM_WORLD);

    // Cleanup: close handles at their base addresses, then free local buffers
    for (int i = 0; i < 3; i++) {
        HIP_CALL(hipIpcCloseMemHandle((void*)(ipc_ptrs[i] - all_offsets[peer * 3 + i])));
    }
    HIP_CALL(hipFree(d1_addr));
    HIP_CALL(hipFree(signal_addr));
    HIP_CALL(hipFree(d0_addr));

    MPI_Finalize();
    return 0;
}
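Before running, it can be worth confirming that the two devices actually have a P2P path between them (rocm-smi --showtopo also shows whether the link type is XGMI). A minimal standalone check, separate from the benchmark above:

// Standalone P2P reachability check (sketch; not part of the test above).
#include <hip/hip_runtime.h>
#include <stdio.h>

int main() {
    int can01 = 0, can10 = 0;
    // Can device 0 map device 1's memory, and vice versa?
    if (hipDeviceCanAccessPeer(&can01, 0, 1) != hipSuccess ||
        hipDeviceCanAccessPeer(&can10, 1, 0) != hipSuccess) {
        fprintf(stderr, "hipDeviceCanAccessPeer failed\n");
        return 1;
    }
    printf("P2P 0->1: %d, 1->0: %d\n", can01, can10);
    return 0;
}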
Compile
hipcc -I/usr/lib/x86_64-linux-gnu/openmpi/include \
      -L/usr/lib/x86_64-linux-gnu/openmpi/lib \
      -lmpi \
      ping_latency.cu -o ping_latency
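If your OpenMPI install lives elsewhere, its compiler wrapper can report the right flags instead of hard-coding paths (--showme is an OpenMPI-specific option):

hipcc $(mpicxx --showme:compile) $(mpicxx --showme:link) ping_latency.cu -o ping_latency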
Run
mpirun -np 2 ./ping_latency
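To measure a different GPU pair (and therefore a different XGMI link), remap the devices with HIP_VISIBLE_DEVICES; the indices 2 and 5 below are just an example:

HIP_VISIBLE_DEVICES=2,5 mpirun -np 2 ./ping_latency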
The latency printed at the end is the round trip between the two GPUs, so divide by 2 for the one-way ping latency. It is only a rough figure: the event window also includes the launch overhead of d0_store_and_loop_load (the start event is recorded before the launch), so it is not very precise.
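One way to get a steadier number is to amortize over many round trips inside the kernels and divide at the end. A minimal sketch of such a variant (d0_pingpong, d1_pingpong, and ITERS are hypothetical names, not part of the code above; the signal-buffer handshake before timing stays the same):

#define ITERS 1000

// Hypothetical GPU 1 side: echo each ping i back as pong i.
__global__ void d1_pingpong(int* d1_addr, int* d0_addr) {
    for (int i = 1; i <= ITERS; i++) {
        while (__scoped_atomic_load_n(d1_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != i);
        __scoped_atomic_store_n(d0_addr, i, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
    }
}

// Hypothetical GPU 0 side: send ping i, wait for the matching pong.
__global__ void d0_pingpong(int* d0_addr, int* d1_addr) {
    for (int i = 1; i <= ITERS; i++) {
        __scoped_atomic_store_n(d1_addr, i, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM);
        while (__scoped_atomic_load_n(d0_addr, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_DEVICE) != i);
    }
}

The one-way estimate then becomes ms * 1000.0f / (2 * ITERS) microseconds, with the single launch overhead spread across ITERS round trips.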