0_Simple__simpleP2P - 爨爨爨好

公告

使用 P2P 特性在 GPU 之间传输、读写数据。

▶ 源代码。包括 P2P 使用前的各项检查，设备之间的数据互拷，主机和设备之间数据传输和相互访问。

  1 #include <stdlib.h>
  2 #include <stdio.h>
  3 #include <cuda_runtime.h>
  4 #include "device_launch_parameters.h"
  5 #include <helper_cuda.h>
  6 #include <helper_functions.h>
  7 
  8 #define MAX_GPU_COUNT 64
  9 
 10 __global__ void SimpleKernel(float *src, float *dst)
 11 {   
 12     const int idx = blockIdx.x * blockDim.x + threadIdx.x;
 13     dst[idx] = src[idx] * 2.0f;
 14 }
 15 
 16 inline bool IsGPUCapableP2P(cudaDeviceProp *pProp)
 17 {
 18 #ifdef _WIN32
 19     return (bool)(pProp->tccDriver ? true : false);
 20 #else
 21     return (bool)(pProp->major >= 2);
 22 #endif
 23 }
 24 
 25 int main(int argc, char **argv)
 26 {
 27     printf("\n\tStarting\n", argv[0]);
 28 
 29     // 检查是否使用 64 位操作系统环境
 30     if (sizeof(void*) != 8)
 31     {
 32         printf("\n\tError for program only supported with 64-bit OS and 64-bit target\n");
 33         return EXIT_WAIVED;
 34     }
 35     
 36     // 找到头两块计算能力不小于 2.0 的设备
 37     int gpu_n;
 38     cudaGetDeviceCount(&gpu_n);
 39     printf("\n\tDevice count: %d\n", gpu_n);
 40     if (gpu_n < 2)
 41     {
 42         printf("\n\tError for two or more GPUs with SM2.0 required\n");
 43         return EXIT_WAIVED;
 44     }
 45     
 46     cudaDeviceProp prop[MAX_GPU_COUNT];
 47     int gpuid[MAX_GPU_COUNT], gpu_count = 0;
 48     printf("\n\tShow device\n");// 展示所有设备
 49     for (int i=0; i < gpu_n; i++)
 50     {
 51         cudaGetDeviceProperties(&prop[i], i);
 52         if ((prop[i].major >= 2)
 53 #ifdef _WIN32
 54             && prop[i].tccDriver// Windows 系统还要求有 Tesla 计算集群驱动
 55 #endif
 56            )    
 57             gpuid[gpu_count++] = i;
 58         printf("\n\tGPU%d = \"%15s\" ---- %s\n", i, prop[i].name, (IsGPUCapableP2P(&prop[i]) ? "YES" : "NO"));
 59     }
 60     if (gpu_count < 2)
 61     {
 62         printf("\n\tError for two or more GPUs with SM2.0 required\n");
 63 #ifdef _WIN32
 64         printf("\nOr for TCC driver required\n");
 65 #endif
 66         cudaSetDevice(0);
 67         return EXIT_WAIVED;
 68     }
 69 
 70     // 寻找测试设备
 71     int can_access_peer, p2pCapableGPUs[2];
 72     p2pCapableGPUs[0] = p2pCapableGPUs[1] = -1;
 73     printf("\n\tShow combination of devices with P2P\n");// 展示所有能 P2P 的设备组合
 74     for (int i = 0; i < gpu_count - 1; i++)
 75     {
 76         for (int j = i + 1; j < gpu_count; j++)
 77         {
 78             cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]);
 79             if (can_access_peer)
 80             {
 81                 printf("\n\tGPU%d (%s) <--> GPU%d (%s) : %s\n", gpuid[i], prop[gpuid[i]].name, gpuid[j], prop[gpuid[j]].name);
 82                 if (p2pCapableGPUs[0] == -1)
 83                     p2pCapableGPUs[0] = gpuid[i], p2pCapableGPUs[1] = gpuid[j];
 84             }
 85         }
 86     }
 87     if (p2pCapableGPUs[0] == -1 || p2pCapableGPUs[1] == -1)
 88     {
 89         printf("\n\tError for P2P not available among GPUs\n");
 90         for (int i=0; i < gpu_count; i++)
 91             cudaSetDevice(gpuid[i]);
 92         return EXIT_WAIVED;
 93     }
 94 
 95     // 使用找到的设备进行测试
 96     gpuid[0] = p2pCapableGPUs[0];
 97     gpuid[1] = p2pCapableGPUs[1];
 98     printf("\n\tEnabling P2P between GPU%d and GPU%d\n", gpuid[0], gpuid[1]);
 99     
100     // 启用 P2P
101     cudaSetDevice(gpuid[0]);
102     cudaDeviceEnablePeerAccess(gpuid[1], 0);
103     cudaSetDevice(gpuid[1]);
104     cudaDeviceEnablePeerAccess(gpuid[0], 0);
105 
106     // 检查设备是否支持同一可视地址空间 (Unified Virtual Address Space，UVA)
107     if (!(prop[gpuid[0]].unifiedAddressing && prop[gpuid[1]].unifiedAddressing))
108         printf("\n\tError for GPU not support UVA\n");
109         return EXIT_WAIVED;
110 
111     // 申请内存
112     const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
113     printf("\n\tAllocating buffers %iMB\n", int(buf_size / 1024 / 1024));
114     cudaSetDevice(gpuid[0]);
115     float *g0;
116     cudaMalloc(&g0, buf_size);
117     cudaSetDevice(gpuid[1]);
118     float *g1;
119     cudaMalloc(&g1, buf_size);
120     float *h0;
121     cudaMallocHost(&h0, buf_size);
122 
123     cudaEvent_t start_event, stop_event;
124     int eventflags = cudaEventBlockingSync;
125     float time_memcpy;
126     cudaEventCreateWithFlags(&start_event, eventflags);
127     cudaEventCreateWithFlags(&stop_event, eventflags);
128     cudaEventRecord(start_event, 0);
129 
130     for (int i=0; i<100; i++)
131     {
132         // GPU 互拷
133         // UVA 特性下 cudaMemcpyDefault 直接根据指针（属于主机还是设备）来确定拷贝方向
134         if (i % 2 == 0)
135             cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
136         else
137             cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault);
138     }
139     cudaEventRecord(stop_event, 0);
140     cudaEventSynchronize(stop_event);
141     cudaEventElapsedTime(&time_memcpy, start_event, stop_event);
142     printf("\n\tcudaMemcpy: %.2fGB/s\n", (100.0f * buf_size) / (1024.0f * 1024.0f * 1024.0f * (time_memcpy / 1000.0f)));
143 
144     for (int i=0; i<buf_size / sizeof(float); i++)
145         h0[i] = float(i % 4096);
146     cudaSetDevice(gpuid[0]);
147     cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault);
148 
149     const dim3 threads(512, 1);
150     const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);
151 
152     // 使用 GPU1 读取 GPU0 的全局内存数据，计算并写入 GPU1 的全局内存
153     printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[1], gpuid[0], gpuid[1]);
154     cudaSetDevice(gpuid[1]);
155     SimpleKernel<<<blocks, threads>>>(g0, g1);
156     cudaDeviceSynchronize();
157 
158     // 使用 GPU0 读取 GPU1 的全局内存数据，计算并写入 GPU0 的全局内存
159     printf("\n\tRun kernel on GPU%d, reading data from GPU%d and writing to GPU%d\n", gpuid[0], gpuid[1], gpuid[0]);
160     cudaSetDevice(gpuid[0]);
161     SimpleKernel<<<blocks, threads>>>(g1, g0);
162     cudaDeviceSynchronize();
163 
164     // 检查结果
165     cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault);
166     int error_count = 0;
167     for (int i=0; i<buf_size / sizeof(float); i++)
168     {
169         if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
170         {
171             printf("\n\tResult error at %i: gpu[i] = %f, cpu[i] = %f\n", i, h0[i], (float(i%4096)*2.0f*2.0f));
172             if (error_count++ > 10)
173                 break;
174         }
175     }
176 
177     // 关闭 P2P
178     cudaSetDevice(gpuid[0]);
179     cudaDeviceDisablePeerAccess(gpuid[1]);
180     cudaSetDevice(gpuid[1]);
181     cudaDeviceDisablePeerAccess(gpuid[0]);
182 
183     // 回收工作
184     cudaFreeHost(h0);
185     cudaSetDevice(gpuid[0]);
186     cudaFree(g0);
187     cudaSetDevice(gpuid[1]);
188     cudaFree(g1);    
189     cudaEventDestroy(start_event);
190     cudaEventDestroy(stop_event);
191     for (int i=0; i<gpu_n; i++)
192         cudaSetDevice(i);
193     printf("\n\t%s!\n",error_count?"Test failed": "Test passed");
194 
195     getchar();
196     return 0;
197 }

▶ 输出结果

只有一台设备，暂无法进行测试

▶ 涨姿势：

● P2P 要求：至少两台计算能力不低于 2.0 的设备，并支持同一可视内存空间特性；计算环境不低于 CUDA 4.0；Windows 安装 Tesla 计算集群驱动。

● 使用P2P的关键步骤

 1 // 检查两台设备之间是否能使用 P2P
 2 int can_access_peer;
 3 cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
 4 
 5 // 启用 P2P
 6 cudaSetDevice(gpuid[i]);
 7 cudaDeviceEnablePeerAccess(gpuid[j], 0);
 8 cudaSetDevice(gpuid[j];
 9 cudaDeviceEnablePeerAccess(gpuid[i], 0);
10 
11 // 设备间传输数据
12 cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault);
13 
14 // 关闭 P2P
15 cudaSetDevice(gpuid[i]);
16 cudaDeviceDisablePeerAccess(gpuid[i]);
17 cudaSetDevice(gpuid[j]);
18 cudaDeviceDisablePeerAccess(gpuid[j]);
19 
20 // cuda_runtime_api.h
21 extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
22 
23 extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
24 
25 extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);

● 其他代码中的定义

1 // helper_string.h
2 #define EXIT_WAIVED 2

posted on 2017-11-25 12:01 爨爨爨好阅读(818) 评论(0) 收藏举报

刷新页面返回顶部