
Compare the efficiency of data copies alone, and of data copies plus kernel launches, using a single stream versus multiple streams (4 here).
▶ Source code

#include <stdio.h>
#include <windows.h>                    // VirtualAlloc / VirtualFree
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <helper_functions.h>
#include <helper_cuda.h>

// Assumes 64-bit Windows and a 64-bit build; support for other platforms has been removed.
#define MEMORY_ALIGNMENT    4096                                // align to 4 KB
#define ALIGN_UP(x,size)    (((size_t)x+(size-1))&(~(size-1)))  // round x up to the next multiple of size (a power of two)

__global__ void init_array(int *g_data, int *factor, int num_iterations)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    for (int i = 0; i < num_iterations; i++)
        g_data[idx] += *factor;
}

bool check(int *a, const int nArray, const int c)
{
    for (int i = 0; i < nArray; i++)
    {
        if (a[i] != c)
        {
            printf("\nArray\tError at i = %d, %d, %d\n", i, a[i], c);
            return false;
        }
    }
    return true;
}

inline void AllocateHostMemory(bool bPinGenericMemory, int **pp_a, int **ppAligned_a, int nByte)
{
    if (bPinGenericMemory)  // allocate generic page-aligned memory, then pin it
    {
        printf("\nVirtualAlloc(), %4.2f MB (generic page-aligned system memory)\n", (float)nByte / 1048576.0f);
        *pp_a = (int *)VirtualAlloc(NULL, (nByte + MEMORY_ALIGNMENT), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
        *ppAligned_a = (int *)ALIGN_UP(*pp_a, MEMORY_ALIGNMENT);
        cudaHostRegister(*ppAligned_a, nByte, cudaHostRegisterMapped);  // page-locked memory, required for async copies
    }
    else
    {
        printf("\ncudaMallocHost(), %4.2f MB\n", (float)nByte / 1048576.0f);
        cudaMallocHost((void **)pp_a, nByte);                           // already page-locked at allocation
        *ppAligned_a = *pp_a;
    }
}

int main()  // uses fixed parameters instead of reading them from the command line
{
    printf("\n\tStart\n");
    int nreps = 100;        // number of timed repetitions per test
    int niterations = 5;    // number of iterations inside the kernel
    int nstreams = 4;       // number of streams
    float elapsed_time;
    bool bPinGenericMemory;

    cudaSetDevice(0);       // device-selection logic removed
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    if (deviceProp.canMapHostMemory) // check whether the GPU can map host memory; otherwise generic memory cannot be used
        bPinGenericMemory = true;
    else
    {
        printf("\nDevice does not support mapping of generic host memory, using cudaMallocHost() instead\n");
        bPinGenericMemory = false;
    }

    // scale the workload down on GPUs with fewer than 32 CUDA cores
    // (the original sample did not shrink nByte along with nArray; improved here)
    float scale_factor = max(32.0f / float(_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount), 1.0f);
    int nArray = (int)rint((float)16 * 1024 * 1024 / scale_factor); // number of array elements
    int nByte = nArray * sizeof(int);                               // array size in bytes
    printf("\nWorkload *= %1.4f, array_size = %d\n", 1.0f / scale_factor, nArray);

    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | (bPinGenericMemory ? cudaDeviceMapHost : 0)); // blocking sync lowers CPU usage while waiting

    int *h_a = 0, *hAligned_a = 0;
    AllocateHostMemory(bPinGenericMemory, &h_a, &hAligned_a, nByte);// allocate host memory using the chosen method
    int c = 5, *d_a = 0, *d_c = 0;
    cudaMalloc((void **)&d_a, nByte);
    cudaMemset(d_a, 0x0, nByte);
    cudaMalloc((void **)&d_c, sizeof(int));
    cudaMemcpy(d_c, &c, sizeof(int), cudaMemcpyHostToDevice);
    cudaEvent_t start_event, stop_event;
    cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
    cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
    cudaStream_t *streams = (cudaStream_t *)malloc(nstreams * sizeof(cudaStream_t));
    for (int i = 0; i < nstreams; i++)
        cudaStreamCreate(&(streams[i]));

    printf("\n\tStart test\n");
    // asynchronous copy test
    cudaEventRecord(start_event, 0);
    cudaMemcpyAsync(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost, streams[0]);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("memcopy:\t%.2f\n", elapsed_time);

    // kernel test
    dim3 threads = dim3(512);
    dim3 blocks = dim3(nArray / threads.x);
    cudaEventRecord(start_event, 0);
    init_array<<<blocks, threads, 0, streams[0]>>>(d_a, d_c, niterations);
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("kernel:\t\t%.2f\n", elapsed_time);

    // serial (non-streamed) test
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++)
    {
        init_array<<<blocks, threads>>>(d_a, d_c, niterations);
        cudaMemcpy(hAligned_a, d_a, nByte, cudaMemcpyDeviceToHost);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("non-streamed:\t%.2f\n", elapsed_time / nreps);

    // multi-stream test
    blocks = dim3(nArray / (nstreams * threads.x), 1);
    memset(hAligned_a, 255, nByte);
    cudaMemset(d_a, 0, nByte);
    cudaEventRecord(start_event, 0);
    for (int k = 0; k < nreps; k++)     // issue kernel work and copy-back breadth-first across the streams
    {
        for (int i = 0; i < nstreams; i++)
            init_array<<<blocks, threads, 0, streams[i]>>>(d_a + i * nArray / nstreams, d_c, niterations);
        for (int i = 0; i < nstreams; i++)
            cudaMemcpyAsync(hAligned_a + i * nArray / nstreams, d_a + i * nArray / nstreams, nByte / nstreams, cudaMemcpyDeviceToHost, streams[i]);
    }
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&elapsed_time, start_event, stop_event);
    printf("%d streams:\t%.2f\n", nstreams, elapsed_time / nreps);

    // verify the result and clean up
    printf("\n\tResult: %s\n", check(hAligned_a, nArray, c * nreps * niterations) ? "Passed" : "Failed");
    cudaFree(d_a);
    cudaFree(d_c);
    if (bPinGenericMemory)
    {
        cudaHostUnregister(hAligned_a);
        VirtualFree(h_a, 0, MEM_RELEASE);
    }
    else
        cudaFreeHost(h_a);
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
    for (int i = 0; i < nstreams; i++)
        cudaStreamDestroy(streams[i]);
    free(streams);

    getchar();
    return 0;
}

▶ Output

    Start

Workload *= 1.0000, array_size = 16777216

VirtualAlloc(), 64.00 MB (generic page-aligned system memory)

    Start test
memcopy:        5.34
kernel:         5.15
non-streamed:   9.95
4 streams:      5.24

    Result: Passed

 

Reading the numbers: each serial repetition costs about one copy plus one kernel (5.34 + 5.15 ≈ 10.49 ms against the measured 9.95 ms), while with 4 streams the copy-back of each chunk overlaps the kernels still running in the other streams, so the per-repetition time drops to roughly the longer of the two stages (5.24 ms).

▶ Things learned

● Macros and function prototypes involved

// driver_types.h
#define cudaStreamPerThread                 ((cudaStream_t)0x2)

#define cudaEventDefault                    0x00  // Default event flag
#define cudaEventBlockingSync               0x01  // Event uses blocking synchronization
#define cudaEventDisableTiming              0x02  // Event will not record timing data
#define cudaEventInterprocess               0x04  // Event is suitable for interprocess use. cudaEventDisableTiming must be set

#define cudaDeviceScheduleAuto              0x00  // Device flag - Automatic scheduling
#define cudaDeviceScheduleSpin              0x01  // Device flag - Spin default scheduling
#define cudaDeviceScheduleYield             0x02  // Device flag - Yield default scheduling
#define cudaDeviceScheduleBlockingSync      0x04  // Device flag - Use blocking synchronization
#define cudaDeviceBlockingSync              0x04  // Device flag - Use blocking synchronization
                                                  // (deprecated as of CUDA 4.0, replaced with cudaDeviceScheduleBlockingSync)
#define cudaDeviceScheduleMask              0x07  // Device schedule flags mask
#define cudaDeviceMapHost                   0x08  // Device flag - Support mapped pinned allocations
#define cudaDeviceLmemResizeToMax           0x10  // Device flag - Keep local memory allocation after launch
#define cudaDeviceMask                      0x1f  // Device flags mask

#define cudaArrayDefault                    0x00  // Default CUDA array allocation flag
#define cudaArrayLayered                    0x01  // Must be set in cudaMalloc3DArray to create a layered CUDA array
#define cudaArraySurfaceLoadStore           0x02  // Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array
#define cudaArrayCubemap                    0x04  // Must be set in cudaMalloc3DArray to create a cubemap CUDA array
#define cudaArrayTextureGather              0x08  // Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array

#define cudaIpcMemLazyEnablePeerAccess      0x01  // Automatically enable peer access between remote devices as needed

#define cudaMemAttachGlobal                 0x01  // Memory can be accessed by any stream on any device
#define cudaMemAttachHost                   0x02  // Memory cannot be accessed by any stream on any device
#define cudaMemAttachSingle                 0x04  // Memory can only be accessed by a single stream on the associated device

#define cudaOccupancyDefault                0x00  // Default behavior
#define cudaOccupancyDisableCachingOverride 0x01  // Assume global caching is enabled and cannot be automatically turned off

#define cudaCpuDeviceId                     ((int)-1) // Device id that represents the CPU
#define cudaInvalidDeviceId                 ((int)-2) // Device id that represents an invalid device

// cuda_runtime_api.h
extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags);

extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);

extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);

// memoryapi.h
WINBASEAPI _Ret_maybenull_ _Post_writable_byte_size_(dwSize) LPVOID WINAPI VirtualAlloc
(
    _In_opt_ LPVOID lpAddress, _In_ SIZE_T dwSize, _In_ DWORD flAllocationType, _In_ DWORD flProtect
);

WINBASEAPI BOOL WINAPI VirtualFree
(
    _Pre_notnull_ _When_(dwFreeType == MEM_DECOMMIT, _Post_invalid_) _When_(dwFreeType == MEM_RELEASE, _Post_ptr_invalid_) LPVOID lpAddress,
    _In_ SIZE_T dwSize,
    _In_ DWORD dwFreeType
);

// winnt.h
#define PAGE_READWRITE  0x04
#define MEM_COMMIT      0x1000
#define MEM_RESERVE     0x2000
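
A small usage sketch (mirroring the call in main() above): the device flags are OR-ed together, and cudaSetDeviceFlags() must run before the CUDA context for that device is created.

cudaSetDevice(0);                                                        // select the device first
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceMapHost);  // then set its flags, before any context is created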

 

● Steps for using generic page-aligned, pinned host memory

#define CEIL(x,y) (((x) - 1) / (y) + 1)

int sizeByte = sizeof(int) * 16 * 1024 * 1024;
int align = 4096;
int *p, *pAlign;
p = (int *)VirtualAlloc(NULL, (sizeByte + align), MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
pAlign = (int *)(CEIL((size_t)p, align) * align);   // round the pointer up to the next 4 KB boundary
cudaHostRegister(pAlign, sizeByte, cudaHostRegisterMapped);

...

cudaHostUnregister(pAlign);
VirtualFree(p, 0, MEM_RELEASE);
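
Because the block is registered with cudaHostRegisterMapped (and assuming the cudaDeviceMapHost device flag was set beforehand), a kernel can also address it directly. A minimal sketch, reusing pAlign from the steps above; the name dAlign is mine:

int *dAlign = NULL;
// Query the device-side alias of the mapped host memory; kernels can then
// read and write dAlign directly (zero-copy access across the bus).
cudaHostGetDevicePointer((void **)&dAlign, pAlign, 0);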

 

● Timing with cudaEventCreateWithFlags(), which differs slightly from the earlier cudaEventCreate().

float elapsed_time = 0.0f;
cudaEvent_t start_event, stop_event;
cudaEventCreateWithFlags(&start_event, cudaEventBlockingSync);
cudaEventCreateWithFlags(&stop_event, cudaEventBlockingSync);
cudaEventRecord(start_event, 0);

...

cudaEventRecord(stop_event, 0);
cudaEventSynchronize(stop_event);
cudaEventElapsedTime(&elapsed_time, start_event, stop_event);

cudaEventDestroy(start_event);
cudaEventDestroy(stop_event);
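
The flag changes how the host waits rather than what is measured: with cudaEventBlockingSync, cudaEventSynchronize() puts the calling CPU thread to sleep until the event completes instead of spin-waiting (the default), trading a little wake-up latency for much lower CPU usage. cudaEventCreate() behaves like cudaEventCreateWithFlags() with cudaEventDefault.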

 


posted on 2017-11-30 23:16  爨爨爨好