Comparing zero-copy memory with ordinary copy
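The program below computes a large dot product twice and times each run: malloc_test() uses the conventional path (cudaMalloc for device buffers plus explicit cudaMemcpy transfers in both directions), while cuda_host_alloc_test() uses zero-copy memory (pinned host buffers allocated with cudaHostAlloc and the cudaHostAllocMapped flag, which the kernel accesses directly through device pointers obtained from cudaHostGetDevicePointer, so no cudaMemcpy is needed). The listing depends on the HANDLE_ERROR macro from the common/book.h header that ships with the CUDA by Example sample code.

Before the full listing, here is a minimal sketch of just the zero-copy pattern; the scale kernel, buffer size, and variable names are illustrative placeholders, not taken from the listing:

// Minimal zero-copy sketch (illustrative; error checking omitted for brevity).
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(float *data, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) data[i] *= 2.0f;               // reads/writes mapped host memory over the bus
}

int main() {
    const int n = 1 << 20;

    // 1. Enable mapped (zero-copy) host memory before the CUDA context is created.
    cudaSetDeviceFlags(cudaDeviceMapHost);

    // 2. Allocate pinned host memory that is also mapped into the device address space.
    float *host_ptr;
    cudaHostAlloc((void**)&host_ptr, n * sizeof(float), cudaHostAllocMapped);
    for (int i = 0; i < n; i++) host_ptr[i] = 1.0f;

    // 3. Ask for the device-side alias of the same buffer.
    float *dev_ptr;
    cudaHostGetDevicePointer((void**)&dev_ptr, host_ptr, 0);

    // 4. Launch on the mapped pointer; no cudaMemcpy is involved.
    scale<<<(n + 255) / 256, 256>>>(dev_ptr, n);
    cudaDeviceSynchronize();                  // results are only visible to the CPU after a sync

    printf("host_ptr[0] = %f\n", host_ptr[0]); // expected: 2.000000
    cudaFreeHost(host_ptr);
    return 0;
}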


/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#include "../common/book.h"
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#define imin(a,b) (a<b?a:b)

const int N = 33 * 1024 * 1024;
const int threadsPerBlock = 256;
const int blocksPerGrid =
    imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);

__global__ void dot(int size, float *a, float *b, float *c) {
    __shared__ float cache[threadsPerBlock];
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    int cacheIndex = threadIdx.x;

    float temp = 0;
    while (tid < size) {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    // set the cache values
    cache[cacheIndex] = temp;

    // synchronize threads in this block
    __syncthreads();

    // for reductions, threadsPerBlock must be a power of 2
    // because of the following code
    int i = blockDim.x / 2;
    while (i != 0) {
        if (cacheIndex < i)
            cache[cacheIndex] += cache[cacheIndex + i];
        __syncthreads();
        i /= 2;
    }

    if (cacheIndex == 0)
        c[blockIdx.x] = cache[0];
}

float malloc_test(int size) {
    cudaEvent_t     start, stop;
    float           *a, *b, c, *partial_c;
    float           *dev_a, *dev_b, *dev_partial_c;
    float           elapsedTime;

    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // allocate memory on the CPU side
    a = (float*)malloc(size*sizeof(float));
    b = (float*)malloc(size*sizeof(float));
    partial_c = (float*)malloc(blocksPerGrid*sizeof(float));

    // allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&dev_a,
        size*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_b,
        size*sizeof(float)));
    HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
        blocksPerGrid*sizeof(float)));

    // fill in the host memory with data
    for (int i = 0; i < size; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));
    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float),
        cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float),
        cudaMemcpyHostToDevice));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);
    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
        blocksPerGrid*sizeof(float),
        cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));

    // finish up on the CPU side
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_partial_c));

    // free memory on the CPU side
    free(a);
    free(b);
    free(partial_c);

    // free events
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    printf("Value calculated:  %f\n", c);

    return elapsedTime;
}

float cuda_host_alloc_test(int size) {
    cudaEvent_t     start, stop;
    float           *a, *b, c, *partial_c;
    float           *dev_a, *dev_b, *dev_partial_c;
    float           elapsedTime;

    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));

    // allocate the memory on the CPU
    HANDLE_ERROR(cudaHostAlloc((void**)&a,
        size*sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&b,
        size*sizeof(float),
        cudaHostAllocWriteCombined |
        cudaHostAllocMapped));
    HANDLE_ERROR(cudaHostAlloc((void**)&partial_c,
        blocksPerGrid*sizeof(float),
        cudaHostAllocMapped));

    // find out the GPU pointers
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
    HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c,
        partial_c, 0));

    // fill in the host memory with data
    for (int i = 0; i < size; i++) {
        a[i] = i;
        b[i] = i * 2;
    }

    HANDLE_ERROR(cudaEventRecord(start, 0));

    dot<<<blocksPerGrid, threadsPerBlock>>>(size, dev_a, dev_b,
        dev_partial_c);

    HANDLE_ERROR(cudaThreadSynchronize());
    HANDLE_ERROR(cudaEventRecord(stop, 0));
    HANDLE_ERROR(cudaEventSynchronize(stop));
    HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
        start, stop));

    // finish up on the CPU side
    c = 0;
    for (int i = 0; i < blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR(cudaFreeHost(a));
    HANDLE_ERROR(cudaFreeHost(b));
    HANDLE_ERROR(cudaFreeHost(partial_c));

    // free events
    HANDLE_ERROR(cudaEventDestroy(start));
    HANDLE_ERROR(cudaEventDestroy(stop));

    printf("Value calculated:  %f\n", c);

    return elapsedTime;
}


int main(void) {
    cudaDeviceProp  prop;
    int whichDevice;
    HANDLE_ERROR(cudaGetDevice(&whichDevice));
    HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
    if (prop.canMapHostMemory != 1) {
        printf("Device can not map memory.\n");
        return 0;
    }

    float           elapsedTime;

    HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));

    // try it with malloc
    elapsedTime = malloc_test(N);
    printf("Time using cudaMalloc:  %3.1f ms\n",
        elapsedTime);

    // now try it with cudaHostAlloc
    elapsedTime = cuda_host_alloc_test(N);
    printf("Time using cudaHostAlloc:  %3.1f ms\n",
        elapsedTime);
}
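A note on the synchronization in cuda_host_alloc_test(): because the kernel writes its partial sums straight into mapped host memory, the CPU must wait for the kernel to finish before stopping the timer and reading partial_c. The listing does this with cudaThreadSynchronize(), which matches the 2010-era toolkit it was written for; on current CUDA toolkits that call is deprecated, and cudaDeviceSynchronize() is the drop-in replacement, e.g.:

    HANDLE_ERROR(cudaDeviceSynchronize());   // wait for the dot kernel before reading partial_c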

 

posted @ 2014-09-27 14:52 青竹居士