1 /*
2 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 *
4 * NVIDIA Corporation and its licensors retain all intellectual property and
5 * proprietary rights in and to this software and related documentation.
6 * Any use, reproduction, disclosure, or distribution of this software
7 * and related documentation without an express license agreement from
8 * NVIDIA Corporation is strictly prohibited.
9 *
10 * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 * associated with this source code for terms and conditions that govern
12 * your use of this NVIDIA software.
13 *
14 */
15
16
17 #include "../common/book.h"
18 #include "cuda.h"
19 #include "cuda_runtime.h"
20 #include "device_launch_parameters.h"
21 #include "device_functions.h"
22 #define imin(a,b) (a<b?a:b)
23
24 const int N = 33 * 1024 * 1024;
25 const int threadsPerBlock = 256;
26 const int blocksPerGrid =
27 imin(32, (N + threadsPerBlock - 1) / threadsPerBlock);
28
29
30 __global__ void dot(int size, float *a, float *b, float *c) {
31 __shared__ float cache[threadsPerBlock];
32 int tid = threadIdx.x + blockIdx.x * blockDim.x;
33 int cacheIndex = threadIdx.x;
34
35 float temp = 0;
36 while (tid < size) {
37 temp += a[tid] * b[tid];
38 tid += blockDim.x * gridDim.x;
39 }
40
41 // set the cache values
42 cache[cacheIndex] = temp;
43
44 // synchronize threads in this block
45 __syncthreads();
46
47 // for reductions, threadsPerBlock must be a power of 2
48 // because of the following code
49 int i = blockDim.x / 2;
50 while (i != 0) {
51 if (cacheIndex < i)
52 cache[cacheIndex] += cache[cacheIndex + i];
53 __syncthreads();
54 i /= 2;
55 }
56
57 if (cacheIndex == 0)
58 c[blockIdx.x] = cache[0];
59 }
60
61
62 float malloc_test(int size) {
63 cudaEvent_t start, stop;
64 float *a, *b, c, *partial_c;
65 float *dev_a, *dev_b, *dev_partial_c;
66 float elapsedTime;
67
68 HANDLE_ERROR(cudaEventCreate(&start));
69 HANDLE_ERROR(cudaEventCreate(&stop));
70
71 // allocate memory on the CPU side
72 a = (float*)malloc(size*sizeof(float));
73 b = (float*)malloc(size*sizeof(float));
74 partial_c = (float*)malloc(blocksPerGrid*sizeof(float));
75
76 // allocate the memory on the GPU
77 HANDLE_ERROR(cudaMalloc((void**)&dev_a,
78 size*sizeof(float)));
79 HANDLE_ERROR(cudaMalloc((void**)&dev_b,
80 size*sizeof(float)));
81 HANDLE_ERROR(cudaMalloc((void**)&dev_partial_c,
82 blocksPerGrid*sizeof(float)));
83
84 // fill in the host memory with data
85 for (int i = 0; i<size; i++) {
86 a[i] = i;
87 b[i] = i * 2;
88 }
89
90 HANDLE_ERROR(cudaEventRecord(start, 0));
91 // copy the arrays 'a' and 'b' to the GPU
92 HANDLE_ERROR(cudaMemcpy(dev_a, a, size*sizeof(float),
93 cudaMemcpyHostToDevice));
94 HANDLE_ERROR(cudaMemcpy(dev_b, b, size*sizeof(float),
95 cudaMemcpyHostToDevice));
96
97 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b,
98 dev_partial_c);
99 // copy the array 'c' back from the GPU to the CPU
100 HANDLE_ERROR(cudaMemcpy(partial_c, dev_partial_c,
101 blocksPerGrid*sizeof(float),
102 cudaMemcpyDeviceToHost));
103
104 HANDLE_ERROR(cudaEventRecord(stop, 0));
105 HANDLE_ERROR(cudaEventSynchronize(stop));
106 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
107 start, stop));
108
109 // finish up on the CPU side
110 c = 0;
111 for (int i = 0; i<blocksPerGrid; i++) {
112 c += partial_c[i];
113 }
114
115 HANDLE_ERROR(cudaFree(dev_a));
116 HANDLE_ERROR(cudaFree(dev_b));
117 HANDLE_ERROR(cudaFree(dev_partial_c));
118
119 // free memory on the CPU side
120 free(a);
121 free(b);
122 free(partial_c);
123
124 // free events
125 HANDLE_ERROR(cudaEventDestroy(start));
126 HANDLE_ERROR(cudaEventDestroy(stop));
127
128 printf("Value calculated: %f\n", c);
129
130 return elapsedTime;
131 }
132
133
134 float cuda_host_alloc_test(int size) {
135 cudaEvent_t start, stop;
136 float *a, *b, c, *partial_c;
137 float *dev_a, *dev_b, *dev_partial_c;
138 float elapsedTime;
139
140 HANDLE_ERROR(cudaEventCreate(&start));
141 HANDLE_ERROR(cudaEventCreate(&stop));
142
143 // allocate the memory on the CPU
144 HANDLE_ERROR(cudaHostAlloc((void**)&a,
145 size*sizeof(float),
146 cudaHostAllocWriteCombined |
147 cudaHostAllocMapped));
148 HANDLE_ERROR(cudaHostAlloc((void**)&b,
149 size*sizeof(float),
150 cudaHostAllocWriteCombined |
151 cudaHostAllocMapped));
152 HANDLE_ERROR(cudaHostAlloc((void**)&partial_c,
153 blocksPerGrid*sizeof(float),
154 cudaHostAllocMapped));
155
156 // find out the GPU pointers
157 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_a, a, 0));
158 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_b, b, 0));
159 HANDLE_ERROR(cudaHostGetDevicePointer(&dev_partial_c,
160 partial_c, 0));
161
162 // fill in the host memory with data
163 for (int i = 0; i<size; i++) {
164 a[i] = i;
165 b[i] = i * 2;
166 }
167
168 HANDLE_ERROR(cudaEventRecord(start, 0));
169
170 dot << <blocksPerGrid, threadsPerBlock >> >(size, dev_a, dev_b,
171 dev_partial_c);
172
173 HANDLE_ERROR(cudaThreadSynchronize());
174 HANDLE_ERROR(cudaEventRecord(stop, 0));
175 HANDLE_ERROR(cudaEventSynchronize(stop));
176 HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,
177 start, stop));
178
179 // finish up on the CPU side
180 c = 0;
181 for (int i = 0; i<blocksPerGrid; i++) {
182 c += partial_c[i];
183 }
184
185 HANDLE_ERROR(cudaFreeHost(a));
186 HANDLE_ERROR(cudaFreeHost(b));
187 HANDLE_ERROR(cudaFreeHost(partial_c));
188
189 // free events
190 HANDLE_ERROR(cudaEventDestroy(start));
191 HANDLE_ERROR(cudaEventDestroy(stop));
192
193 printf("Value calculated: %f\n", c);
194
195 return elapsedTime;
196 }
197
198
199 int main(void) {
200 cudaDeviceProp prop;
201 int whichDevice;
202 HANDLE_ERROR(cudaGetDevice(&whichDevice));
203 HANDLE_ERROR(cudaGetDeviceProperties(&prop, whichDevice));
204 if (prop.canMapHostMemory != 1) {
205 printf("Device can not map memory.\n");
206 return 0;
207 }
208
209 float elapsedTime;
210
211 HANDLE_ERROR(cudaSetDeviceFlags(cudaDeviceMapHost));
212
213 // try it with malloc
214 elapsedTime = malloc_test(N);
215 printf("Time using cudaMalloc: %3.1f ms\n",
216 elapsedTime);
217
218 // now try it with cudaHostAlloc
219 elapsedTime = cuda_host_alloc_test(N);
220 printf("Time using cudaHostAlloc: %3.1f ms\n",
221 elapsedTime);
222 }