1 #include <stdio.h>
2 #include <cuda_runtime.h>
3 #include <device_launch_parameters.h>
4 #include <book.h>
5 #include <gputimer.h>
6 #define N (33 * 1024)
7
/*
 * Element-wise integer vector addition: c[i] = a[i] + b[i] for every i in [0, N).
 *
 * Each thread begins at its global 1D index and then advances by the total
 * number of threads in the launch (blockDim.x * gridDim.x), so every element
 * is handled regardless of how many blocks/threads were launched. Only the
 * x dimension of the launch configuration is used.
 *
 * a, b : input arrays, read at indices below N.
 * c    : output array, written at indices below N.
 * All three pointers are dereferenced in device code.
 */
__global__ void add(int *a, int *b, int *c) {
    // Distance between two consecutive elements handled by the same thread.
    const int stride = blockDim.x * gridDim.x;

    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < N; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}
15
/*
 * Host driver for the `add` kernel.
 *
 * Fills two N-element host arrays (a[i] = i, b[i] = i*i), copies them to the
 * device, launches `add` with 128 blocks of 128 threads, copies the result
 * back and compares every element against a[i] + b[i] computed on the host.
 *
 * Prints one "Error: ..." line per mismatching element, or "We did it!" when
 * all elements match.
 *
 * Returns 0 when every element matched, 1 otherwise. CUDA runtime failures
 * are routed through HANDLE_ERROR (provided by book.h).
 */
int main(void) {
    // Static storage: three N-element int arrays are large for automatic
    // (stack) storage, so keep them out of main's stack frame.
    static int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Device buffers for the two inputs and the output.
    HANDLE_ERROR(cudaMalloc((void **)&dev_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void **)&dev_c, N * sizeof(int)));

    // Host-side input data.
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * i;
    }

    HANDLE_ERROR(cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    // 128 * 128 threads is fewer than N; the kernel's stride loop covers the rest.
    add<<<128, 128>>>(dev_a, dev_b, dev_c);
    // A launch statement yields no status of its own; ask the runtime whether
    // the launch itself was rejected (e.g. invalid configuration).
    HANDLE_ERROR(cudaGetLastError());

    // Blocking copy back to the host; errors raised while the kernel was
    // executing are reported here as well.
    HANDLE_ERROR(cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Verify the device result against the host computation.
    bool success = true;
    for (int i = 0; i < N; i++) {
        if (a[i] + b[i] != c[i]) {
            printf("Error: %d + %d != %d\n", a[i], b[i], c[i]);
            success = false;
        }
    }
    if (success)
        printf("We did it!\n");

    // Release device memory, checking the return codes like every other call.
    HANDLE_ERROR(cudaFree(dev_a));
    HANDLE_ERROR(cudaFree(dev_b));
    HANDLE_ERROR(cudaFree(dev_c));

    // Let the exit status reflect the verification outcome.
    return success ? 0 : 1;
}