0_Simple__cppIntegration - 爨爨爨好

公告

▶ 分离编译【留坑，在 Linux 上用命令行试一下】

▶ 源代码：

 1 // cppIntegration_gold.cpp
 2 #include <vector_types.h>
 3 
 4 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
 5 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
 6 
 // CPU reference for the char path: writes idata[i] - 10 into reference[i]
 // for each of the first `len` elements.
 //   reference  output buffer, at least `len` chars
 //   idata      input buffer, at least `len` chars (only read here)
 //   len        number of elements to process; 0 leaves `reference` untouched
 void computeGold(char *reference, char *idata, const unsigned int len)
 {
     const char *src = idata;
     const char *const srcEnd = idata + len;
     char *dst = reference;

     // Walk both buffers in lock-step; the int result of the subtraction is
     // narrowed back to char on assignment.
     while (src != srcEnd)
     {
         *dst = *src - 10;
         ++dst;
         ++src;
     }
 }
12 
13 void computeGold2(int2 *reference, int2 *idata, const unsigned int len)
14 {
15     for (unsigned int i = 0; i < len; ++i)
16     {
17         reference[i].x = idata[i].x - idata[i].y;
18         reference[i].y = idata[i].y;
19     }
20 }

 1 // cppIntegration.cu
 2 #include <stdlib.h>
 3 #include <stdio.h>
 4 #include <string.h>
 5 #include <math.h>
 6 #include <assert.h>
 7 #include <cuda_runtime.h>
 8 #include <helper_cuda.h>
 9 #include <helper_functions.h>
10 
11 #ifndef MAX
12 #define MAX(a,b) (a > b ? a : b)
13 #endif
14 
15 extern "C" void computeGold(char *reference, char *idata, const unsigned int len);
16 extern "C" void computeGold2(int2 *reference, int2 *idata, const unsigned int len);
17 
// GPU counterpart of computeGold(): each thread loads one 32-bit word (four
// packed chars) and subtracts 10 from every byte independently.
//
// Launch layout: indexing uses threadIdx.x only, so the kernel expects a
// single 1-D block with one thread per 32-bit word of g_data. There is no
// bounds check; the caller must launch exactly (byte length / 4) threads.
//
//   g_data  device buffer, read and overwritten in place
__global__ void kernel(int *g_data)
{
    const unsigned int tid = threadIdx.x;

    // Work on an unsigned copy: right-shifting a signed int sign-extends, so a
    // byte >= 0x80, or a byte that goes negative after the subtraction, would
    // otherwise spill set bits into the neighbouring bytes when OR-ed back.
    const unsigned int word = static_cast<unsigned int>(g_data[tid]);
    unsigned int result = 0u;

    for (unsigned int shift = 0u; shift < 32u; shift += 8u)
    {
        const unsigned int byteValue = (word >> shift) & 0xFFu;
        // Mask after subtracting so each byte wraps modulo 256 on its own,
        // matching the per-char arithmetic of the CPU reference.
        result |= ((byteValue - 10u) & 0xFFu) << shift;
    }

    g_data[tid] = static_cast<int>(result);
}
26 
// GPU counterpart of computeGold2(): each thread handles one int2 element and
// replaces its x component with x - y; the y component is left as it was.
//
// Launch layout: indexing uses threadIdx.x only, so the kernel expects a
// single 1-D block with one thread per element. There is no bounds check.
//
//   g_data  device array of int2, updated in place
__global__ void kernel2(int2 *g_data)
{
    int2 *const element = g_data + threadIdx.x;

    // Snapshot both components first, then store only the updated x.
    const int2 snapshot = *element;
    element->x = snapshot.x - snapshot.y;
}
33 
// Runs both kernels on the supplied data, prints the CPU reference and the GPU
// results, and reports whether they agree.
//
//   data       host buffer of `len` chars; overwritten with the GPU result
//   data_int2  host buffer of `len` int2 values; overwritten with the GPU result
//   len        element count of both buffers; must be a multiple of 4 because
//              kernel() consumes four chars per thread
//
// Both kernels are launched as a single block (len / 4 threads for kernel,
// len threads for kernel2), so `len` is also bounded by the device's
// per-block thread limit; a launch beyond it is reported by getLastCudaError.
//
// Returns true when the GPU output matches computeGold()/computeGold2() for
// every element (and trivially when len == 0), false otherwise. CUDA API
// failures are reported through checkCudaErrors / getLastCudaError.
extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len)
{
    assert((len % 4) == 0);

    // Nothing to do; launching a kernel with zero threads would be an error.
    if (len == 0)
        return true;

    const unsigned int num_threads = len / 4;
    const unsigned int mem_size = sizeof(char) * len;
    const unsigned int mem_size_int2 = sizeof(int2) * len;

    // Upload both inputs to the device.
    char *d_data = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data, mem_size));
    checkCudaErrors(cudaMemcpy(d_data, data, mem_size, cudaMemcpyHostToDevice));
    int2 *d_data_int2 = NULL;
    checkCudaErrors(cudaMalloc((void **)&d_data_int2, mem_size_int2));
    checkCudaErrors(cudaMemcpy(d_data_int2, data_int2, mem_size_int2, cudaMemcpyHostToDevice));

    kernel<<<dim3(1, 1, 1), dim3(num_threads, 1, 1)>>>((int *)d_data);
    kernel2<<<dim3(1, 1, 1), dim3(len, 1, 1)>>>(d_data_int2);

    // Launch-configuration errors show up here; faults raised while the
    // kernels execute are surfaced by the synchronize that follows.
    getLastCudaError("Kernel execution failed");
    checkCudaErrors(cudaDeviceSynchronize());

    // The CPU reference must be computed BEFORE the device results are copied
    // back, because the copies below overwrite the host inputs.
    char *reference = (char *)malloc(mem_size);
    int2 *reference2 = (int2 *)malloc(mem_size_int2);
    if (reference == NULL || reference2 == NULL)
    {
        fprintf(stderr, "runTest: host allocation failed\n");
        free(reference);
        free(reference2);
        cudaFree(d_data);
        cudaFree(d_data_int2);
        return false;
    }

    computeGold(reference, data, len);
    // The buffers are not guaranteed to be NUL-terminated, so cap the print at len chars.
    printf("ref char*:%.*s\n", (int)len, reference);
    computeGold2(reference2, data_int2, len);
    printf("ref int2 :");
    for (unsigned int i = 0; i < len; i++)
        printf("%c", reference2[i].x);
    printf("\n");

    // Fetch the GPU results into the caller's buffers.
    checkCudaErrors(cudaMemcpy(data, d_data, mem_size, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(data_int2, d_data_int2, mem_size_int2, cudaMemcpyDeviceToHost));
    printf("gpu char*:%.*s\n", (int)len, data);
    printf("gpu int2 :");
    for (unsigned int i = 0; i < len; i++)
        printf("%c", data_int2[i].x);
    printf("\n");

    // Compare GPU output against the CPU reference.
    bool success = (memcmp(data, reference, mem_size) == 0);
    for (unsigned int i = 0; i < len; i++)
    {
        if (data_int2[i].x != reference2[i].x || data_int2[i].y != reference2[i].y)
            success = false;
    }

    checkCudaErrors(cudaFree(d_data));
    checkCudaErrors(cudaFree(d_data_int2));
    free(reference);
    free(reference2);
    return success;
}

 1 // main.cpp
 2 #include <iostream>
 3 #include <cstdlib>
 4 #include <cuda_runtime.h>
 5 #include <vector_types.h>
 6 #include <helper_cuda.h>
 7 
 8 extern "C" bool runTest(char *data, int2 *data_int2, unsigned int len);
 9 
10 int main()
11 {
12     const int len = 16;
13     int2 i2[16];                    // cuda 内置的 int2 类型
14     char str[len] = { 82, 111, 118, 118,121, 42, 97, 121, 124, 118, 110, 56, 10, 10, 10,  10};    
15     for (int i = 0; i < len; i++)
16     {
17         i2[i].x = str[i];
18         i2[i].y = 10;
19     }    
20     runTest(str, i2, len);
21 
22     getchar();
23     return 0;
24 }

● 输出结果：

ref char*:Hello World.
ref int2 :Hello World.
gpu char*:Hello World.
gpu int2 :Hello World.

▶ 涨姿势：

● cuda 内置的 int2 类型，整数有序对。涉及的定义如下：

// Excerpt of the definitions behind CUDA's built-in int2 type.
// The helper macro expands to `struct tag { members }`, decorated with
// __device_builtin__ and __align__(8). Instantiating it with (int2, int x; int y;)
// therefore declares a struct int2 holding two ints, x and y, and the typedef
// lets the type be written as plain `int2` without the struct keyword.
1 #define __cuda_builtin_vector_align8(tag, members)  \
2     struct __device_builtin__ __align__(8) tag      \
3     {                                               \
4         members                                     \
5     }
6 
7 __cuda_builtin_vector_align8(int2, int x; int y;);
8 
9 typedef __device_builtin__ struct int2 int2;

● 断言宏 assert 和错误检查函数 getLastCudaError

 // Excerpt of the assert macro definition. When `expression` is true, the `||`
 // short-circuits and nothing else is evaluated. Otherwise _wassert is called
 // with the stringized expression, the file name and the line number, the first
 // two widened via _CRT_WIDE. The whole expression is cast to void, so the
 // macro yields no value.
 // NOTE(review): _wassert is defined outside this excerpt; presumably it reports
 // the failure and aborts. Confirm against the CRT documentation.
 1 #define assert(expression) (void)                                                                   \
 2 (                                                                                                   \
 3 (!!(expression)) || (_wassert(_CRT_WIDE(#expression), _CRT_WIDE(__FILE__), (unsigned)(__LINE__)), 0)\
 4 )
 5 
 // Excerpt of the error-check helper that runTest calls after its kernel launches.
 // getLastCudaError(msg) forwards msg together with the call site's __FILE__ and
 // __LINE__ to __getLastCudaError.
 // __getLastCudaError calls cudaGetLastError(). If the result is anything other
 // than cudaSuccess, it prints the file, line, caller-supplied message, numeric
 // error code and cudaGetErrorString() text to stderr, expands DEVICE_RESET
 // (a macro defined outside this excerpt) and terminates the process with
 // EXIT_FAILURE. When no error is pending, it returns without printing anything.
 6 #define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
 7 
 8 inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
 9 {
10     cudaError_t err = cudaGetLastError();
11     if (cudaSuccess != err)
12     {
13         fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
14             file, line, errorMessage, (int)err, cudaGetErrorString(err));
15         DEVICE_RESET
16         exit(EXIT_FAILURE);
17     }
18 }

posted on 2017-10-27 12:10 爨爨爨好阅读(373) 评论(0) 收藏举报

刷新页面返回顶部