▶ CUDA 动态并行实现快排算法(单线程的递归调用)
▶ 源代码:动态并行递归调用线程块。要点:添加 -rdc=true 选项(生成 relocatable device code,相当于执行分离编译),以及链接库 cudadevrt.lib (用于动态并行,不同于运行时库 cudart.lib)
1 #include <stdio.h> 2 #include <cuda.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_cuda.h> 6 #include <helper_string.h> 7 8 __device__ int g_blockId = 0; // 线程块的全局编号,供所有线程读写 9 10 __device__ void print_info(int depth, int blockId, int parent_threadId, int parent_blockId) // 打印当前线程块信息,包括深度,当前块号, 11 { 12 if (threadIdx.x == 0) 13 { 14 if (depth == 0) 15 printf("BLOCK %d launched by the host\n", blockId); 16 else 17 { 18 char buffer[32]; 19 for (int i = 0; i < depth; ++i) // 对应更多层级,每层前面都有相应层数的 "| " 20 { 21 buffer[3 * i + 0] = '+'; 22 buffer[3 * i + 1] = ' '; 23 buffer[3 * i + 2] = ' '; 24 } 25 buffer[3 * depth] = '\0'; 26 printf("%sBLOCK %d launched by thread %d of block %d\n", buffer, blockId, parent_threadId, parent_blockId); 27 } 28 } 29 __syncthreads(); 30 } 31 32 __global__ void cdp_kernel(int max_depth, int depth, int parent_threadId, int parent_blockId)// 线程块递归 33 { 34 __shared__ int s_blockId; // 当前线程块的编号 35 36 if (threadIdx.x == 0) // 读取当前 g_blockId 到 s_blockId 中,并将 g_blockId 加一 37 s_blockId = atomicAdd(&g_blockId, 1); 38 __syncthreads(); 39 40 print_info(depth, s_blockId, parent_threadId, parent_blockId); // 打印当前线程块信息, 41 42 if (++depth >= max_depth) // 达到最大递归深度则退出,否则继续调用 cdp_kernel() 43 return; 44 cdp_kernel << <gridDim.x, blockDim.x >> >(max_depth, depth, threadIdx.x, s_blockId); 45 } 46 47 int main(int argc, char **argv) 48 { 49 printf("CUDA Dynamic Parallelism\n"); 50 int max_depth = 3; 51 int device_count = 0, device = -1; 52 53 if (checkCmdLineFlag(argc, (const char **)argv, "help") || checkCmdLineFlag(argc, (const char **)argv, "h"))// 帮助模式 54 { 55 printf("Usage: %s depth=<max_depth>\t(where max_depth is a value between 1 and 8).\n", argv[0]); 56 exit(EXIT_SUCCESS); 57 } 58 if (checkCmdLineFlag(argc, (const char **)argv, "depth")) // 手动设置递归深度 59 { 60 max_depth = getCmdLineArgumentInt(argc, (const char **)argv, "depth"); 61 if (max_depth < 1 || max_depth > 8) 62 { 63 printf("depth parameter has to be between 1 and 8\n"); 64 exit(EXIT_FAILURE); 65 } 66 } 67 if (checkCmdLineFlag(argc, (const char **)argv, "device")) // 命令行指定设备 68 { 69 device = getCmdLineArgumentInt(argc, (const char **)argv, "device"); 70 cudaDeviceProp properties; 71 cudaGetDeviceProperties(&properties, device); 72 if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) 73 printf("Running on GPU %d (%s)\n", device, properties.name); 74 else 75 { 76 printf("ERROR: required GPU with compute SM 3.5 or higher.\nCurrent GPU compute SM %d.%d\n", properties.major, properties.minor); 77 exit(EXIT_FAILURE); 78 } 79 } 80 else 81 { 82 cudaGetDeviceCount(&device_count); 83 for (int i = 0; i < device_count; ++i) 84 { 85 cudaDeviceProp properties; 86 cudaGetDeviceProperties(&properties, i); 87 if (properties.major > 3 || (properties.major == 3 && properties.minor >= 5)) 88 { 89 device = i; 90 printf("Running on GPU %d (%s)", i, properties.name); 91 break; 92 } 93 printf("Running on GPU %d (%s) does not support CUDA Dynamic Parallelism", i, properties.name); 94 } 95 } 96 if (device == -1) 97 { 98 printf("required GPU with compute SM 3.5 or higher."); 99 exit(EXIT_WAIVED); 100 } 101 cudaSetDevice(device); 102 103 cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, max_depth); 104 printf("Launching cdp_kernel() with CUDA Dynamic Parallelism:\n\n"); 105 cdp_kernel << <2, 2 >> >(max_depth, 0, 0, -1); 106 cudaGetLastError(); 107 cudaDeviceSynchronize(); 108 109 getchar(); 110 exit(EXIT_SUCCESS); 111 }
● 输出结果:主机调用 2 个线程块,每个线程块 2 个线程,每个线程按同样规模递归调用,共 2*4 个二级核函数,2*4*4 个三级核函数,一共 42 个线程块
CUDA Dynamic Parallelism Running on GPU 0 (GeForce GTX 1070)Launching cdp_kernel() with CUDA Dynamic Parallelism: BLOCK 0 launched by the host BLOCK 1 launched by the host + BLOCK 2 launched by thread 0 of block 1 + BLOCK 3 launched by thread 0 of block 1 + BLOCK 4 launched by thread 0 of block 0 + BLOCK 5 launched by thread 0 of block 0 + + BLOCK 10 launched by thread 0 of block 3 + + BLOCK 11 launched by thread 0 of block 3 + + BLOCK 7 launched by thread 0 of block 2 + + BLOCK 6 launched by thread 0 of block 2 + + BLOCK 12 launched by thread 0 of block 5 + + BLOCK 13 launched by thread 0 of block 5 + + BLOCK 8 launched by thread 0 of block 4 + + BLOCK 9 launched by thread 0 of block 4 + + BLOCK 15 launched by thread 1 of block 3 + + BLOCK 14 launched by thread 1 of block 3 + + BLOCK 19 launched by thread 1 of block 2 + + BLOCK 16 launched by thread 1 of block 5 + + BLOCK 17 launched by thread 1 of block 5 + + BLOCK 18 launched by thread 1 of block 2 + + BLOCK 21 launched by thread 1 of block 4 + + BLOCK 20 launched by thread 1 of block 4 + BLOCK 22 launched by thread 1 of block 1 + BLOCK 23 launched by thread 1 of block 1 + BLOCK 24 launched by thread 1 of block 0 + BLOCK 25 launched by thread 1 of block 0 + + BLOCK 28 launched by thread 0 of block 23 + + BLOCK 27 launched by thread 0 of block 22 + + BLOCK 29 launched by thread 0 of block 24 + + BLOCK 26 launched by thread 0 of block 23 + + BLOCK 31 launched by thread 0 of block 24 + + BLOCK 30 launched by thread 0 of block 22 + + BLOCK 33 launched by thread 0 of block 25 + + BLOCK 32 launched by thread 0 of block 25 + + BLOCK 34 launched by thread 1 of block 23 + + BLOCK 35 launched by thread 1 of block 23 + + BLOCK 36 launched by thread 1 of block 22 + + BLOCK 37 launched by thread 1 of block 22 + + BLOCK 38 launched by thread 1 of block 25 + + BLOCK 39 launched by thread 1 of block 25 + + BLOCK 40 launched by thread 1 of block 24 + + BLOCK 41 launched by thread 1 of block 24
▶ 涨姿势:
● 在核函数中递归地调用核函数,注意函数调用的格式
▶ 源代码:动态并行实现快排算法,输出结果只有 Finish!
1 #include <stdio.h> 2 #include <cuda.h> 3 #include <cuda_runtime.h> 4 #include "device_launch_parameters.h" 5 #include <helper_cuda.h> 6 #include <helper_string.h> 7 8 #define MAX_DEPTH 16 9 #define INSERTION_SORT 32 10 11 __device__ void selection_sort(unsigned int *data, int left, int right) //选择排序,单线程完成 12 { 13 for (int i = left; i <= right; ++i) 14 { 15 unsigned min_val = data[i]; 16 int min_idx = i; 17 for (int j = i + 1; j <= right; ++j) // 找最小元素及其下标 18 { 19 unsigned val_j = data[j]; 20 if (val_j < min_val) 21 { 22 min_idx = j; 23 min_val = val_j; 24 } 25 } 26 if (i != min_idx) // 交换第 i 号元素到指定的位置上 27 { 28 data[min_idx] = data[i]; 29 data[i] = min_val; 30 } 31 } 32 } 33 34 __global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth) // 快排主体,内含递归调用 35 { 36 if (depth >= MAX_DEPTH || right - left <= INSERTION_SORT) // 递归深度达到 MAX_DEPTH 或者 数组中元素个数不多于 INSERTION_SORT 时使用选排 37 { 38 selection_sort(data, left, right); 39 return; 40 } 41 unsigned int *lptr = data + left, *rptr = data + right, pivot = data[(left + right) / 2]; 42 while (lptr <= rptr) 43 { 44 unsigned int lval = *lptr, rval = *rptr; // 指定左指针指向的值和右指针指向的值 45 while (lval < pivot) // 递增左指针,等价于 lptr++; lval = *lptr; 46 lval = *(++lptr); 47 while (rval > pivot) // 递减右指针 48 rval = *(--rptr); 49 if (lptr <= rptr) // 交换左右指针指向的值 50 { 51 *lptr++ = rval; 52 *rptr-- = lval; 53 } 54 } 55 if (left < rptr - data) // 将左右分区放到两个不同的流中 56 { 57 cudaStream_t s0; 58 cudaStreamCreateWithFlags(&s0, cudaStreamNonBlocking); // 指定该流不与 0 号流进行同步 59 cdp_simple_quicksort << < 1, 1, 0, s0 >> >(data, left, rptr - data, depth + 1); 60 cudaStreamDestroy(s0); 61 } 62 if (lptr - data < right) 63 { 64 cudaStream_t s1; 65 cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking); 66 cdp_simple_quicksort << < 1, 1, 0, s1 >> >(data, lptr - data, right, depth + 1); 67 cudaStreamDestroy(s1); 68 } 69 } 70 71 void run_qsort(unsigned int *data, unsigned int n) // 快排入口 72 { 73 cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH); // 设置最大递归深度 74 cdp_simple_quicksort << < 1, 1 >> >(data, 0, n - 1, 0); 75 cudaDeviceSynchronize(); 76 } 77 78 int main(int argc, char **argv) 79 { 80 cudaSetDevice(0); 81 const int n = 1024; 82 83 unsigned int *h_data = (unsigned int *)malloc(sizeof(unsigned int) * n); 84 srand(2047); 85 for (unsigned i = 0; i < n; i++) 86 h_data[i] = rand() % n; 87 88 89 unsigned int *d_data; 90 cudaMalloc((void **)&d_data, n * sizeof(unsigned int)); 91 cudaMemcpy(d_data, h_data, n * sizeof(unsigned int), cudaMemcpyHostToDevice); 92 93 run_qsort(d_data, n); 94 95 cudaMemcpy(h_data, d_data, n * sizeof(unsigned), cudaMemcpyDeviceToHost); 96 97 for (int i = 1; i < n; ++i) 98 { 99 if (h_data[i - 1] > h_data[i]) 100 { 101 printf("Error at i == %d, h_data[i-1] == %d, h_data[i] == %d\n", h_data[i - 1], h_data[i]); 102 break; 103 } 104 } 105 printf("Finish!\n"); 106 107 free(h_data); 108 cudaFree(d_data); 109 getchar(); 110 exit(EXIT_SUCCESS); 111 }
▶ 新姿势:
● checkCmdLineFlag 用于检验函数参数 argv 是否等于字符串 string_ref(定义于 helper_string.h 中)
1 inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) 2 { 3 bool bFound = false; 4 if (argc >= 1) 5 { 6 for (int i = 1; i < argc; i++) 7 { 8 int string_start = stringRemoveDelimiter('-', argv[i]); 9 const char *string_argv = &argv[i][string_start], 10 const char*equal_pos = strchr(string_argv, '='); 11 int argv_length = (int)(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 12 int length = (int)strlen(string_ref); 13 if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) 14 { 15 bFound = true; 16 continue; 17 } 18 } 19 } 20 return bFound; 21 } 22 23 inline int stringRemoveDelimiter(char delimiter, const char *string) // 去除特定的符号,上述函数的中用于去除参数前面的 - 或 -- 24 { 25 int string_start = 0; 26 while (string[string_start] == delimiter) 27 string_start++; 28 if (string_start >= (int)strlen(string) - 1) 29 return 0; 30 return string_start; 31 } 32 33 #define STRNCASECMP _strnicmp // 比较字符串(定义于string.h中) 34 35 _ACRTIMP int __cdecl _strnicmp(_In_reads_or_z_(_MaxCount) char const* _String1, _In_reads_or_z_(_MaxCount) char const* _String2, _In_ size_t _MaxCount);
● getCmdLineArgumentInt 用于提取函数参数 argv 中的整数(定义于 helper_string.h 中)
1 inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) 2 { 3 bool bFound = false; 4 int value = -1; 5 if (argc >= 1) 6 { 7 for (int i = 1; i < argc; i++) 8 { 9 int string_start = stringRemoveDelimiter('-', argv[i]); 10 const char *string_argv = &argv[i][string_start]; 11 int length = (int)strlen(string_ref); 12 if (!STRNCASECMP(string_argv, string_ref, length)) 13 { 14 if (length + 1 <= (int)strlen(string_argv)) 15 { 16 int auto_inc = (string_argv[length] == '=') ? 1 : 0; 17 value = atoi(&string_argv[length + auto_inc]); 18 } 19 else 20 value = 0; 21 bFound = true; 22 continue; 23 } 24 } 25 } 26 if (bFound) 27 return value; 28 return 0; 29 }
● 指定最大递归深度
extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
● 带有标识符的 cudaStreamCreateWithFlags ,设置流的优先级
extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
■ 对比 cudaStreamCreate
extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
浙公网安备 33010602011771号