爨爨爨好

  博客园  :: 首页  :: 新随笔  :: 联系 :: 订阅 订阅  :: 管理

▶ 协作组,CUDA9.0 的新特性

挂白痴:https://blog.csdn.net/bruce_0712/article/details/79116013

▶ 源代码,如何获得协作组的编号?

 1 #include <stdio.h>
 2 #include "cuda_runtime.h"
 3 #include "device_launch_parameters.h"
 4 #include <cooperative_groups.h>
 5 
 6 #define THREAD_PER_BLOCK 64
 7 
 8 using namespace cooperative_groups;                             // 注意使用命名空间
 9 
10 __device__ int sumReduction(thread_group g, int *x, int val)    // Group-wide sum reduction. Precondition: x points to shared memory with room for g.size() ints, and every thread of g calls this with its own val. Returns the total on rank 0 and -1 on every other rank.
11 {
12     int lane = g.thread_rank();                                 // this thread's rank inside the cooperative group (the tutorial calls it the lane ID)
13 
14     for (int i = g.size() / 2; i > 0; i /= 2)
15     {
16         x[lane] = val;                                          // first iteration: publish the input; later iterations: publish the previous partial sum
17         g.sync();                                               // group barrier: all writes to x[] must be visible before any thread reads
18         if (lane < i)
19             val += x[lane + i];                                 // accumulate into the per-thread local val (upper half folded onto lower half)
20         g.sync();                                               // second barrier: nobody may overwrite x[] (next iteration) while it is still being read
21     }
22     if (g.thread_rank() == 0)                                   // only rank 0 ends up holding the complete sum
23         return val;
24     else
25         return -1;
26 }
27 
28 __global__ void cgkernel()                                      // Demo kernel: reduce over the whole block, then over 16-thread tiles. Expects a 1-D block and blockDim.x * sizeof(int) bytes of dynamic shared memory.
29 {
30     extern __shared__ int workspace[];                          // dynamic shared memory supplied by the launch (3rd <<<>>> argument)
31 
32     thread_block group = this_thread_block();                   // wrap every thread of this block into one cooperative group
33     int groupSize = group.size();                               // group size (number of threads)
34     int input = group.thread_rank();                            // each thread contributes its own in-group rank as input
35     int output = sumReduction(group, workspace, input);         // reduction, using the shared memory directly as the workspace
36     int expectedOutput = (groupSize - 1)*groupSize / 2;         // expected result: 0 + 1 + 2 + ... + 63 = 2016
37 
38     if (group.thread_rank() == 0)                               // rank 0 reports the result and announces the 4-tile phase
39     {
40         printf("\n\tSum of thread 0 ~ %d in group is %d (expected %d)\n", group.size() - 1, output, expectedOutput);
41         printf("\n\tNow creating %d groups, each of size 16 threads:\n", group.size() / 16);
42     }
43     group.sync();                                               // block-wide barrier before the workspace is reused by the tiles
44 
45     thread_block_tile<16> group16 = tiled_partition<16>(group); // partition into tiles of 16 threads (static tile size must be a power of two)
46 
47     int offset = group.thread_rank() - group16.thread_rank();   // byte-free element offset of this tile's slice of the shared workspace (= rank of the tile's first thread)
48     printf("%d -> thread_rank = %d, group16.thread_rank = %d, offset = %d\n", threadIdx.x, group.thread_rank(), group16.thread_rank(), offset);
49     // dim3 group.group_index() prints all (0, 0, 0) here; dim3 group.thread_index() prints the same values as group.thread_rank()
50 
51     input = group16.thread_rank();                              // each thread's rank within its tile becomes the new input
52     output = sumReduction(group16, workspace + offset, input);  // tile reduction; note the per-tile workspace offset
53     expectedOutput = 15 * 16 / 2;                               // expected result: 0 + 1 + 2 + ... + 15 = 120
54 
55     if (group16.thread_rank() == 0)                             // rank 0 of every tile reports its result
56         printf("\n\tSum of all ranks 0..15 in group16 is %d (expected %d)\n", output, expectedOutput);   
57     return;
58 }
59 
60 int main()                                                      // Host side: launch one 64-thread block with 64 ints of dynamic shared memory.
61 {
62     printf("\n\tStart with %d threads.\n", THREAD_PER_BLOCK);
63 
64     cgkernel << <1, THREAD_PER_BLOCK, THREAD_PER_BLOCK * sizeof(int) >> > ();  // 3rd config argument sizes the kernel's extern __shared__ workspace[]
65     cudaDeviceSynchronize();                                    // wait for the kernel (and its device printf output) to finish; NOTE(review): return value is unchecked and there is no cudaGetLastError() after the launch, so launch/runtime errors would pass silently
66 
67     printf("\n\tFinish.\n");
68     getchar();                                                  // keep the console window open (Windows-style demo)
69     return 0;
70 }

 

● 输出结果

        Start with 64 threads.

        Sum of thread 0 ~ 63 in group is 2016 (expected 2016)

        Now creating 4 groups, each of size 16 threads:
0 -> thread_rank = 0, group16.thread_rank = 0, offset = 0
1 -> thread_rank = 1, group16.thread_rank = 1, offset = 0
2 -> thread_rank = 2, group16.thread_rank = 2, offset = 0
3 -> thread_rank = 3, group16.thread_rank = 3, offset = 0
4 -> thread_rank = 4, group16.thread_rank = 4, offset = 0
5 -> thread_rank = 5, group16.thread_rank = 5, offset = 0
6 -> thread_rank = 6, group16.thread_rank = 6, offset = 0
7 -> thread_rank = 7, group16.thread_rank = 7, offset = 0
8 -> thread_rank = 8, group16.thread_rank = 8, offset = 0
9 -> thread_rank = 9, group16.thread_rank = 9, offset = 0
10 -> thread_rank = 10, group16.thread_rank = 10, offset = 0
11 -> thread_rank = 11, group16.thread_rank = 11, offset = 0
12 -> thread_rank = 12, group16.thread_rank = 12, offset = 0
13 -> thread_rank = 13, group16.thread_rank = 13, offset = 0
14 -> thread_rank = 14, group16.thread_rank = 14, offset = 0
15 -> thread_rank = 15, group16.thread_rank = 15, offset = 0
16 -> thread_rank = 16, group16.thread_rank = 0, offset = 16
17 -> thread_rank = 17, group16.thread_rank = 1, offset = 16
18 -> thread_rank = 18, group16.thread_rank = 2, offset = 16
19 -> thread_rank = 19, group16.thread_rank = 3, offset = 16
20 -> thread_rank = 20, group16.thread_rank = 4, offset = 16
21 -> thread_rank = 21, group16.thread_rank = 5, offset = 16
22 -> thread_rank = 22, group16.thread_rank = 6, offset = 16
23 -> thread_rank = 23, group16.thread_rank = 7, offset = 16
24 -> thread_rank = 24, group16.thread_rank = 8, offset = 16
25 -> thread_rank = 25, group16.thread_rank = 9, offset = 16
26 -> thread_rank = 26, group16.thread_rank = 10, offset = 16
27 -> thread_rank = 27, group16.thread_rank = 11, offset = 16
28 -> thread_rank = 28, group16.thread_rank = 12, offset = 16
29 -> thread_rank = 29, group16.thread_rank = 13, offset = 16
30 -> thread_rank = 30, group16.thread_rank = 14, offset = 16
31 -> thread_rank = 31, group16.thread_rank = 15, offset = 16
32 -> thread_rank = 32, group16.thread_rank = 0, offset = 32
33 -> thread_rank = 33, group16.thread_rank = 1, offset = 32
34 -> thread_rank = 34, group16.thread_rank = 2, offset = 32
35 -> thread_rank = 35, group16.thread_rank = 3, offset = 32
36 -> thread_rank = 36, group16.thread_rank = 4, offset = 32
37 -> thread_rank = 37, group16.thread_rank = 5, offset = 32
38 -> thread_rank = 38, group16.thread_rank = 6, offset = 32
39 -> thread_rank = 39, group16.thread_rank = 7, offset = 32
40 -> thread_rank = 40, group16.thread_rank = 8, offset = 32
41 -> thread_rank = 41, group16.thread_rank = 9, offset = 32
42 -> thread_rank = 42, group16.thread_rank = 10, offset = 32
43 -> thread_rank = 43, group16.thread_rank = 11, offset = 32
44 -> thread_rank = 44, group16.thread_rank = 12, offset = 32
45 -> thread_rank = 45, group16.thread_rank = 13, offset = 32
46 -> thread_rank = 46, group16.thread_rank = 14, offset = 32
47 -> thread_rank = 47, group16.thread_rank = 15, offset = 32
48 -> thread_rank = 48, group16.thread_rank = 0, offset = 48
49 -> thread_rank = 49, group16.thread_rank = 1, offset = 48
50 -> thread_rank = 50, group16.thread_rank = 2, offset = 48
51 -> thread_rank = 51, group16.thread_rank = 3, offset = 48
52 -> thread_rank = 52, group16.thread_rank = 4, offset = 48
53 -> thread_rank = 53, group16.thread_rank = 5, offset = 48
54 -> thread_rank = 54, group16.thread_rank = 6, offset = 48
55 -> thread_rank = 55, group16.thread_rank = 7, offset = 48
56 -> thread_rank = 56, group16.thread_rank = 8, offset = 48
57 -> thread_rank = 57, group16.thread_rank = 9, offset = 48
58 -> thread_rank = 58, group16.thread_rank = 10, offset = 48
59 -> thread_rank = 59, group16.thread_rank = 11, offset = 48
60 -> thread_rank = 60, group16.thread_rank = 12, offset = 48
61 -> thread_rank = 61, group16.thread_rank = 13, offset = 48
62 -> thread_rank = 62, group16.thread_rank = 14, offset = 48
63 -> thread_rank = 63, group16.thread_rank = 15, offset = 48

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Sum of all ranks 0..15 in group16 is 120 (expected 120)

        Finish.

 

▶ 涨姿势:

● 相关定义

  1 // cooperative_groups_helper.h
  2 # if !defined(_CG_QUALIFIER)
  3 #  define _CG_QUALIFIER __forceinline__ __device__
  4 # endif
  5 
  6 # define die() assert(0);
  7 
  8 // cooperative_groups.h(调整顺序)
  9 class thread_group                      // generic thread-group base type (all concrete group kinds derive from it)
 10 {
 11     friend _CG_QUALIFIER thread_group this_thread();
 12     friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
 13     friend class thread_block;
 14 
 15 protected:
 16     union __align__(8)                  // tagged union: 8-bit type tag plus a per-kind payload (coalesced mask/size, or opaque buffer)
 17     {
 18         unsigned int type : 8;
 19         struct 
 20         {
 21             unsigned int type : 8;
 22             unsigned int size : 24;
 23             unsigned int mask;
 24         } coalesced;
 25         struct 
 26         {
 27             void* ptr[2];
 28         } buffer;
 29     } _data;
 30     
 31     _CG_QUALIFIER thread_group operator=(const thread_group& src);
 32     
 33     _CG_QUALIFIER thread_group(__internal::groupType type) 
 34     {
 35         _data.type = type;
 36     }
 37 #if __cplusplus >= 201103L
 38     static_assert(sizeof(_data) == 16, "Failed size check");
 39 #endif
 40 
 41 public:
 42     _CG_QUALIFIER unsigned int size() const;
 43     _CG_QUALIFIER unsigned int thread_rank() const;
 44     _CG_QUALIFIER void sync() const;
 45 };
 46 
 47 class thread_block : public thread_group   // group covering the whole CTA; size/rank/sync delegate to block intrinsics
 48 {
 49     friend _CG_QUALIFIER thread_block this_thread_block();
 50     friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
 51     friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
 52 
 53     _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) {}
 54 
 55     _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const   // build the coalesced tile of tilesz threads that contains the calling thread
 56     {
 57         const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
 58 
 59         if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz)       // tile size must be a power of two in 1..32
 60         {
 61             die();
 62             return (thread_block());
 63         }
 64 
 65         unsigned int mask;
 66         unsigned int base_offset = thread_rank() & (~(tilesz - 1));            // rank of the first thread of this tile
 67         unsigned int masklength = min(size() - base_offset, tilesz);           // last tile may be short if the block is not a multiple of tilesz
 68         mask = (unsigned int)(-1) >> (32 - masklength);
 69         mask <<= (__internal::laneid() & ~(tilesz - 1));                       // shift into position within the warp
 70         thread_group tile = thread_group(__internal::CoalescedTile);
 71         tile._data.coalesced.mask = mask;
 72         tile._data.coalesced.size = __popc(mask);
 73         return (tile);
 74     }
 75 
 76 public:
 77     _CG_QUALIFIER void sync() const { __internal::cta::sync(); }
 78     _CG_QUALIFIER unsigned int size() const { return (__internal::cta::size()); }
 79     _CG_QUALIFIER unsigned int thread_rank() const { return (__internal::cta::thread_rank()); }
 80     _CG_QUALIFIER dim3 group_index() const { return (__internal::cta::group_index()); }
 81     _CG_QUALIFIER dim3 thread_index() const { return (__internal::cta::thread_index()); }
 82 };
 83 
 84 _CG_QUALIFIER thread_block this_thread_block()// used by the sample code above; it simply invokes the thread_block constructor
 85 {
 86     return (thread_block());
 87 }
 88 
 89 template <unsigned int Size>
 90 class thread_block_tile;
 91 template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { };
 92 template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { };
 93 template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { };
 94 template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { };
 95 template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { };
 96 template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { };
 97 
 98 template <unsigned int Size>
 99 class __thread_block_tile_base : public thread_group
100 {
101     static const unsigned int numThreads = Size;
102     _CG_QUALIFIER unsigned int build_mask() const 
103     {
104         unsigned int mask;
105         if (numThreads == 32)
106             mask = 0xFFFFFFFF;        
107         else 
108         {
109             mask = (unsigned int)(-1) >> (32 - numThreads);
110             mask <<= (__internal::laneid() & (~(numThreads - 1)));
111         }
112         return (mask);
113     }
114 
115 protected:
116     _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) 
117     {
118         _data.coalesced.mask = build_mask();
119         _data.coalesced.size = numThreads;
120     }
121 
122 public:
123     _CG_QUALIFIER void sync() const { __syncwarp(build_mask()); }
124     _CG_QUALIFIER unsigned int thread_rank() const { return (threadIdx.x & (numThreads - 1)); }
125     _CG_QUALIFIER unsigned int size() const { return (numThreads); }
126 
127     // PTX supported collectives
128     _CG_QUALIFIER int shfl(int var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
129     ...
130 
131 #ifdef _CG_HAS_FP16_COLLECTIVE
132     _CG_QUALIFIER __half shfl(__half var, int srcRank) const { return (__shfl_sync(build_mask(), var, srcRank, numThreads)); }
133     ...
134     
135 #endif
136 
137 #ifdef _CG_HAS_MATCH_COLLECTIVE
138     _CG_QUALIFIER unsigned int match_any(int val) const 
139     {
140         unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val);
141         return (lane_match >> (__internal::laneid() & (~(numThreads - 1))));
142     }
143     ...
144 #endif
145 };

 

● 用到的线程协作相关函数

1 thread_block threadBlockGroup = this_thread_block();    // 将当前线程块分配为一个协作组
2 
3 thread_block_tile<16> tiledPartition16 = tiled_partition<16>(threadBlockGroup); // 协作组分组
4     
5 int in = tiledPartition16.thread_rank();                // 协作组中线程的编号
6 
7 tiledPartition16.sync();                            // 协作组同步

 

posted on 2017-11-22 20:21  爨爨爨好  阅读(766)  评论(0编辑  收藏  举报