CUDA学习4 线程协作
在CUDA学习3 Max pooling (python c++ cuda)中有一个2D grid的CUDA实现,用时141ms。
以下为2D grid 2D blocks实现,耗时进一步降低到16ms。
int x = blockIdx.x; int y = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int index2 = y*gridDim.x*blockDim.y*blockDim.x + x* blockDim.y*blockDim.x + ty*blockDim.x + tx;
线程索引计算方式如上,此处需要的循环为(N,M,PH,PH),因此配置如下。(PH*PH=144未超出本机显卡max threads per block=1024的限制)
dim3 grid(M, N);
dim3 threads(PH, PH);
下面是完整代码。
#include <windows.h>
#include <iostream>
#include <cstdlib>
#include <cfloat>

// Abort with a readable message if a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__      \
                      << ": " << cudaGetErrorString(err_) << std::endl;       \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// Max pooling over N*M square images of side `height`, with window size and
// stride both equal to `pooled_height`.
// Expected launch config: grid = (M, N) -> one block per image,
// block = (PH, PH) -> one thread per pooled output cell, PH = ceil(H / D).
// Each thread scans its own pooled_height x pooled_height input window, so
// no two threads touch the same output element (no synchronization needed).
__global__ void MaxPool2d(const float* __restrict__ bottom_data,
                          const int height, const int pooled_height,
                          float* __restrict__ top_data) {
    int x = blockIdx.x;    // image column index in [0, M)
    int y = blockIdx.y;    // image row index in [0, N)
    int dx = gridDim.x;    // == M
    int tx = threadIdx.x;  // pooled-output column in [0, PH)
    int ty = threadIdx.y;  // pooled-output row in [0, PH)
    int dtx = blockDim.x;  // == PH
    int dty = blockDim.y;  // == PH

    // Flat index of this thread's private output cell.
    int out_idx = y * dx * dtx * dty + x * dtx * dty + ty * dtx + tx;
    // Flat index of the top-left corner of this thread's input window.
    int in_idx = y * dx * height * height + x * height * height
               + ty * pooled_height * height + tx * pooled_height;

    // -FLT_MAX instead of the original hard-coded -10000.0 (a double literal,
    // and an arbitrary sentinel that would break on more negative inputs).
    float best = -FLT_MAX;
    // Guards on (u + pooled_height*ty) / (v + pooled_height*tx) clip the
    // ragged last window when height is not a multiple of pooled_height.
    for (int u = 0; u < pooled_height && (u + pooled_height * ty) < height; ++u)
        for (int v = 0; v < pooled_height && (v + pooled_height * tx) < height; ++v) {
            float val = bottom_data[in_idx + u * height + v];
            if (val > best) best = val;
        }
    top_data[out_idx] = best;
}

int main() {
    const int N = 500, M = 100, H = 24, W = 24, D = 2;
    const int PH = H / D + H % D;  // ceil(H / D)
    const size_t in_elems  = size_t(N) * M * H * W;
    const size_t out_elems = size_t(N) * M * PH * PH;
    const size_t image_size = in_elems * sizeof(float);   // byte counts for
    const size_t out_size   = out_elems * sizeof(float);  // the CUDA API
    const float mul_by = 0.01f;

    // BUG FIX: the original did `new float[image_size]`, using the BYTE count
    // as an element count and over-allocating 4x. Allocate element counts.
    float* input  = new float[in_elems];
    float* output = new float[out_elems];
    for (size_t i = 0; i < in_elems; ++i) input[i] = i * mul_by;

    float *dev_input = nullptr, *dev_output = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_output, out_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_input, image_size));
    CUDA_CHECK(cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice));

    dim3 grid(M, N);       // one block per image
    dim3 threads(PH, PH);  // PH*PH = 144 <= 1024 max threads per block

    DWORD start_time = GetTickCount();
    MaxPool2d<<<grid, threads>>>(dev_input, H, D, dev_output);
    CUDA_CHECK(cudaGetLastError());       // catch bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // surface async execution errors
    // Blocking copy: kernel has fully completed before this returns.
    CUDA_CHECK(cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost));
    DWORD end_time = GetTickCount();
    std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;

    for (int i = 0; i < 10; ++i)
        std::cout << output[i] << std::endl;

    CUDA_CHECK(cudaFree(dev_input));
    CUDA_CHECK(cudaFree(dev_output));
    delete[] output;
    delete[] input;
    system("pause");
    return 0;
}
/* Cost: 16ms.
0.25 0.27 0.29 0.31 0.33 0.35 0.37 0.39 0.41 0.43 */
以下是采用3D grid 3D blocks的错误实现。错误原因:block内共享同一输出元素的D*D个线程对top_data[index2]做无同步、无原子操作的"读-比较-写",线程(0,0)写入的初始值-1000.0与其他线程的比较/写入之间没有任何顺序保证,写入互相覆盖,效果上每次比较大小时都是和-1000.0在比较,结果因此出错。
#include <windows.h>
#include <iostream>

// NOTE(review): this is the deliberately BROKEN 3D-grid / 3D-block variant
// discussed in the surrounding text; code tokens are kept as-is to preserve
// the demonstration. Launch config: grid = (PH, M, N), block = (D, D, PH).
// All D*D threads of a block that share the same tz map to the SAME output
// element top_data[index2], and update it with a plain, unsynchronized
// read-compare-write — no atomics, no barriers — so writes overwrite each
// other and comparisons can read the freshly written -1000.0 sentinel.
__global__ void MaxPool2d(float* bottom_data, const int height, const int pooled_height, float* top_data) {
    int x = blockIdx.x;    // pooled row index (grid.x == PH)
    int y = blockIdx.y;    // image column index (grid.y == M)
    int z = blockIdx.z;    // image row index (grid.z == N)
    int dx = gridDim.x;    // == PH
    int dy = gridDim.y;    // == M
    int tx = threadIdx.x;  // column offset inside the D x D pooling window
    int ty = threadIdx.y;  // row offset inside the D x D pooling window
    int tz = threadIdx.z;  // pooled column index (blockDim.z == PH)
    int dtx = blockDim.x;  // == D (declared but unused below)
    int dty = blockDim.y;  // == D (declared but unused below)
    int dtz = blockDim.z;  // == PH
    // One output slot per (z, y, x, tz): tx and ty do NOT appear here, so
    // all D*D (tx, ty) threads collide on the same index2.
    int index2 = z*dy*dx*dtz + y*dx*dtz + x*dtz + tz;
    // The single input element this particular thread inspects.
    int index = z*dy*height*height + y*height*height + x*pooled_height*height + tz*pooled_height + ty*height + tx;
    // RACE: only thread (0,0,tz) writes the sentinel, but nothing orders this
    // store before the other threads' reads of top_data[index2] below.
    if (tx==0 && ty==0) *(top_data + index2) = -1000.0;
    // Bounds guards for the ragged last window (height not divisible by D).
    if (ty<height - pooled_height*x)
        if (tx<height - pooled_height*tz)
            // RACE: non-atomic read-modify-write on a location shared by D*D
            // threads; concurrent writers clobber each other, so (per the
            // accompanying text) each compare is effectively against -1000.0.
            if (*(bottom_data + index)>*(top_data + index2)) *(top_data + index2) = *(bottom_data + index);
    // A __syncthreads() here would order the init, but would NOT fix the
    // unsynchronized read-modify-write above; an atomic max would be needed.
    //__syncthreads();
}

int main() {
    const int N = 500, M =100, H = 24, W = 24, D = 2;
    const int PH = H / D + H % D;  // ceil(H / D)
    // NOTE(review): these are BYTE counts, yet they are also reused below as
    // the element counts for new[] — over-allocates host buffers 4x.
    int image_size = N*M*H*W*sizeof(float);
    int out_size = N*M*PH*PH*sizeof(float);
    float mul_by = -0.01;  // negative inputs make the race visible in output
    float *input, *output, *dev_output, *dev_input;
    input = new float[image_size];
    output = new float[out_size];
    for (int i = 0; i<N*M*H*W; i++) *(input + i) = i*mul_by;
    // No error checking on any CUDA call: failures pass silently.
    cudaMalloc((void**)&dev_output, out_size);
    cudaMalloc((void**)&dev_input, image_size);
    cudaMemcpy(dev_input, input, image_size, cudaMemcpyHostToDevice);
    // 3D launch: D*D*PH threads per block, one block per pooled output row.
    dim3 grid(PH,M, N);
    dim3 threads(D, D,PH);
    DWORD start_time = GetTickCount();
    MaxPool2d << <grid, threads >> >(dev_input, H, D, dev_output);
    // Blocking copy doubles as the only synchronization with the kernel.
    cudaMemcpy(output, dev_output, out_size, cudaMemcpyDeviceToHost);
    DWORD end_time = GetTickCount();
    std::cout << "Cost: " << end_time - start_time << "ms." << std::endl;
    for (int i = 0; i<10; i++) std::cout << *(output + i) << std::endl;
    cudaFree(dev_input);
    cudaFree(dev_output);
    delete[] output;
    delete[] input;
    system("pause");
}
/* Cost: 47ms.
-0.25 -0.27 -0.29 -0.31 -0.33 -0.35 -0.37 -0.39 -0.41 -0.43 */