CUDA/GPU下矩阵乘法的几种实现的C++源码——转载

#include "cutil_inline.h"
#include "cublas.h"
void MatrixMul(const float *A, const float *B, float *C, int Width) {
int i, j, k;
for(i=0; i<Width; i++)
for(j=0; j<Width; j++){
float s=0;
for(k=0; k<Width; k++)
s+=A[i*Width+k]*B[k*Width+j];
C[i*Width+j]=s;
}
}
#define TILE_WIDTH 16
//简单方法
__global__ void MatrixMulKernel1(float* Md, float* Nd, float* Pd, int Width)
{
int x = threadIdx.x+blockIdx.x*blockDim.x;
int y = threadIdx.y+blockIdx.y*blockDim.y;
float Pvalue = 0;
for (int k = 0; k < Width; ++k)
Pvalue+=Md[y * Width + k]*Nd[k * Width + x];
Pd[y*Width + x] = Pvalue;
}
//块方法
__global__ void MatrixMulKernel2(float* Md, float* Nd, float* Pd, int Width)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Pvalue = 0;
for (int m = 0; m < gridDim.x; ++m) {
__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx);
Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx);
__syncthreads();
for (int k = 0; k < blockDim.x; ++k)
Pvalue += Mds[ty][k] * Nds[k][tx];
__syncthreads();
}
Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue;
}
//块方法, 循环展开
__global__ void MatrixMulKernel3(float* Md, float* Nd, float* Pd, int Width)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Pvalue = 0;
for (int m = 0; m < gridDim.x; ++m) {
__shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
__shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx);
Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx);
__syncthreads();
Pvalue += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] +
Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] +
Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] +
Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] +
Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] +
Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] +
Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] +
Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx];
__syncthreads();
}
Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue;
}
#define ThreadColumn 2
//块方法, 线程粒度
__global__ void MatrixMulKernel4(float* Md, float* Nd, float* Pd, int Width)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Pvalue1 = 0, Pvalue2=0;
for (int m = 0; m < gridDim.x; ++m) {
__shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH];
__shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH];
for(int k=0;k<ThreadColumn; ++k)
Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH );
for(int h=0;h<ThreadColumn; ++h)
for(int k=0; k<ThreadColumn; ++k)
Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width +
bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH);
__syncthreads();
for(int h=0;h<ThreadColumn*TILE_WIDTH;++h)
Pvalue1 +=Mds[ty][h] * Nds[h][tx];
for(int k=0;k<ThreadColumn*TILE_WIDTH;++k)
Pvalue2 +=Mds[ty][k] * Nds[k][tx+TILE_WIDTH];
__syncthreads();
}
Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1;
Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2;
}
//块方法, 循环展开, 线程粒度
__global__ void MatrixMulKernel5(float* Md, float* Nd, float* Pd, int Width)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
float Pvalue1 = 0, Pvalue2=0;
for (int m = 0; m < gridDim.x; ++m) {
__shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH];
__shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH];
for(int k=0;k<ThreadColumn; ++k)
Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH );
for(int h=0;h<ThreadColumn; ++h)
for(int k=0; k<ThreadColumn; ++k)
Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width +
bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH);
__syncthreads();
Pvalue1 += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] +
Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] +
Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] +
Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] +
Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] +
Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] +
Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] +
Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx] +
Mds[ty][16] * Nds[16][tx] + Mds[ty][17] * Nds[17][tx] +
Mds[ty][18] * Nds[18][tx] + Mds[ty][19] * Nds[19][tx] +
Mds[ty][20] * Nds[20][tx] + Mds[ty][21] * Nds[21][tx] +
Mds[ty][22] * Nds[22][tx] + Mds[ty][23] * Nds[23][tx] +
Mds[ty][24] * Nds[24][tx] + Mds[ty][25] * Nds[25][tx] +
Mds[ty][26] * Nds[26][tx] + Mds[ty][27] * Nds[27][tx] +
Mds[ty][28] * Nds[28][tx] + Mds[ty][29] * Nds[29][tx] +
Mds[ty][30] * Nds[30][tx] + Mds[ty][31] * Nds[31][tx];
Pvalue2 += Mds[ty][0] * Nds[0][tx+TILE_WIDTH] + Mds[ty][1] * Nds[1][tx+TILE_WIDTH] +
Mds[ty][2] * Nds[2][tx+TILE_WIDTH] + Mds[ty][3] * Nds[3][tx+TILE_WIDTH] +
Mds[ty][4] * Nds[4][tx+TILE_WIDTH] + Mds[ty][5] * Nds[5][tx+TILE_WIDTH] +
Mds[ty][6] * Nds[6][tx+TILE_WIDTH] + Mds[ty][7] * Nds[7][tx+TILE_WIDTH] +
Mds[ty][8] * Nds[8][tx+TILE_WIDTH] + Mds[ty][9] * Nds[9][tx+TILE_WIDTH] +
Mds[ty][10] * Nds[10][tx+TILE_WIDTH] + Mds[ty][11] * Nds[11][tx+TILE_WIDTH] +
Mds[ty][12] * Nds[12][tx+TILE_WIDTH] + Mds[ty][13] * Nds[13][tx+TILE_WIDTH] +
Mds[ty][14] * Nds[14][tx+TILE_WIDTH] + Mds[ty][15] * Nds[15][tx+TILE_WIDTH] +
Mds[ty][16] * Nds[16][tx+TILE_WIDTH] + Mds[ty][17] * Nds[17][tx+TILE_WIDTH] +
Mds[ty][18] * Nds[18][tx+TILE_WIDTH] + Mds[ty][19] * Nds[19][tx+TILE_WIDTH] +
Mds[ty][20] * Nds[20][tx+TILE_WIDTH] + Mds[ty][21] * Nds[21][tx+TILE_WIDTH] +
Mds[ty][22] * Nds[22][tx+TILE_WIDTH] + Mds[ty][23] * Nds[23][tx+TILE_WIDTH] +
Mds[ty][24] * Nds[24][tx+TILE_WIDTH] + Mds[ty][25] * Nds[25][tx+TILE_WIDTH] +
Mds[ty][26] * Nds[26][tx+TILE_WIDTH] + Mds[ty][27] * Nds[27][tx+TILE_WIDTH] +
Mds[ty][28] * Nds[28][tx+TILE_WIDTH] + Mds[ty][29] * Nds[29][tx+TILE_WIDTH] +
Mds[ty][30] * Nds[30][tx+TILE_WIDTH] + Mds[ty][31] * Nds[31][tx+TILE_WIDTH];
__syncthreads();
}
Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1;
Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2;
}
int main(int argc, char **argv) {
int Width(0);
bool is_test=false;
if(argc==1){
Width=512;
is_test=false;
}
else if(argc==2){
Width=atoi(argv[1]);
is_test=false;
}
else{
Width=atoi(argv[1]);
is_test=(bool)atoi(argv[2]);
}
float *h_A=(float*)malloc(Width*Width*sizeof(float));
float *h_B=(float*)malloc(Width*Width*sizeof(float));
float *h_C=(float*)malloc(Width*Width*sizeof(float));
float *h_C_ref=(float*)malloc(Width*Width*sizeof(float));
float *d_A, *d_B, *d_C;
float t0, error_norm=0, ref_norm=0;
unsigned int timer1=0;
cutCreateTimer(&timer1);
cutStartTimer(timer1);
for(int i=0; i<Width*Width; i++) {
h_A[i]=rand()/(float)RAND_MAX;
h_B[i]=rand()/(float)RAND_MAX;
}
cublasInit();
cublasAlloc(Width*Width, sizeof(float), (void**)&d_A);
cublasAlloc(Width*Width, sizeof(float), (void**)&d_B);
cublasAlloc(Width*Width, sizeof(float), (void**)&d_C);
cublasSetVector(Width*Width, sizeof(float), h_A, 1, d_A, 1);
cublasSetVector(Width*Width, sizeof(float), h_B, 1, d_B, 1);
if(is_test)
MatrixMul(h_A, h_B, h_C_ref,Width);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH);
t0=cutGetTimerValue(timer1);
cudaThreadSynchronize();
MatrixMulKernel1<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width);
cudaThreadSynchronize();
float gpu_t1=(cutGetTimerValue(timer1)-t0)/1000.0f;
cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);
if(is_test){
error_norm=0;
ref_norm=0;
for(int i=0; i<Width*Width; i++) {
float diff=h_C_ref[i]-h_C[i];
error_norm+=diff*diff;
ref_norm+=h_C_ref[i]*h_C_ref[i];
}
printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
}
t0=cutGetTimerValue(timer1);
cudaThreadSynchronize();
MatrixMulKernel2<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width);
cudaThreadSynchronize();
float gpu_t2=(cutGetTimerValue(timer1)-t0)/1000.0f;
cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);
if(is_test){
error_norm=0;
ref_norm=0;
for(int i=0; i<Width*Width; i++) {
float diff=h_C_ref[i]-h_C[i];
error_norm+=diff*diff;
ref_norm+=h_C_ref[i]*h_C_ref[i];
}
printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
}
t0=cutGetTimerValue(timer1);
cudaThreadSynchronize();
MatrixMulKernel3<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width);
cudaThreadSynchronize();
float gpu_t3=(cutGetTimerValue(timer1)-t0)/1000.0f;
cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);
if(is_test){
error_norm=0;
ref_norm=0;
for(int i=0; i<Width*Width; i++) {
float diff=h_C_ref[i]-h_C[i];
error_norm+=diff*diff;
ref_norm+=h_C_ref[i]*h_C_ref[i];
}
printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
}
dimGrid.x/=ThreadColumn;
t0=cutGetTimerValue(timer1);
cudaThreadSynchronize();
MatrixMulKernel4<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width);
cudaThreadSynchronize();
float gpu_t4=(cutGetTimerValue(timer1)-t0)/1000.0f;
cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);
if(is_test){
error_norm=0;
ref_norm=0;
for(int i=0; i<Width*Width; i++) {
float diff=h_C_ref[i]-h_C[i];
error_norm+=diff*diff;
ref_norm+=h_C_ref[i]*h_C_ref[i];
}
printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
}
t0=cutGetTimerValue(timer1);
cudaThreadSynchronize();
MatrixMulKernel5<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width);
cudaThreadSynchronize();
float gpu_t5=(cutGetTimerValue(timer1)-t0)/1000.0f;
cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1);
if(is_test){
error_norm=0;
ref_norm=0;
for(int i=0; i<Width*Width; i++) {
float diff=h_C_ref[i]-h_C[i];
error_norm+=diff*diff;
ref_norm+=h_C_ref[i]*h_C_ref[i];
}
printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED");
}
printf("矩阵阶数为%4d,", Width);
printf("简单方法: %.6fs(%.3fGflops),", gpu_t1, 1e-9*Width*Width*Width*2/gpu_t1);
printf("块方法: %.6fs(%.3fGflops),", gpu_t2, 1e-9*Width*Width*Width*2/gpu_t2);
printf("块+循环展开方法: %.6fs(%.3fGflops),", gpu_t3, 1e-9*Width*Width*Width*2/gpu_t3);
printf("块+线程粒度2: %.6fs(%.3fGflops),", gpu_t4, 1e-9*Width*Width*Width*2/gpu_t4);
printf("块+循环展开方法+线程粒度2: %.6fs(%.3fGflops)/n", gpu_t5, 1e-9*Width*Width*Width*2/gpu_t5);
}

环境：CUDA toolkit3.2+Windows XP+CUDA SDK中的vs2008模板release编译通过，显卡是GeForce GT240。大家可根据自己的情况进行测试，报告一下结果吧.

不同规模的运行结果:
矩阵阶数为 512,简单方法: 0.033887s(7.922Gflops),块方法: 0.008424s(31.865Gflops),块+循环展开方法: 0.003995s(67.191Gflops),块+线程粒度2: 0.008091s(33.179Gflops),块+循环展开方法+线程粒度2: 0.003399s(78.984Gflops)

矩阵阶数为1024,简单方法: 0.264424s(8.121Gflops),块方法: 0.066201s(32.439Gflops),块+循环展开方法: 0.030827s(69.662Gflops),块+线程粒度2: 0.063264s(33.945Gflops),块+循环展开方法+线程粒度2: 0.026282s(81.710Gflops)

矩阵阶数为2048,简单方法: 2.112513s(8.132Gflops),块方法: 0.527002s(32.599Gflops),块+循环展开方法: 0.244058s(70.392Gflops),块+线程粒度2: 0.505495s(33.986Gflops),块+循环展开方法+线程粒度2: 0.208845s(82.261Gflops)

矩阵阶数为4096,简单方法: 17.705070s(7.763Gflops),块方法: 4.205308s(32.682Gflops),块+循环展开方法: 1.966043s(69.906Gflops),块+线程粒度2: 4.059064s(33.860Gflops),块+循环展开方法+线程粒度2: 1.677771s(81.918Gflops)

测试结果表明在块大小16x16时，块方法和循环展开对速度有很显著的影响，而线程粒度的使用对速度只有很小的提高。而块大小是8x8时的情形没有测试，线程粒度对速度应该有较大影响，家可以做一做。

posted @ 2015-11-25 15:38 uestc_summer 阅读(646) 评论(0) 收藏举报

刷新页面返回顶部

uestc_summer

CUDA/GPU下矩阵乘法的几种实现的C++源码——转载

公告