# [CUDA]CUDA编程实战四——矩阵乘法

## CPU版本

#include <iostream>
#include <stdlib.h>
#include <sys/time.h>

const int ROWS = 1024;
const int COLS = 1024;

using namespace std;

void matrix_mul_cpu(float* M, float* N, float* P, int width)
{
for(int i=0;i<width;i++)
for(int j=0;j<width;j++)
{
float sum = 0.0;
for(int k=0;k<width;k++)
{
float a = M[i*width+k];
float b = N[k*width+j];
sum += a*b;
}
P[i*width+j] = sum;
}
}

int main()
{
struct timeval start, end;
gettimeofday( &start, NULL );
float *A, *B, *C;
int total_size = ROWS*COLS*sizeof(float);
A = (float*)malloc(total_size);
B = (float*)malloc(total_size);
C = (float*)malloc(total_size);

//CPU一维数组初始化
for(int i=0;i<ROWS*COLS;i++)
{
A[i] = 80.0;
B[i] = 20.0;
}

matrix_mul_cpu(A, B, C, COLS);

gettimeofday( &end, NULL );
int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
cout << "total time is " << timeuse/1000 << "ms" <<endl;

return 0;
}


## CUDA版本

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <sys/time.h>
#include <stdio.h>
#include <math.h>

const int Row=1024;
const int Col=1024;

__global__
void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
int j = threadIdx.y + blockDim.y * blockIdx.y;

int sum = 0;
for(int k=0;k<width;k++)
{
int a = M[j*width+k];
int b = N[k*width+i];
sum += a*b;
}
P[j*width+i] = sum;
}

int main()
{
struct timeval start, end;
gettimeofday( &start, NULL );

int *A = (int *)malloc(sizeof(int) * Row * Col);
int *B = (int *)malloc(sizeof(int) * Row * Col);
int *C = (int *)malloc(sizeof(int) * Row * Col);
//malloc device memory
int *d_dataA, *d_dataB, *d_dataC;
cudaMalloc((void**)&d_dataA, sizeof(int) *Row*Col);
cudaMalloc((void**)&d_dataB, sizeof(int) *Row*Col);
cudaMalloc((void**)&d_dataC, sizeof(int) *Row*Col);
//set value
for (int i = 0; i < Row*Col; i++) {
A[i] = 90;
B[i] = 10;
}

cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
//拷贝计算数据-一级数据指针
cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);

//释放内存
free(A);
free(B);
free(C);
cudaFree(d_dataA);
cudaFree(d_dataB);
cudaFree(d_dataC);

gettimeofday( &end, NULL );
int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;
printf("total time is %d ms\n", timeuse/1000);

return 0;
}


### 运行结果

posted @ 2021-06-12 13:46  wildkid1024  阅读(999)  评论(0编辑  收藏  举报