1 #include <stdio.h>
2 #include <cuda_runtime.h>
3 #include <device_launch_parameters.h>
4 #include <stdlib.h>
5 #include <time.h>
6
// Threads per block used for the kernel launch.
#define THREAD_NUM 256
// Side length n of the square n x n matrices.
#define MATRIX_SIZE 1000
// Number of blocks so that blocks_num * THREAD_NUM threads cover all
// MATRIX_SIZE * MATRIX_SIZE output elements (one element per thread).
// This is a ceiling division of the element count by the block size; the
// earlier form multiplied MATRIX_SIZE by the rounded-up row width and
// launched roughly 25% more blocks than there are elements to compute.
const int blocks_num = (MATRIX_SIZE * MATRIX_SIZE + THREAD_NUM - 1) / THREAD_NUM;
// Fills an n x n row-major matrix with pseudo-random values (defined below).
void matgen(float *a, int n);
11
/// Generate a random matrix.
///
/// Writes n * n values into `a`, visiting elements row by row, and stores
/// one `rand()` result converted to float in each slot. The caller seeds the
/// generator and must provide storage for at least n * n floats.
void matgen(float *a, int n){
    for (int r = 0; r < n; ++r){
        // Start of row r inside the flat row-major buffer.
        float *row_ptr = a + r * n;
        for (int col = 0; col < n; ++col){
            row_ptr[col] = (float)rand();
        }
    }
}
23
// __global__ kernel: parallel square-matrix multiplication, c = a * b.
//
// a, b, c are n x n matrices stored row-major in flat arrays of n * n floats.
// Launch layout: 1D grid of 1D blocks; any grid/block size is valid. Each
// thread starts at its flat global index and advances by the total number of
// launched threads, so every output element is computed exactly once
// regardless of the launch configuration. No shared memory is used.
// The output `c` must not overlap `a` or `b` (pointers are __restrict__).
__global__ static void matMult(const float * __restrict__ a,
                               const float * __restrict__ b,
                               float * __restrict__ c, int n){
    // Nothing to compute for an empty or invalid dimension.
    if (n <= 0){
        return;
    }

    // 64-bit indices so that n * n and gridDim.x * blockDim.x cannot overflow.
    const long long total = (long long)n * n;
    const long long stride = (long long)gridDim.x * blockDim.x;

    // Use blockDim.x (the real launch block size) rather than a compile-time
    // constant, so the index stays correct for any block size.
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total; idx += stride){
        // Derive the output row and column this iteration is responsible for.
        const long long row = idx / n;
        const long long column = idx % n;

        // Dot product of row `row` of a with column `column` of b.
        float t = 0.0f;
        for (int i = 0; i < n; i++){
            t += a[row * n + i] * b[(long long)i * n + column];
        }
        c[idx] = t;
    }
}
46
// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the step that failed.
static void checkCuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Generates two random MATRIX_SIZE x MATRIX_SIZE matrices on the host,
// multiplies them on the GPU with matMult, and prints the kernel time in
// milliseconds as measured by CUDA events. Returns 0 on success; exits with
// EXIT_FAILURE if a host allocation or any CUDA call fails.
int main(void){
    const int n = MATRIX_SIZE;
    const size_t bytes = sizeof(float) * n * n;
    float elapsedTime = 0;

    // Allocate the host matrices (c receives the result).
    float *a = (float *)malloc(bytes);
    float *b = (float *)malloc(bytes);
    float *c = (float *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL){
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    // Fixed seed so every run multiplies the same pair of matrices.
    srand(0);

    // Fill the two input matrices with random values.
    matgen(a, n);
    matgen(b, n);

    // Events used to time the kernel.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // Allocate device memory and upload the inputs.
    float *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, bytes), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, bytes), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_c, bytes), "cudaMalloc(d_c)");
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "copy a to device");
    checkCuda(cudaMemcpy(d_b, b, bytes, cudaMemcpyHostToDevice), "copy b to device");

    // Time the kernel: record start, launch, record stop, then wait on the
    // stop event. Waiting on the event replaces the deprecated
    // cudaThreadSynchronize() and keeps host-side sync time out of the interval.
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    matMult<<<blocks_num, THREAD_NUM, 0>>>(d_a, d_b, d_c, n);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "matMult launch");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("%f\n", elapsedTime);

    // Download the result matrix.
    checkCuda(cudaMemcpy(c, d_c, bytes, cudaMemcpyDeviceToHost), "copy c to host");

    // Release device resources, events, and host buffers.
    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    free(a);
    free(b);
    free(c);

    return 0;
}