1 #include <stdio.h>
2 #include "gputimer.h"
3 #include "cuda_runtime.h"
4 #include "device_launch_parameters.h"
5 #include <stdlib.h>
6
// Edge length of the square matrix; every buffer in this file holds N*N floats.
static const int N = 1024;
// Edge length of the square thread block used by the per-element launch.
static const int K = 32;
9
// Initializes an N*N host matrix so that each element holds its own linear
// index converted to float (mat[0] = 0, mat[1] = 1, ...).
//   mat: buffer with room for at least N*N floats.
void fill_matrix(float * mat){
    const int count = N * N;
    float *cursor = mat;
    for (int value = 0; value < count; ++value, ++cursor)
        *cursor = (float)value;
}
14
// Writes an N*N matrix to stdout, one matrix row per output line.
// Element (row, col) is read from mat[col + row*N].
void print_matrix(float *mat)
{
    for (int row = 0; row < N; ++row) {
        const float *line = mat + row * N;   // start of the current row
        for (int col = 0; col < N; ++col)
            printf("%4.4g ", line[col]);
        printf("\n");
    }
}
23
// Transposes the N*N matrix `in` into `out` with two nested loops.
// There is no use of thread or block indices, so every thread that executes
// this kernel performs the complete transpose on its own.
//   in:  source matrix, N*N floats.
//   out: destination matrix, N*N floats; out[r + c*N] receives in[c + r*N].
__global__ void transpose_serial(float in[], float out[]){
    for (int row = 0; row < N; ++row) {
        const float *src = in + row * N;   // row `row` of the input
        for (int col = 0; col < N; ++col)
            out[row + col * N] = src[col];
    }
}
29
// Transposes the N*N matrix `in` into `out`, one thread per output row.
// Thread i walks j = 0..N-1 and writes out[j + i*N] = in[i + j*N].
//
// Launch layout: a 1D grid of 1D blocks covering at least N threads in total.
// The row index is derived from both the block and the thread index, so the
// work may be split across several blocks (the original form only worked for
// a single block of exactly N threads). Threads whose index falls beyond the
// matrix do nothing.
//   in:  source matrix, N*N floats.
//   out: destination matrix, N*N floats.
__global__ void transpose_parallel_per_row(float in[], float out[]){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against launches whose total thread count exceeds N.
    if (i >= N)
        return;

    for (int j = 0; j < N; j++)
        out[j + i*N] = in[i + j*N];
}
36
// Transposes the N*N matrix `in` into `out`, one thread per element.
// The thread at global coordinates (i, j) writes out[j + i*N] = in[i + j*N].
//
// Launch layout: a 2D grid of 2D blocks covering at least N threads in each
// dimension. Coordinates are computed from blockDim rather than a fixed tile
// constant, so any block shape is valid, and threads that land outside the
// matrix are masked off instead of accessing memory out of bounds.
//   in:  source matrix, N*N floats.
//   out: destination matrix, N*N floats.
__global__ void transpose_parallel_per_element(float in[], float out[]){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < N && j < N)
        out[j + i*N] = in[i + j*N];
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
//   status: value returned by the CUDA runtime call.
//   what:   short description of the operation, used in the error message.
static void check_cuda_status(cudaError_t status, const char *what){
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Runs the three transpose kernels on an N*N matrix, copies each result back
// to the host and prints the time reported by GpuTimer for each kernel.
// Returns 0 on success; exits with a failure status if a host allocation or
// any CUDA call fails.
int main(void){
    // size_t avoids squeezing a byte count through a signed int.
    const size_t numbytes = (size_t)N * N * sizeof(float);

    float *in = (float *)malloc(numbytes);
    float *out = (float *)malloc(numbytes);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)numbytes);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }
    fill_matrix(in);

    float *d_in = NULL, *d_out = NULL;

    check_cuda_status(cudaMalloc((void **)&d_in, numbytes), "cudaMalloc(d_in)");
    check_cuda_status(cudaMalloc((void **)&d_out, numbytes), "cudaMalloc(d_out)");
    check_cuda_status(cudaMemcpy(d_in, in, numbytes, cudaMemcpyHostToDevice),
                      "copy of input to device");

    GpuTimer timer;

    // Launch errors are queried after timer.Stop() so the check never sits
    // between the launch and the end of the timed region. The blocking
    // device-to-host copy that follows reports any fault raised while the
    // kernel was executing.
    timer.Start();
    transpose_serial<<<1, 1>>>(d_in, d_out);
    timer.Stop();
    check_cuda_status(cudaGetLastError(), "transpose_serial launch");
    check_cuda_status(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                      "transpose_serial result copy");
    printf("transpose_serial:%g ms.\n", timer.Elapsed());

    timer.Start();
    transpose_parallel_per_row<<<1, N>>>(d_in, d_out);
    timer.Stop();
    check_cuda_status(cudaGetLastError(), "transpose_parallel_per_row launch");
    check_cuda_status(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                      "transpose_parallel_per_row result copy");
    printf("transpose_parallel_per_row:%g ms.\n", timer.Elapsed());

    // One K x K block per K x K tile of the matrix.
    dim3 blocks(N / K, N / K);
    dim3 threads(K, K);
    timer.Start();
    transpose_parallel_per_element<<<blocks, threads>>>(d_in, d_out);
    timer.Stop();
    check_cuda_status(cudaGetLastError(), "transpose_parallel_per_element launch");
    check_cuda_status(cudaMemcpy(out, d_out, numbytes, cudaMemcpyDeviceToHost),
                      "transpose_parallel_per_element result copy");
    printf("transpose_parallel_per_element: %g ms.\n", timer.Elapsed());

    check_cuda_status(cudaFree(d_in), "cudaFree(d_in)");
    check_cuda_status(cudaFree(d_out), "cudaFree(d_out)");

    // The host buffers were previously leaked.
    free(in);
    free(out);

    return 0;
}