# CUDA -- 规约求矩阵的行和

求矩阵每行的和？

可以把每行放入一个不同线程块，这样行与行之间进行粗粒度的并行。而对于每行，其对应的线程块中分配n个线程（对应行宽），使用共享存储器，让每个线程从显存中读取一个数至shared memory中，然后使用规约算法计算和。

#include "cuda_runtime.h" //CUDA运行时API
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>

// Forward declaration: main() calls addWithCuda before its definition further down in this file.
cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d);

// Sums one matrix row per thread block with a shared-memory tree reduction.
//
// Launch layout: gridDim.x = number of rows, blockDim.x = 8 (one thread per
// column). The row width is fixed at 8 because the shared buffer and the
// reduction strides are compile-time constants.
//
// mat   - device pointer to the pitched matrix (rows are `pitch` bytes apart)
// ans   - device pointer receiving one sum per row; ans[blockIdx.x] is written
// pitch - row stride of `mat` in bytes, as returned by cudaMallocPitch
__global__ void addKernel(int *mat, int *ans, size_t pitch)
{
    const int ROW_WIDTH = 8;            // elements per row == threads per block
    const int tid = threadIdx.x;        // column handled by this thread
    const int bid = blockIdx.x;         // row handled by this block
    __shared__ int data[ROW_WIDTH];

    // Rows of a pitched allocation are `pitch` bytes apart, so step in bytes.
    int *row = (int*)((char*)mat + bid * pitch);
    data[tid] = row[tid];
    // Every element must be in shared memory before any thread reads a neighbour's slot.
    __syncthreads();

    for (int i = ROW_WIDTH / 2; i > 0; i /= 2) {
        if (tid < i)
            data[tid] = data[tid] + data[tid + i];
        // Kept outside the branch so all threads of the block reach the barrier;
        // the next step reads partial sums written by other threads in this step.
        __syncthreads();
    }

    if (tid == 0)
        ans[bid] = data[0];
}

// Builds a 4x8 test matrix, asks addWithCuda for the per-row sums and prints
// them separated by spaces. Returns 1 if any CUDA step fails, 0 otherwise.
int main()
{
    const int row = 4;
    const int col = 8;
    dim3 d(col, row);   // d.x = row width (columns), d.y = number of rows
    int mat[row][col] = { { 1, 2, 3, 4, 5, 1, 2, 3 },
                          { 6, 7, 8, 9, 10, 4, 5, 6 },
                          { 11, 12, 13, 14, 15, 7, 8, 9 },
                          { 16, 17, 18, 19, 20, 10, 11, 12 } };
    int ans[row] = { 0 };

    cudaError_t cudaStatus = addWithCuda(mat, ans, d);
    if (cudaStatus != cudaSuccess)
    {
        return 1;
    }

    // cudaDeviceReset (the replacement for the deprecated cudaThreadExit) must
    // be called before exiting in order for profiling and tracing tools such
    // as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        return 1;
    }

    for (int i = 0; i < row; i++)
    {
        std::cout << ans[i] << " ";
    }
    return 0;
}

// Focus on understanding this function.

// Forward declaration of the kernel launched below; its definition appears
// earlier in this file.
__global__ void addKernel(int *mat, int *ans, size_t pitch);

// Computes the sum of every row of a host matrix on the GPU.
//
// mat - host matrix with 8 columns per row
// ans - host output array with room for d.y ints; ans[r] receives the sum of row r
// d   - d.x is the row width in elements (must be 8 to match `mat`),
//       d.y is the number of rows (must be non-zero)
//
// Returns cudaSuccess when the sums were copied back into `ans`, otherwise the
// first CUDA error encountered (cudaErrorInvalidValue for a bad `d`). Device
// buffers and timing events are released on every path.
cudaError_t addWithCuda(int mat[4][8], int *ans, dim3 d)
{
    int *dev_mat = 0;               // pitched device copy of mat
    int *dev_ans = 0;               // device buffer with one sum per row
    // Must be a real size_t: cudaMallocPitch writes sizeof(size_t) bytes here.
    size_t pitch = 0;
    // Declared and initialised up front so no `goto Error` jumps over an initialisation.
    cudaEvent_t start = 0;
    cudaEvent_t stop = 0;
    float tm = 0.0f;
    cudaError_t cudaStatus;         // status of the most recent CUDA call

    // The host rows are 8 ints wide, so any other d.x would mis-stride the 2D copy.
    if (d.x != 8 || d.y == 0)
    {
        fprintf(stderr, "addWithCuda: expected d.x == 8 and d.y > 0\n");
        return cudaErrorInvalidValue;
    }
    const size_t rowBytes = d.x * sizeof(int);

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate device memory.
    cudaStatus = cudaMallocPitch((void**)&dev_mat, &pitch, rowBytes, d.y);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMallocPitch failed!\n");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_ans, d.y * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!\n");
        goto Error;
    }

    // Copy the input matrix to the GPU. dev_ans is not uploaded: the kernel
    // writes every element of it before it is read back.
    cudaStatus = cudaMemcpy2D(dev_mat, pitch, mat, rowBytes, rowBytes, d.y, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy for dev_mat failed!\n");
        goto Error;
    }

    // Create the timing events.
    cudaStatus = cudaEventCreate(&start);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventCreate failed!\n");
        goto Error;
    }
    cudaStatus = cudaEventCreate(&stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventCreate failed!\n");
        goto Error;
    }

    // Run the kernel: one block per row, one thread per column.
    cudaStatus = cudaEventRecord(start, 0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventRecord failed!\n");
        goto Error;
    }
    addKernel<<<d.y, d.x>>>(dev_mat, dev_ans, pitch);
    // A launch returns no status itself; configuration errors surface here.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaEventRecord(stop, 0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventRecord failed!\n");
        goto Error;
    }
    // Waits for the stop event, which is recorded after the kernel in the same
    // stream, and reports errors raised while the kernel was executing.
    cudaStatus = cudaEventSynchronize(stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventSynchronize failed after launching addKernel: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaEventElapsedTime(&tm, start, stop);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaEventElapsedTime failed!\n");
        goto Error;
    }
    printf("GPU Elapsed time:%.6f ms.\n", tm);

    // Copy the row sums from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(ans, dev_ans, d.y * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Release the timing events (only those that were created) and device memory.
    if (start)
    {
        cudaEventDestroy(start);
    }
    if (stop)
    {
        cudaEventDestroy(stop);
    }
    cudaFree(dev_mat);
    cudaFree(dev_ans);
    return cudaStatus;
}

posted @ 2019-09-11 14:53  茶飘香~  阅读(561)  评论(0编辑  收藏  举报