A handy CUDA program for matrix transposition
This first example is one I adapted myself from the transpose example in the CUDA samples.
#include <stdio.h>

#define BLOCK_DIM 5

// Transpose kernel (see the transpose CUDA Sample for details)
__global__ void d_transpose(float *odata, float *idata, int width, int height)
{
    // the extra column of padding avoids shared memory bank conflicts
    __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];

    // read the matrix tile into shared memory
    unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x;
    unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((xIndex < width) && (yIndex < height))
    {
        unsigned int index_in = yIndex * width + xIndex;
        block[threadIdx.y][threadIdx.x] = idata[index_in];
    }

    __syncthreads();

    // write the transposed matrix tile to global memory
    xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x;
    yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((xIndex < height) && (yIndex < width))
    {
        unsigned int index_out = yIndex * height + xIndex;
        odata[index_out] = block[threadIdx.x][threadIdx.y];
    }
}
void print_arr(float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}
int iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}
/*
    Transpose a 2D array (see the SDK transpose example)
*/
extern "C"
void transpose(float *d_src, float *d_dest, int width, int height)
{
    dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1);
    dim3 threads(BLOCK_DIM, BLOCK_DIM, 1);
    d_transpose<<<grid, threads>>>(d_dest, d_src, width, height);
}
int main()
{
    const int nx = 32;
    const int ny = 32;
    const int mem_size = nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_cdata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);

    float *d_idata, *d_cdata, *d_tdata;
    cudaMalloc(&d_idata, mem_size);
    cudaMalloc(&d_cdata, mem_size);
    cudaMalloc(&d_tdata, mem_size);

    // fill the ny x nx input matrix; every element of row j gets the value j + 0.1
    for (int j = 0; j < ny; j++) {
        for (int i = 0; i < nx; i++) {
            h_idata[j * nx + i] = j + 0.1f; // alternatively: j * nx + i
        }
    }

    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);
    transpose(d_idata, d_tdata, nx, ny);
    cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost);

    print_arr(h_idata, ny, nx, "original data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    cudaFree(d_tdata);
    cudaFree(d_cdata);
    cudaFree(d_idata);
    free(h_idata);
    free(h_tdata);
    free(h_cdata);
    return 0;
}
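Reading two printed 32x32 matrices by eye gets tedious, so a small host-side check can be added after copying the result back. The sketch below is my own addition, not part of the CUDA sample; check_transpose is just an illustrative name, and it assumes the h_idata/h_tdata buffers and nx/ny sizes from main() above.

// Minimal host-side verification sketch (assumed helper, not from the sample):
// element (y, x) of the width x height input must appear at (x, y) in the output,
// whose rows have length `height` after the transpose.
int check_transpose(const float *in, const float *out, int width, int height)
{
    for (int y = 0; y < height; y++)
        for (int x = 0; x < width; x++)
            if (out[x * height + y] != in[y * width + x])
                return 0; // mismatch found
    return 1; // all elements match
}
// Usage, after the cudaMemcpy back to h_tdata:
//     printf("transpose %s\n", check_transpose(h_idata, h_tdata, nx, ny) ? "OK" : "FAILED");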
There is another program that I find rather strange; I still haven't figured out how it is meant to be used. If anyone knows, I'd be glad to discuss it.
#include <stdio.h>
#include <cuda.h>

const int TILE_DIM = 16;
const int BLOCK_ROWS = 8;

void print_arr(float a[], int row, int col, const char *info)
{
    printf("%s\n", info);
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < col; j++) {
            printf("%f ", a[i * col + j]);
        }
        printf("\n");
    }
}

// Naive transpose: coalesced reads, strided writes; derives the row length from
// gridDim.x, so it assumes a square matrix whose side is a multiple of TILE_DIM
__global__ void transposeNaive(float *odata, const float *idata)
{
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    int width = gridDim.x * TILE_DIM;

    // each block has TILE_DIM x BLOCK_ROWS threads but covers a TILE_DIM x TILE_DIM
    // tile, so every thread handles TILE_DIM / BLOCK_ROWS elements
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
        odata[x * width + (y + j)] = idata[(y + j) * width + x];
}
int main()
{
    const int nx = 32;
    const int ny = 32;
    const int mem_size = nx * ny * sizeof(float);

    float *h_idata = (float *)malloc(mem_size);
    float *h_cdata = (float *)malloc(mem_size);
    float *h_tdata = (float *)malloc(mem_size);

    float *d_idata, *d_cdata, *d_tdata;
    cudaMalloc(&d_idata, mem_size);
    cudaMalloc(&d_cdata, mem_size);
    cudaMalloc(&d_tdata, mem_size);

    // every element of row j gets the value j (j % 32 == j for 32 rows)
    for (int j = 0; j < ny; j++)
        for (int i = 0; i < nx; i++)
            h_idata[j * nx + i] = j % 32; // alternatively: j * nx + i

    cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

    dim3 dimGrid(nx / TILE_DIM, ny / TILE_DIM, 1);
    dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);

    cudaMemset(d_tdata, 0, mem_size);
    transposeNaive<<<dimGrid, dimBlock>>>(d_tdata, d_idata);
    cudaMemcpy(h_tdata, d_tdata, mem_size, cudaMemcpyDeviceToHost);

    print_arr(h_idata, ny, nx, "original data is");
    print_arr(h_tdata, nx, ny, "transposed data is");

    cudaFree(d_tdata);
    cudaFree(d_cdata);
    cudaFree(d_idata);
    free(h_idata);
    free(h_tdata);
    free(h_cdata);
    return 0;
}
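As far as I can tell, this looks like the transposeNaive kernel from NVIDIA's "An Efficient Matrix Transpose in CUDA C/C++" blog post. The part that looks odd is that each block launches only TILE_DIM x BLOCK_ROWS = 16x8 threads yet is responsible for a full 16x16 tile: the j loop inside the kernel makes every thread copy TILE_DIM / BLOCK_ROWS = 2 elements, which gives each thread more work per index calculation. Because the kernel derives the row length from gridDim.x * TILE_DIM, it only handles square matrices whose side is a multiple of TILE_DIM. A small launch helper that makes these assumptions explicit might look like the sketch below (transpose_naive is my own name, not from the original code):

// Illustrative wrapper (assumption: square n x n matrix, n a multiple of TILE_DIM)
void transpose_naive(float *d_dest, const float *d_src, int n)
{
    // one block per TILE_DIM x TILE_DIM tile of the matrix ...
    dim3 grid(n / TILE_DIM, n / TILE_DIM, 1);
    // ... but only TILE_DIM x BLOCK_ROWS threads per block; the kernel's j loop
    // covers the remaining TILE_DIM / BLOCK_ROWS rows of each tile
    dim3 block(TILE_DIM, BLOCK_ROWS, 1);
    transposeNaive<<<grid, block>>>(d_dest, d_src);
}
// e.g. transpose_naive(d_tdata, d_idata, 32); would replace the hard-coded launch in main()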