#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cufft.h>
#include <opencv.hpp>
#define NX 3
#define NY 5
#define BATCH 1
#define NRANK 2
using namespace cv;
using std::cout;
using std::endl;
/*
 * Multiplies complex elements of idata by the real factor `scale` and stores
 * the products in odata.
 *
 * Each thread handles at most one element, located at the flat index
 * blockIdx.x * blockDim.x + threadIdx.x; threads whose index is not below
 * `size` do nothing. The launch therefore has to supply at least `size`
 * threads along x for every element to be processed.
 *
 * idata and odata may refer to the same buffer (main() calls it in place).
 */
static __global__ void cufftComplexScale(cufftComplex *idata, cufftComplex *odata, const int size, float scale)
{
    const int elem = threadIdx.x + blockDim.x * blockIdx.x;

    /* Surplus threads past the end of the array have nothing to do. */
    if (elem >= size)
        return;

    /* Read the whole element first so the in-place case stays well defined. */
    const cufftComplex value = idata[elem];

    cufftComplex scaled;
    scaled.x = value.x * scale;
    scaled.y = value.y * scale;

    odata[elem] = scaled;
}
/* Prints a diagnostic and terminates the process when a CUDA runtime call failed. */
static void checkCudaStatus(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Prints a diagnostic and terminates the process when a cuFFT call failed. */
static void checkCufftStatus(cufftResult status, const char *what)
{
    if (status != CUFFT_SUCCESS)
    {
        fprintf(stderr, "cuFFT error in %s: code %d\n", what, (int)status);
        exit(EXIT_FAILURE);
    }
}

/*
 * Round-trip demo: fills an NX x NY complex array with 0, 1, 2, ..., runs an
 * in-place forward and inverse 2-D C2C FFT on the GPU, rescales the result by
 * 1 / (NX * NY) and prints every element as "real imag".
 *
 * Returns 0 on success; any CUDA/cuFFT/allocation failure prints a message to
 * stderr and exits with EXIT_FAILURE.
 */
int main()
{
    /* One transform holds NX * NY elements; the plan is built for BATCH of them. */
    const int elemsPerTransform = NX * NY;
    const int totalElems = elemsPerTransform * BATCH;
    const size_t totalBytes = sizeof(cufftComplex) * totalElems;

    int n[NRANK] = { NX, NY };

    cufftComplex *Host_data = (cufftComplex *)malloc(totalBytes);
    if (Host_data == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)totalBytes);
        return EXIT_FAILURE;
    }

    cufftComplex *Device_data = NULL;
    checkCudaStatus(cudaMalloc((void **)&Device_data, totalBytes), "cudaMalloc");

    /*
     * Element k holds the real value k. A flat loop is used on purpose: with
     * n = { NX, NY } the array is NX rows of NY elements, so indexing it as
     * NY rows of NX would only be correct by accident.
     */
    for (int k = 0; k < totalElems; k++)
    {
        Host_data[k].x = (float)k;
        Host_data[k].y = 0.0f;
    }

    checkCudaStatus(cudaMemcpy(Device_data, Host_data, totalBytes, cudaMemcpyHostToDevice),
                    "cudaMemcpy (host to device)");

    /* NULL inembed/onembed selects the default contiguous layout. */
    cufftHandle plan;
    checkCufftStatus(cufftPlanMany(&plan, NRANK, n,
                                   NULL, 1, 0,
                                   NULL, 1, 0,
                                   CUFFT_C2C, BATCH),
                     "cufftPlanMany");

    checkCufftStatus(cufftExecC2C(plan, Device_data, Device_data, CUFFT_FORWARD),
                     "cufftExecC2C (forward)");
    checkCufftStatus(cufftExecC2C(plan, Device_data, Device_data, CUFFT_INVERSE),
                     "cufftExecC2C (inverse)");

    /*
     * cuFFT transforms are unnormalized, so forward + inverse multiplies the
     * data by NX * NY. Use a fixed block size with a ceil-div grid so the
     * launch stays valid when the array exceeds the per-block thread limit.
     */
    const int threadsPerBlock = 256;
    const int blockCount = (totalElems + threadsPerBlock - 1) / threadsPerBlock;
    cufftComplexScale<<<blockCount, threadsPerBlock>>>(Device_data, Device_data, totalElems,
                                                       1.0f / elemsPerTransform);
    checkCudaStatus(cudaGetLastError(), "cufftComplexScale launch");
    /* Synchronize before the copy so a kernel fault is reported against the kernel. */
    checkCudaStatus(cudaDeviceSynchronize(), "cufftComplexScale execution");

    checkCudaStatus(cudaMemcpy(Host_data, Device_data, totalBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy (device to host)");

    checkCufftStatus(cufftDestroy(plan), "cufftDestroy");
    checkCudaStatus(cudaFree(Device_data), "cudaFree");

    for (int k = 0; k < totalElems; k++)
    {
        printf("%f %f\n", Host_data[k].x, Host_data[k].y);
    }

    free(Host_data);

#ifdef _WIN32
    /* Keep the console window open when started from an IDE; "pause" exists only on Windows. */
    system("pause");
#endif
    return 0;
}