Add fft,ifft,fftshift,ifftshift and their unittest

This commit is contained in:
kradchen
2023-12-08 16:20:13 +08:00
parent 1cb8d62f01
commit 382fd18aa0
4 changed files with 311 additions and 3 deletions

View File

@@ -26,6 +26,8 @@
#include "Function1D.cuh"
#include "Matrix.h"
#include "cufft.h"
#include <cufftXt.h>
using namespace Aurora;
namespace
@@ -1051,3 +1053,189 @@ CudaMatrix Aurora::inv(const CudaMatrix &aMatrix)
return CudaMatrix::fromRawData(data, n, n);
}
/**
* @brief
*
*
* @param aMatrix
* @param aDirection 0 FORWARD, 1 BACKWARD
*/
void ExecFFT(CudaMatrix& aMatrix,int aDirection){
cufftHandle plan;
cudaStream_t stream = NULL;
int batch_size = aMatrix.getDimSize(1);
cufftComplex *d_data = (cufftComplex *)aMatrix.getData();
cufftCreate(&plan);
// int gpus[1] ={0};
// cufftXtSetGPUs(plan,1,gpus);
cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cufftSetStream(plan, stream);
cufftExecC2C(plan, d_data, d_data, aDirection==0?CUFFT_FORWARD:CUFFT_INVERSE);
cudaStreamSynchronize(stream);
cufftDestroy(plan);
cudaStreamDestroy(stream);
}
__global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int aCopySize,
unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x)
{
if(threadIdx.x + offset< aCopySize){
aOutput[2*idx_d] = aInputData[idx_s];
aOutput[2*idx_d + 1] = 0;
}
else if(threadIdx.x + offset< aDesColEleCount){
aOutput[2*idx_d] = 0;
aOutput[2*idx_d + 1] = 0;
}
else{
return;
}
}
}
__global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int aCopySize,
unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x)
{
if(threadIdx.x + offset< aCopySize){
aOutput[2*idx_d] = aInputData[idx_s*2];
aOutput[2*idx_d + 1] = aInputData[idx_s*2+1];
}
else if(threadIdx.x + offset< aDesColEleCount){
aOutput[2*idx_d] = 0;
aOutput[2*idx_d + 1] = 0;
}
else{
return;
}
}
}
CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0);
//实际需要copy赋值的非0值
size_t needCopySize = (ColEleCount<aMatrix.getDimSize(0))?ColEleCount:aMatrix.getDimSize(0);
size_t bufferSize = ColEleCount*aMatrix.getDimSize(1);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,0);
return ret;
}
CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
if (!aMatrix.isComplex()){
std::cerr<<"ifft input must be complex value"<<std::endl;
return CudaMatrix();
}
size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0);
//实际需要copy赋值的非0值
size_t needCopySize = (ColEleCount<aMatrix.getDimSize(0))?ColEleCount:aMatrix.getDimSize(0);
size_t bufferSize = ColEleCount*aMatrix.getDimSize(1);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,1);
float colEleCountf = 1.f/ColEleCount;
auto lambda = [=]__device__(const float& v){
return v*colEleCountf;
};
thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize()*2,ret.getData(),lambda);
return ret;
}
__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){
unsigned int idx = blockIdx.x *aColEleCount + threadIdx.x;
unsigned int stride = aColEleCount/2;
for (int i = 0; i < stride; i+=blockDim.x)
{
if (threadIdx.x + i < stride)
{
float temp = aData[idx];
aData[idx] = aData[idx+stride];
aData[idx+stride] = temp;
}
}
}
__global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned int aCopySize,
unsigned int aSrcColEleCount,unsigned int aDesColEleCount){
unsigned int idx = blockIdx.x *aSrcColEleCount + threadIdx.x;
unsigned int idx2 = blockIdx.x *aDesColEleCount + threadIdx.x;
for (int i = 0; i < aCopySize; i+=blockDim.x)
{
if (threadIdx.x + i < aCopySize)
{
aOutputData[idx2+i] = aInputData[idx+i];
}
}
}
void Aurora::fftshift(CudaMatrix &aMatrix){
if (aMatrix.getDimSize(0) % 2 == 0) {
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
} else {
int copySize = aMatrix.getDimSize(0) / 2 + 1;
float *data = nullptr;
cudaMalloc((void **)&data, copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(),
(copySize - 1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaFree(data);
}
}
void Aurora::ifftshift(CudaMatrix &aMatrix){
if (aMatrix.getDimSize(0) % 2 == 0) {
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
} else {
int copySize = aMatrix.getDimSize(0) / 2 + 1;
float *data = nullptr;
cudaMalloc((void **)&data, copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData()+(copySize-1)* aMatrix.getValueType(),
data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(),
(copySize-1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData(),
copySize * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaFree(data);
}
}

View File

@@ -25,6 +25,7 @@ namespace Aurora
* @return CudaMatrix
*/
CudaMatrix mean(const CudaMatrix &aMatrix, FunctionDirection direction = Column);
CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column);
CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column);
@@ -44,6 +45,11 @@ namespace Aurora
CudaMatrix inv(CudaMatrix &&aMatrix);
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
void fftshift(CudaMatrix &aMatrix);
void ifftshift(CudaMatrix &aMatrix);
}
#endif // __FUNCTION2D_CUDA_H__