From 382fd18aa0f35792cc8d7c4c157292d36cdeafac Mon Sep 17 00:00:00 2001 From: kradchen Date: Fri, 8 Dec 2023 16:20:13 +0800 Subject: [PATCH] Add fft,ifft,fftshift,ifftshift and their unittest --- CMakeLists.txt | 2 +- src/Function2D.cu | 188 ++++++++++++++++++++++++++++++++++ src/Function2D.cuh | 6 ++ test/Function2D_Cuda_Test.cpp | 118 ++++++++++++++++++++- 4 files changed, 311 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f23d332..b6469cb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,7 +74,7 @@ if (Aurora_USE_CUDA) target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include) set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_options(Aurora_Test PRIVATE $<$: - -arch=sm_75 --expt-extended-lambda -Icub/ + -arch=sm_75 --expt-extended-lambda >) target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) target_link_libraries(Aurora_Test PRIVATE ${CUDA_cublas_LIBRARY}) diff --git a/src/Function2D.cu b/src/Function2D.cu index 5cf6a06..cac7f79 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -26,6 +26,8 @@ #include "Function1D.cuh" #include "Matrix.h" +#include "cufft.h" +#include using namespace Aurora; namespace @@ -1051,3 +1053,189 @@ CudaMatrix Aurora::inv(const CudaMatrix &aMatrix) return CudaMatrix::fromRawData(data, n, n); } + + +/** + * @brief + * + * + * @param aMatrix + * @param aDirection 0 FORWARD, 1 BACKWARD + */ +void ExecFFT(CudaMatrix& aMatrix,int aDirection){ + cufftHandle plan; + cudaStream_t stream = NULL; + int batch_size = aMatrix.getDimSize(1); + cufftComplex *d_data = (cufftComplex *)aMatrix.getData(); + cufftCreate(&plan); + // int gpus[1] ={0}; + // cufftXtSetGPUs(plan,1,gpus); + cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size); + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + cufftSetStream(plan, stream); + cufftExecC2C(plan, d_data, d_data, aDirection==0?CUFFT_FORWARD:CUFFT_INVERSE); + cudaStreamSynchronize(stream); + cufftDestroy(plan); + cudaStreamDestroy(stream); +} + +__global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int aCopySize, + unsigned int aSrcColEleCount, unsigned int aDesColEleCount) +{ + unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; + unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; + + for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) + { + if(threadIdx.x + offset< aCopySize){ + aOutput[2*idx_d] = aInputData[idx_s]; + aOutput[2*idx_d + 1] = 0; + } + else if(threadIdx.x + offset< aDesColEleCount){ + aOutput[2*idx_d] = 0; + aOutput[2*idx_d + 1] = 0; + } + else{ + return; + } + } +} + +__global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int aCopySize, + unsigned int aSrcColEleCount, unsigned int aDesColEleCount) +{ + unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; + unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; + + for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) + { + if(threadIdx.x + offset< aCopySize){ + aOutput[2*idx_d] = aInputData[idx_s*2]; + aOutput[2*idx_d + 1] = aInputData[idx_s*2+1]; + } + else if(threadIdx.x + offset< aDesColEleCount){ + aOutput[2*idx_d] = 0; + aOutput[2*idx_d + 1] = 0; + } + else{ + return; + } + } +} + +CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){ + size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0); + //实际需要copy赋值的非0值 + size_t needCopySize = (ColEleCount>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); + ExecFFT(ret,0); + return ret; +} + +CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){ + if (!aMatrix.isComplex()){ + std::cerr<<"ifft input must be complex value"<0)?aFFTSize:aMatrix.getDimSize(0); + //实际需要copy赋值的非0值 + size_t needCopySize = (ColEleCount>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); + ExecFFT(ret,1); + float colEleCountf = 1.f/ColEleCount; + auto lambda = [=]__device__(const float& v){ + return v*colEleCountf; + }; + thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize()*2,ret.getData(),lambda); + return ret; +} + +__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){ + unsigned int idx = blockIdx.x *aColEleCount + threadIdx.x; + unsigned int stride = aColEleCount/2; + for (int i = 0; i < stride; i+=blockDim.x) + { + if (threadIdx.x + i < stride) + { + float temp = aData[idx]; + aData[idx] = aData[idx+stride]; + aData[idx+stride] = temp; + } + } +} + +__global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned int aCopySize, + unsigned int aSrcColEleCount,unsigned int aDesColEleCount){ + unsigned int idx = blockIdx.x *aSrcColEleCount + threadIdx.x; + unsigned int idx2 = blockIdx.x *aDesColEleCount + threadIdx.x; + for (int i = 0; i < aCopySize; i+=blockDim.x) + { + if (threadIdx.x + i < aCopySize) + { + aOutputData[idx2+i] = aInputData[idx+i]; + } + } +} + +void Aurora::fftshift(CudaMatrix &aMatrix){ + if (aMatrix.getDimSize(0) % 2 == 0) { + fftshiftSwapKernel<<>>( + aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); + } else { + int copySize = aMatrix.getDimSize(0) / 2 + 1; + float *data = nullptr; + cudaMalloc((void **)&data, copySize * aMatrix.getValueType()); + memcpyColKernel<<>>( + aMatrix.getData(), data, copySize * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType(), + copySize * aMatrix.getValueType()); + memcpyColKernel<<>>( + aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(), + (copySize - 1) * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType()); + memcpyColKernel<<>>( + data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), + copySize * aMatrix.getValueType(), + copySize * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType()); + cudaFree(data); + } +} + +void Aurora::ifftshift(CudaMatrix &aMatrix){ + if (aMatrix.getDimSize(0) % 2 == 0) { + fftshiftSwapKernel<<>>( + aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); + } else { + int copySize = aMatrix.getDimSize(0) / 2 + 1; + float *data = nullptr; + cudaMalloc((void **)&data, copySize * aMatrix.getValueType()); + memcpyColKernel<<>>( + aMatrix.getData()+(copySize-1)* aMatrix.getValueType(), + data, copySize * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType(), + copySize * aMatrix.getValueType()); + memcpyColKernel<<>>( + aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(), + (copySize-1) * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType()); + memcpyColKernel<<>>( + data, aMatrix.getData(), + copySize * aMatrix.getValueType(), + copySize * aMatrix.getValueType(), + aMatrix.getDimSize(0) * aMatrix.getValueType()); + cudaFree(data); + } +} \ No newline at end of file diff --git a/src/Function2D.cuh b/src/Function2D.cuh index 0501f03..bfaa095 100644 --- a/src/Function2D.cuh +++ b/src/Function2D.cuh @@ -25,6 +25,7 @@ namespace Aurora * @return CudaMatrix */ CudaMatrix mean(const CudaMatrix &aMatrix, FunctionDirection direction = Column); + CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column); CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column); @@ -44,6 +45,11 @@ namespace Aurora CudaMatrix inv(CudaMatrix &&aMatrix); + CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1); + CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1); + + void fftshift(CudaMatrix &aMatrix); + void ifftshift(CudaMatrix &aMatrix); } #endif // __FUNCTION2D_CUDA_H__ \ No newline at end of file diff --git a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp index 27076bb..c0329b7 100644 --- a/test/Function2D_Cuda_Test.cpp +++ b/test/Function2D_Cuda_Test.cpp @@ -40,9 +40,9 @@ TEST_F(Function2D_Cuda_Test, min) dB = B.toDeviceMatrix(); long r,c; - auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c); + auto ret1 = Aurora::min(B, Aurora::Column,r,c); - auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); + auto ret2 = Aurora::min(dB, Aurora::Column,r,c); ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); @@ -615,6 +615,120 @@ TEST_F(Function2D_Cuda_Test, sort) } } +TEST_F(Function2D_Cuda_Test, fft) +{ + { + float *input = new float[20]{1,1,0,2,2,0,1,1,0,2,1,1,0,2,2,0,1,1,0,2}; + auto ma = Aurora::Matrix::fromRawData(input,10,2).toDeviceMatrix(); + auto fftrett = Aurora::fft(ma); + auto fftrettH = fftrett.toHostMatrix(); + std::complex* result = (std::complex*)fftrettH.getData(); + //检验fft结果与matlab是否对应 + EXPECT_FLOAT_EQ(0.0729, fourDecimalRound(result[1].real())); + EXPECT_FLOAT_EQ(2.4899, fourDecimalRound(result[2].imag())); + EXPECT_FLOAT_EQ(0.0729, fourDecimalRound(result[11].real())); + EXPECT_FLOAT_EQ(2.4899, fourDecimalRound(result[12].imag())); + //检验fft的结果是否共轭 + EXPECT_FLOAT_EQ(0, result[4].imag()+result[6].imag()); + EXPECT_FLOAT_EQ(0, result[4].real()-result[6].real()); + + + auto ret= Aurora::ifft(fftrett); + auto retH = ret.toHostMatrix(); + std::complex* ifftResult = (std::complex*)retH.getData(); + EXPECT_FLOAT_EQ(fourDecimalRound(ifftResult[1].real()),1.0); + EXPECT_FLOAT_EQ(fourDecimalRound(ifftResult[3].real()),2.0); + EXPECT_FLOAT_EQ(fourDecimalRound(ifftResult[11].real()),1.0); + EXPECT_FLOAT_EQ(fourDecimalRound(ifftResult[13].real()),2.0); + // Aurora::fftshift(fftrett); + // EXPECT_FLOAT_EQ(0.0729, fourDecimalRound(result[6].real())); + // EXPECT_FLOAT_EQ(2.4899, fourDecimalRound(result[7].imag())); + // EXPECT_FLOAT_EQ(-2., fourDecimalRound(result[10].real())); + // EXPECT_FLOAT_EQ(-0.2245, fourDecimalRound(result[11].imag())); + + auto fftrett2 = Aurora::fft(ma,5); + auto fftrett2H =fftrett2.toHostMatrix(); + + auto result2 = (std::complex*)fftrett2H.getData(); + //检验fft结果与matlab是否对应 + EXPECT_FLOAT_EQ(0.3090, fourDecimalRound(result2[1].real())); + EXPECT_FLOAT_EQ(-1.3143, fourDecimalRound(result2[2].imag())); + EXPECT_FLOAT_EQ(-0.8090, fourDecimalRound(result2[7].real())); + EXPECT_FLOAT_EQ(1.3143, fourDecimalRound(result2[8].imag())); + + auto fftrett3 = Aurora::fft(ma,12); + auto fftrett3H = fftrett3.toHostMatrix(); + + auto result3 = (std::complex*)fftrett3H.getData(); + //检验fft结果与matlab是否对应 + EXPECT_FLOAT_EQ(-1.0, fourDecimalRound(result3[1].real())); + EXPECT_FLOAT_EQ(-3.4641, fourDecimalRound(result3[4].imag())); + EXPECT_FLOAT_EQ(-1.0, fourDecimalRound(result3[13].real())); + EXPECT_FLOAT_EQ(-3.4641, fourDecimalRound(result3[16].imag())); + } + { + float *input = new float[20]{1,1,0,2,2,0,1,1,0,2,1,1,0,2,2,0,1,1,0,2}; + auto ma = Aurora::Matrix::fromRawData(input,10,2); + auto maD = ma.toDeviceMatrix(); + auto ret1 = Aurora::ifft(Aurora::fft(ma),12); + auto ret2 = Aurora::ifft(Aurora::fft(maD),12); + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i)); + } + ret1 = Aurora::ifft(Aurora::fft(ma),8); + ret2 = Aurora::ifft(Aurora::fft(maD),8); + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i)); + } + + } + { + float * data = new float[250000]; + for (size_t i = 0; i < 250000; i++) + { + data[i] = i%500; + } + auto fm = Aurora::Matrix::fromRawData(data,500,250,1,Aurora::Complex); + auto fmD = fm.toDeviceMatrix(); + Aurora::fftshift(fm); + Aurora::fftshift(fmD); + for (size_t i = 0; i < fm.getDataSize()*2; i++) + { + EXPECT_FLOAT_AE(fm[i], fmD.getValue(i)); + } + Aurora::ifftshift(fm); + Aurora::ifftshift(fmD); + for (size_t i = 0; i < fm.getDataSize()*2; i++) + { + EXPECT_FLOAT_AE(fm[i], fmD.getValue(i)); + } + } + { + float * data = new float[333*500]; + for (size_t i = 0; i < 333*500; i++) + { + data[i] = i%333; + } + auto fm = Aurora::Matrix::fromRawData(data,333,250,1,Aurora::Complex); + auto fmD = fm.toDeviceMatrix(); + Aurora::fftshift(fm); + Aurora::fftshift(fmD); + for (size_t i = 0; i < fm.getDataSize()*2; i++) + { + ASSERT_FLOAT_EQ(fm[i], fmD.getValue(i))<<"index:"<