feat: improve memory usage for ifft & conj

2025-03-26 13:02:43 +08:00
parent 3ea6c84087
commit 9dd7d97237
4 changed files with 51 additions and 3 deletions
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
@@ -988,6 +988,17 @@ __global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInpu
    }
 }

+/**
+ * @brief Conjugates a buffer in place by flipping the sign of every second float.
+ *
+ * Element idx is treated as the float pair
+ * (aInputData[2 * idx], aInputData[2 * idx + 1]); only the second float of
+ * each pair is negated, the first is left untouched.
+ * NOTE(review): presumably this is the interleaved (real, imaginary) layout
+ * of complex CudaMatrix data -- confirm against CudaMatrix.
+ *
+ * @param aInputData Buffer holding at least 2 * aInputSize floats; modified in place.
+ * @param aInputSize Number of float pairs to process.
+ *
+ * Expects a 1D launch supplying at least aInputSize threads; surplus threads
+ * are masked off by the bounds check.
+ */
+__global__ void conjInplaceKernel(float *aInputData, unsigned int aInputSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aInputSize)
+    {
+        // Widen before doubling: idx * 2 wraps in 32-bit arithmetic once idx
+        // reaches 2^31, which would negate the wrong element.
+        size_t index = static_cast<size_t>(idx) * 2;
+        aInputData[index + 1] = -aInputData[index + 1];
+    }
+}
+
+
 CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
 {
    if (!aMatrix.isComplex())
@@ -1003,6 +1014,19 @@ CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }

+/**
+ * @brief Conjugate overload for an expiring matrix: complex data is conjugated
+ *        inside the matrix's own buffer and that same matrix is handed back.
+ *
+ * @param aMatrix Matrix whose buffer may be overwritten by this call.
+ * @return For non-complex input, a matrix built by copyFromRawData from the
+ *         input's data and dimensions. Otherwise aMatrix itself, after
+ *         conjInplaceKernel has run over its data.
+ */
+CudaMatrix Aurora::conj(CudaMatrix &&aMatrix)
+{
+    if (!aMatrix.isComplex())
+    {
+        return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
+    }
+    size_t size = aMatrix.getDataSize();
+    // A grid of zero blocks is an invalid launch configuration, so only
+    // launch when there is at least one element to conjugate.
+    if (size > 0)
+    {
+        int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+        conjInplaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), size);
+    }
+    cudaDeviceSynchronize();
+    // aMatrix is a named rvalue reference and therefore an lvalue here; before
+    // C++20 a plain `return aMatrix;` copy-constructs the result. Casting back
+    // to an rvalue (what std::move does) lets the result be move-constructed.
+    return static_cast<CudaMatrix &&>(aMatrix);
+}
+
 float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
 {
    float resultValue = 0;
--- a/src/Function1D.cuh
+++ b/src/Function1D.cuh
@@ -63,6 +63,8 @@ namespace Aurora

    CudaMatrix conj(const CudaMatrix& aMatrix);

+    CudaMatrix conj(CudaMatrix&& aMatrix);
+
    float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);

    CudaMatrix transpose(const CudaMatrix& aMatrix);
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
@@ -39,6 +39,9 @@ using namespace Aurora;
 namespace
 {
    const int THREADS_PER_BLOCK = 256;
+    const int FFT_FORWARD = 0;  // direction argument that fft() passes to ExecFFT
+    const int FFT_BACKWARD = 1; // direction argument that ifft() passes to ExecFFT
+
 }

 __global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize)
@@ -1454,7 +1457,7 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize)
    }
    cudaDeviceSynchronize();
    auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
-    ExecFFT(ret, 0);
+    ExecFFT(ret, FFT_FORWARD);
    return ret;
 }

@@ -1475,7 +1478,7 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize)
    complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
    cudaDeviceSynchronize();
    auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
-    ExecFFT(ret, 1);
+    ExecFFT(ret, FFT_BACKWARD);
    float colEleCountf = 1.f / ColEleCount;
    auto lambda = [=] __device__(const float &v)
    {
@@ -1485,6 +1488,24 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize)
    return ret;
 }

+/**
+ * @brief Inverse-FFT overload for an expiring matrix: the transform and the
+ *        scaling are both written back into the input's own buffer.
+ *
+ * Unlike ifft(const CudaMatrix &, long) this overload takes no FFT-size
+ * argument; it runs ExecFFT on the matrix as it is.
+ *
+ * @param aMatrix Complex matrix whose buffer is overwritten with the result.
+ * @return A default-constructed CudaMatrix (after logging to std::cerr) when
+ *         the input is not complex; otherwise aMatrix itself.
+ */
+CudaMatrix Aurora::ifft(CudaMatrix && aMatrix)
+{
+    if (!aMatrix.isComplex())
+    {
+        std::cerr << "ifft input must be complex value" << std::endl;
+        return CudaMatrix();
+    }
+    size_t ColEleCount = aMatrix.getDimSize(0);
+    ExecFFT(aMatrix, FFT_BACKWARD);
+    // Scale every float by 1 / (dimension-0 length).
+    // NOTE(review): presumably ExecFFT leaves the backward transform
+    // unnormalized -- confirm against ExecFFT.
+    float colEleCountf = 1.f / ColEleCount;
+    auto lambda = [=] __device__(const float &v)
+    {
+        return v * colEleCountf;
+    } ;
+    // getDataSize() * 2 floats are rescaled (two floats per element).
+    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize() * 2, aMatrix.getData(), lambda);
+    // aMatrix is a named rvalue reference and therefore an lvalue here; before
+    // C++20 a plain `return aMatrix;` copy-constructs the result. Casting back
+    // to an rvalue (what std::move does) lets the result be move-constructed.
+    return static_cast<CudaMatrix &&>(aMatrix);
+}
+
 __global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount)
 {
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
--- a/src/Function2D.cuh
+++ b/src/Function2D.cuh
@@ -64,6 +64,7 @@ namespace Aurora

    CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
+    CudaMatrix ifft(CudaMatrix && aMatrix);

    CudaMatrix hilbert(const CudaMatrix &aMatrix);