feat: improve memory usage of ifft & conj

This commit is contained in:
kradchen
2025-03-26 13:02:43 +08:00
parent 3ea6c84087
commit 9dd7d97237
4 changed files with 51 additions and 3 deletions

View File

@@ -988,6 +988,17 @@ __global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInpu
}
}
/**
 * In-place complex conjugate kernel.
 *
 * The buffer is treated as interleaved pairs of floats; for element idx the
 * float at offset 2 * idx + 1 has its sign flipped and the float at offset
 * 2 * idx is left untouched.
 *
 * Launch layout: 1-D grid of 1-D blocks, one thread per complex element.
 * Surplus threads (idx >= aInputSize) do nothing, so the grid may overshoot.
 *
 * @param aInputData buffer of 2 * aInputSize floats, modified in place
 * @param aInputSize number of complex elements (float pairs) in aInputData
 */
__global__ void conjInplaceKernel(float *aInputData, unsigned int aInputSize)
{
    // Widen before multiplying: in 32-bit arithmetic both the global thread
    // index and the doubled float offset can wrap for large buffers, which
    // would make a thread touch the wrong element.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < aInputSize)
    {
        const size_t imagOffset = idx * 2 + 1;
        aInputData[imagOffset] = -aInputData[imagOffset];
    }
}
CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
{
if (!aMatrix.isComplex())
@@ -1003,6 +1014,19 @@ CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
/**
 * Complex conjugate of a temporary matrix, computed inside its own buffer.
 *
 * @param aMatrix matrix the caller has given up (rvalue). For complex input its
 *                data is modified in place and the object is moved into the
 *                return value, so it must not be used afterwards.
 * @return for complex input, aMatrix with the second float of every element
 *         negated; for non-complex input, a copy built from the unchanged data.
 */
CudaMatrix Aurora::conj(CudaMatrix &&aMatrix)
{
    if (!aMatrix.isComplex())
    {
        // Nothing to negate for real data; hand back a copy of the values as-is.
        return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
    }
    size_t size = aMatrix.getDataSize();
    // A grid of 0 blocks is an invalid launch configuration, so skip the launch
    // entirely when the matrix holds no elements.
    if (size > 0)
    {
        int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        conjInplaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), size);
        cudaDeviceSynchronize();
    }
    // aMatrix is a named rvalue reference, i.e. an lvalue here: a plain
    // `return aMatrix;` copy-constructs the result before C++20. Cast back to an
    // rvalue (what std::move does, without needing an extra header) so the
    // buffer is handed over instead of duplicated.
    return static_cast<CudaMatrix &&>(aMatrix);
}
float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
{
float resultValue = 0;

View File

@@ -63,6 +63,8 @@ namespace Aurora
CudaMatrix conj(const CudaMatrix& aMatrix);
// Overload for temporaries: for complex input the definition conjugates aMatrix's own buffer in place rather than writing to a separate output.
CudaMatrix conj(CudaMatrix&& aMatrix);
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
CudaMatrix transpose(const CudaMatrix& aMatrix);

View File

@@ -39,6 +39,9 @@ using namespace Aurora;
namespace
{
    // Threads-per-block value for kernel launches.
    constexpr int THREADS_PER_BLOCK = 256;

    // Direction selectors handed to ExecFFT as its second argument.
    constexpr int FFT_FORWARD = 0;  // passed by fft()
    constexpr int FFT_BACKWARD = 1; // passed by ifft()
}
__global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize)
@@ -1454,7 +1457,7 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize)
}
cudaDeviceSynchronize();
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
ExecFFT(ret, 0);
ExecFFT(ret, FFT_FORWARD);
return ret;
}
@@ -1475,16 +1478,34 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize)
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
cudaDeviceSynchronize();
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
ExecFFT(ret, 1);
ExecFFT(ret, FFT_BACKWARD);
float colEleCountf = 1.f / ColEleCount;
auto lambda = [=] __device__(const float &v)
{
return v * colEleCountf;
};
} ;
thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
return ret;
}
/**
 * Inverse FFT of a temporary matrix, computed inside its own buffer.
 *
 * Unlike the const& overload there is no aFFTSize parameter and no staging
 * copy: aMatrix itself is passed to ExecFFT and then scaled.
 *
 * @param aMatrix complex matrix the caller has given up (rvalue). It is
 *                transformed in place and moved into the return value, so it
 *                must not be used afterwards.
 * @return the transformed matrix, every float scaled by 1 / getDimSize(0);
 *         a default-constructed CudaMatrix (after logging to std::cerr) when
 *         the input is not complex.
 */
CudaMatrix Aurora::ifft(CudaMatrix &&aMatrix)
{
    if (!aMatrix.isComplex())
    {
        std::cerr << "ifft input must be complex value" << std::endl;
        return CudaMatrix();
    }
    size_t ColEleCount = aMatrix.getDimSize(0);
    ExecFFT(aMatrix, FFT_BACKWARD);
    // Scale by 1/N after the backward transform; N is the length of dimension 0.
    float colEleCountf = 1.f / ColEleCount;
    auto lambda = [=] __device__(const float &v)
    {
        return v * colEleCountf;
    };
    // getDataSize() counts complex elements, hence * 2 to cover every float.
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize() * 2, aMatrix.getData(), lambda);
    // aMatrix is a named rvalue reference, i.e. an lvalue here: a plain
    // `return aMatrix;` copy-constructs the result before C++20. Cast back to an
    // rvalue (what std::move does, without needing an extra header) so the
    // buffer is handed over instead of duplicated.
    return static_cast<CudaMatrix &&>(aMatrix);
}
__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount)
{
unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;

View File

@@ -64,6 +64,7 @@ namespace Aurora
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
// Overload for temporaries: the definition passes aMatrix itself to ExecFFT and scales it in place; it takes no aFFTSize. Non-complex input is reported on std::cerr and a default-constructed CudaMatrix is returned.
CudaMatrix ifft(CudaMatrix && aMatrix);
CudaMatrix hilbert(const CudaMatrix &aMatrix);