Add CUDA ifft_symmetric and unit test.

This commit is contained in:
sunwen
2023-12-14 17:57:53 +08:00
parent 5f906f78b8
commit d070edfef7
3 changed files with 54 additions and 3 deletions

View File

@@ -1337,12 +1337,12 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
if (aMatrix.isComplex()){
if (aMatrix.isComplex()){
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
}
else{
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
}
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
}
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,0);
return ret;
@@ -1503,4 +1503,35 @@ CudaMatrix Aurora::sub2ind(const CudaMatrix &aVMatrixSize, std::vector<CudaMatri
cudaFree(indexMatrixData);
delete[] tempPointer;
return CudaMatrix::fromRawData(data, indexMatrixRows);
}
/**
 * Mirrors a run of interleaved float pairs into the upper part of the same
 * buffer, negating the second float of every pair (i.e. a complex conjugate
 * when the pairs are {re, im}, which is how the caller in this file uses it).
 *
 * For every i in [0, aMatrixDataSize):
 *     pair[aMatrixDataSize + 2 + i] = conj(pair[aMatrixDataSize - i])
 *
 * Launch layout: 1D grid of 1D blocks; one thread handles one pair, threads
 * past aMatrixDataSize do nothing. No shared memory is used.
 *
 * Preconditions: aMatrix must be a device buffer holding at least
 * 2 * (2 * aMatrixDataSize + 2) floats. Pairs are read from indices
 * [1, aMatrixDataSize] and written to [aMatrixDataSize + 2, 2 * aMatrixDataSize + 1],
 * so the source and destination ranges never overlap.
 *
 * @param aMatrix          device buffer of interleaved float pairs, updated in place
 * @param aMatrixDataSize  number of pairs to mirror
 */
__global__ void ifft_symmetricKernel(float* aMatrix, unsigned int aMatrixDataSize)
{
    const unsigned int pairIdx = threadIdx.x + blockDim.x * blockIdx.x;
    if (pairIdx >= aMatrixDataSize)
    {
        // Tail threads of the last block have nothing to mirror.
        return;
    }

    // Source walks downwards from pair aMatrixDataSize, destination walks
    // upwards from pair aMatrixDataSize + 2.
    const unsigned int srcOffset = (aMatrixDataSize - pairIdx) * 2;
    const unsigned int dstOffset = (aMatrixDataSize + pairIdx + 2) * 2;

    const float* src = aMatrix + srcOffset;
    float* dst = aMatrix + dstOffset;

    dst[0] = src[0];    // first component copied unchanged
    dst[1] = -src[1];   // second component negated
}
/**
 * Inverse FFT of a vector that is treated as the leading half of a
 * conjugate-symmetric spectrum; only the real part of the result is returned.
 *
 * Steps:
 *   1. allocate a zeroed device buffer of aLength interleaved complex values,
 *   2. copy the first aLength floats of aMatrix into it,
 *   3. fill the upper elements with the mirrored conjugates of elements
 *      1 .. aLength/2 - 1 (ifft_symmetricKernel),
 *   4. run ifft on the buffer and return real(...) of it.
 *
 * NOTE(review): the copy reads sizeof(float) * aLength bytes from
 * aMatrix.getData() without checking the size of aMatrix — presumably the
 * caller guarantees aMatrix holds at least aLength / 2 interleaved complex
 * values; confirm against callers.
 * NOTE(review): the mirror indexing lines up with conjugate symmetry for even
 * aLength; for odd aLength the last element is left at zero — confirm whether
 * odd lengths are expected to be supported.
 *
 * @param aMatrix  input vector (non-vectors are rejected with an error message)
 * @param aLength  number of complex points of the inverse transform; must be > 0
 * @return real part of the inverse FFT, or an empty CudaMatrix on invalid input
 *         or allocation failure
 */
CudaMatrix Aurora::ifft_symmetric(const CudaMatrix &aMatrix, long aLength)
{
    if(!aMatrix.isVector())
    {
        std::cerr<<"cuda ifft_symmetric only support vector!"<<std::endl;
        return CudaMatrix();
    }
    if(aLength <= 0)
    {
        // A non-positive length would wrap the unsigned size computations below.
        std::cerr<<"cuda ifft_symmetric need a positive length!"<<std::endl;
        return CudaMatrix();
    }

    // Two floats (re, im) per complex output element.
    const size_t floatCount = static_cast<size_t>(aLength) * 2;
    const size_t byteCount = sizeof(float) * floatCount;

    float* data = nullptr;
    if(cudaMalloc((void **)&data, byteCount) != cudaSuccess)
    {
        std::cerr<<"cuda ifft_symmetric alloc device memory fail!"<<std::endl;
        return CudaMatrix();
    }

    // cudaMemset counts BYTES, not floats: the whole buffer has to be cleared
    // because element aLength / 2 is written neither by the copy below nor by
    // the mirror kernel and must therefore already be zero.
    cudaMemset(data, 0, byteCount);
    cudaMemcpy(data, aMatrix.getData(), sizeof(float) * aLength, cudaMemcpyDeviceToDevice);

    // Number of elements to mirror. Skip the launch when there is nothing to
    // do: a non-positive count would wrap around when converted to the
    // kernel's unsigned parameter, and a zero-block grid is an invalid launch.
    const long mirrorCount = aLength / 2 - 1;
    if(mirrorCount > 0)
    {
        const unsigned int blocksPerGrid =
            static_cast<unsigned int>((mirrorCount + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        ifft_symmetricKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(data, static_cast<unsigned int>(mirrorCount));
    }
    cudaDeviceSynchronize();
    return real(ifft(CudaMatrix::fromRawData(data,aLength,1,1,Complex)));
}