Add CUDA complex(real, imag) construction and unit test.

This commit is contained in:
sunwen
2023-12-19 13:31:05 +08:00
parent 81078bd69f
commit 3b8dbe0f31
3 changed files with 37 additions and 0 deletions

View File

@@ -39,6 +39,16 @@ __global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aS
}
}
/**
 * Interleaves separate real and imaginary arrays into one output array:
 * aOutput[2*i] = aInputRealData[i], aOutput[2*i + 1] = aInputImagData[i].
 *
 * Launch layout: only the .x components of blockIdx/blockDim/threadIdx are
 * used, one thread per input element; threads whose index is >= aSize do
 * nothing, so the grid may over-cover the data.
 *
 * @param aInputRealData  real parts, at least aSize floats
 * @param aInputImagData  imaginary parts, at least aSize floats
 * @param aOutput         destination, at least 2 * aSize floats
 * @param aSize           number of (real, imag) pairs to write
 */
__global__ void complexKernel(float* aInputRealData, float* aInputImagData, float* aOutput, unsigned int aSize)
{
    // Index math is done in size_t: with 32-bit arithmetic, blockIdx.x * blockDim.x
    // can wrap in the last block for aSize close to UINT_MAX, and 2 * idx wraps
    // as soon as idx >= 2^31, which would scatter writes to the wrong slots.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < aSize)
    {
        const size_t outIdx = 2 * idx;
        aOutput[outIdx] = aInputRealData[idx];
        aOutput[outIdx + 1] = aInputImagData[idx];
    }
}
CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
{
if(aMatrix.isComplex())
@@ -55,6 +65,22 @@ CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
}
/**
 * Builds a complex matrix from separate real and imaginary matrices by
 * interleaving their elements on the device (real, imag, real, imag, ...).
 *
 * @param aReal  matrix supplying the real parts; must not be complex
 * @param aImag  matrix supplying the imaginary parts; must not be complex and
 *               must have the same data size as aReal
 * @return a matrix with aReal's three dimension sizes and value type
 *         Aurora::Complex; a default-constructed CudaMatrix if the inputs are
 *         invalid, the element count does not fit the kernel's unsigned int
 *         size parameter, or any CUDA call fails.
 */
CudaMatrix Aurora::complex(const CudaMatrix& aReal, const CudaMatrix& aImag)
{
    if (aReal.isComplex() || aImag.isComplex() || aReal.getDataSize() != aImag.getDataSize())
    {
        return CudaMatrix();
    }

    const size_t size = aReal.getDataSize();

    // The kernel takes its element count as unsigned int; refuse sizes that
    // would be silently truncated and leave part of the output unwritten.
    const unsigned int kernelSize = static_cast<unsigned int>(size);
    if (static_cast<size_t>(kernelSize) != size)
    {
        return CudaMatrix();
    }

    // Aurora::Complex is used as the floats-per-element multiplier
    // (presumably 2, matching the two floats the kernel writes per element
    // -- TODO confirm against the enum definition).
    float* data = nullptr;
    if (cudaMalloc((void**)&data, sizeof(float) * size * Aurora::Complex) != cudaSuccess)
    {
        return CudaMatrix();
    }

    // A grid of 0 blocks is an invalid launch configuration, so only launch
    // when there is something to convert.
    if (size > 0)
    {
        const unsigned int blocksPerGrid =
            static_cast<unsigned int>((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        complexKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aReal.getData(), aImag.getData(), data, kernelSize);

        // Launch-configuration errors surface through cudaGetLastError();
        // execution errors surface at the synchronizing call.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess)
        {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess)
        {
            // Do not hand back a buffer the kernel never (fully) filled.
            cudaFree(data);
            return CudaMatrix();
        }
    }

    // NOTE(review): fromRawData presumably takes ownership of `data` -- confirm.
    return Aurora::CudaMatrix::fromRawData(data, aReal.getDimSize(0), aReal.getDimSize(1), aReal.getDimSize(2), Aurora::Complex);
}
__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;