feat: improve memory usage for ifft & conj
This commit is contained in:
@@ -988,6 +988,17 @@ __global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInpu
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Conjugates an interleaved complex buffer in place.
 *
 * Complex element i occupies aInputData[2 * i] and aInputData[2 * i + 1];
 * only the second float of each pair is modified (its sign is flipped).
 *
 * Launch layout: 1D grid of 1D blocks, one thread per complex element.
 * Threads whose global index is >= aInputSize do nothing, so the grid may be
 * rounded up to a whole number of blocks.
 *
 * @param aInputData device-accessible buffer holding at least 2 * aInputSize floats
 * @param aInputSize number of complex elements (float pairs) to process
 */
__global__ void conjInplaceKernel(float *aInputData, unsigned int aInputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aInputSize)
    {
        return;
    }
    // Widen before doubling: idx * 2 + 1 evaluated in 32-bit unsigned arithmetic
    // wraps around once idx reaches 2^31 and would then touch the wrong element.
    size_t imagIndex = static_cast<size_t>(idx) * 2 + 1;
    aInputData[imagIndex] = -aInputData[imagIndex];
}
|
||||||
|
|
||||||
|
|
||||||
CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
|
CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
|
||||||
{
|
{
|
||||||
if (!aMatrix.isComplex())
|
if (!aMatrix.isComplex())
|
||||||
@@ -1003,6 +1014,19 @@ CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
|
|||||||
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Complex conjugate for a matrix the caller hands over as an rvalue.
 *
 * For a complex input the existing buffer is conjugated in place by
 * conjInplaceKernel and the same matrix object is handed back, so no second
 * buffer is requested by this function. For a non-complex input the result is
 * whatever CudaMatrix::copyFromRawData builds from the input's data pointer
 * and its three dimension sizes.
 *
 * @param aMatrix matrix to conjugate; its contents are overwritten when complex
 * @return the conjugated matrix
 */
CudaMatrix Aurora::conj(CudaMatrix &&aMatrix)
{
    if (!aMatrix.isComplex())
    {
        return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
    }

    size_t size = aMatrix.getDataSize();
    // A launch with zero blocks is rejected as an invalid configuration, so an
    // empty matrix skips the kernel instead of leaving an error behind.
    if (size > 0)
    {
        unsigned int blocksPerGrid = static_cast<unsigned int>((size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
        // The kernel takes the element count as unsigned int; make the narrowing explicit.
        conjInplaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), static_cast<unsigned int>(size));
    }
    cudaDeviceSynchronize();

    // aMatrix is a *named* rvalue reference, i.e. an lvalue expression: a plain
    // `return aMatrix;` selects the copy constructor before C++20. Cast back to
    // an rvalue (what std::move does) so the result is move-constructed.
    return static_cast<CudaMatrix &&>(aMatrix);
}
|
||||||
|
|
||||||
float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
|
float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
|
||||||
{
|
{
|
||||||
float resultValue = 0;
|
float resultValue = 0;
|
||||||
|
|||||||
@@ -63,6 +63,8 @@ namespace Aurora
|
|||||||
|
|
||||||
CudaMatrix conj(const CudaMatrix& aMatrix);
|
CudaMatrix conj(const CudaMatrix& aMatrix);
|
||||||
|
|
||||||
|
CudaMatrix conj(CudaMatrix&& aMatrix);
|
||||||
|
|
||||||
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
|
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
|
||||||
|
|
||||||
CudaMatrix transpose(const CudaMatrix& aMatrix);
|
CudaMatrix transpose(const CudaMatrix& aMatrix);
|
||||||
|
|||||||
@@ -39,6 +39,9 @@ using namespace Aurora;
|
|||||||
// File-local constants (internal linkage via the unnamed namespace).
namespace
{
    // Block size used for kernel launches in this file.
    constexpr int THREADS_PER_BLOCK = 256;

    // Direction selectors passed as the second argument of ExecFFT.
    constexpr int FFT_FORWARD = 0;
    constexpr int FFT_BACKWARD = 1;
}
|
||||||
|
|
||||||
__global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize)
|
__global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize)
|
||||||
@@ -1454,7 +1457,7 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize)
|
|||||||
}
|
}
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
|
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
|
||||||
ExecFFT(ret, 0);
|
ExecFFT(ret, FFT_FORWARD);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1475,16 +1478,34 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize)
|
|||||||
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
|
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
|
auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
|
||||||
ExecFFT(ret, 1);
|
ExecFFT(ret, FFT_BACKWARD);
|
||||||
float colEleCountf = 1.f / ColEleCount;
|
float colEleCountf = 1.f / ColEleCount;
|
||||||
auto lambda = [=] __device__(const float &v)
|
auto lambda = [=] __device__(const float &v)
|
||||||
{
|
{
|
||||||
return v * colEleCountf;
|
return v * colEleCountf;
|
||||||
};
|
} ;
|
||||||
thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
|
thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Inverse FFT performed directly on a matrix the caller hands over as an rvalue.
 *
 * Runs ExecFFT with FFT_BACKWARD on aMatrix itself, then multiplies every
 * float of the buffer (2 * getDataSize() values) by 1 / getDimSize(0).
 * The same matrix object is handed back; this function requests no extra buffer.
 *
 * @param aMatrix complex matrix to transform; its contents are overwritten
 * @return the transformed matrix, or an empty CudaMatrix (after logging to
 *         std::cerr) when the input is not complex
 */
CudaMatrix Aurora::ifft(CudaMatrix &&aMatrix)
{
    if (!aMatrix.isComplex())
    {
        std::cerr << "ifft input must be complex value" << std::endl;
        return CudaMatrix();
    }

    size_t ColEleCount = aMatrix.getDimSize(0);
    ExecFFT(aMatrix, FFT_BACKWARD);

    // Scale by the reciprocal of the first-dimension length after the backward transform.
    float invColEleCount = 1.f / ColEleCount;
    auto scaleByInvLength = [=] __device__(const float &v)
    {
        return v * invColEleCount;
    };
    // Each complex element is stored as two floats, hence getDataSize() * 2.
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize() * 2, aMatrix.getData(), scaleByInvLength);

    // aMatrix is a *named* rvalue reference, i.e. an lvalue expression: a plain
    // `return aMatrix;` selects the copy constructor before C++20. Cast back to
    // an rvalue (what std::move does) so the result is move-constructed.
    return static_cast<CudaMatrix &&>(aMatrix);
}
|
||||||
|
|
||||||
__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount)
|
__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount)
|
||||||
{
|
{
|
||||||
unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
|
unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ namespace Aurora
|
|||||||
|
|
||||||
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
||||||
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
||||||
|
CudaMatrix ifft(CudaMatrix && aMatrix);
|
||||||
|
|
||||||
CudaMatrix hilbert(const CudaMatrix &aMatrix);
|
CudaMatrix hilbert(const CudaMatrix &aMatrix);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user