Add prod and unittest.

This commit is contained in:
sunwen
2023-12-11 15:01:24 +08:00
parent 8b58d05d90
commit bd4a27a17b
3 changed files with 155 additions and 1 deletions

View File

@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
return CudaMatrix::fromRawData(data, 1, column);
}
/**
 * @brief Per-block product reduction for real-valued data.
 *
 * Launch layout: 1-D grid, 1-D block. Block `blockIdx.x` multiplies the
 * `aInputRowSize` consecutive floats that start at
 * `aInputData[blockIdx.x * aInputRowSize]` and stores the product in
 * `aOutputData[blockIdx.x]`. A segment of length 0 yields 1.
 *
 * Shared memory: THREADS_PER_BLOCK floats (static), so blockDim.x must not
 * exceed THREADS_PER_BLOCK.
 *
 * @param aInputData    input values, gridDim.x * aInputRowSize floats
 * @param aOutputData   output buffer, one float per block
 * @param aInputRowSize number of elements reduced by each block
 */
__global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];

    // Offsets are computed in size_t: blockIdx.x * aInputRowSize does not fit
    // in 32 bits once the input holds 2^32 or more floats.
    const size_t segmentStart = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Each thread multiplies every blockDim.x-th element of the segment,
    // accumulating in a register and publishing the result once.
    float partial = 1.0f;
    for (size_t element = threadIdx.x; element < aInputRowSize; element += blockDim.x)
    {
        partial *= aInputData[segmentStart + element];
    }
    sharedValue[threadIdx.x] = partial;
    __syncthreads();

    // Pairwise reduction in shared memory. The upper half of the active range
    // is folded onto the lower half; rounding the half up keeps the middle
    // slot alive when the active count is odd, so any blockDim.x is handled.
    // For power-of-two block sizes the pairing is t <-> t + blockDim.x/2,
    // t <-> t + blockDim.x/4, ... exactly as a plain halving loop.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x < active / 2)
        {
            sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + half];
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        active = half;
    }

    if (threadIdx.x == 0)
    {
        aOutputData[blockIdx.x] = sharedValue[0];
    }
}
/**
 * @brief Per-block product reduction for interleaved complex data.
 *
 * Every complex element occupies two consecutive floats (real, imaginary).
 * Launch layout: 1-D grid, 1-D block. Block `blockIdx.x` multiplies the
 * `aInputRowSize` consecutive complex elements that start at complex index
 * `blockIdx.x * aInputRowSize` and stores the product in
 * `aOutputData[2 * blockIdx.x]` (real) and `aOutputData[2 * blockIdx.x + 1]`
 * (imaginary). A segment of length 0 yields 1 + 0i.
 *
 * Shared memory: 2 * THREADS_PER_BLOCK floats (static), so blockDim.x must
 * not exceed THREADS_PER_BLOCK.
 *
 * @param aInputData    input values, 2 * gridDim.x * aInputRowSize floats
 * @param aOutputData   output buffer, two floats per block
 * @param aInputRowSize number of complex elements reduced by each block
 */
__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK * 2];
    const unsigned int ownSlot = threadIdx.x * 2;

    // Offsets are computed in size_t: 2 * (blockIdx.x * aInputRowSize + i)
    // does not fit in 32 bits once the input holds 2^31 or more complex elements.
    const size_t segmentStart = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Each thread multiplies every blockDim.x-th element of the segment,
    // accumulating in registers; the multiplicative identity is 1 + 0i.
    float accReal = 1.0f;
    float accImag = 0.0f;
    for (size_t element = threadIdx.x; element < aInputRowSize; element += blockDim.x)
    {
        const size_t offset = 2 * (segmentStart + element);
        const float inReal = aInputData[offset];
        const float inImag = aInputData[offset + 1];
        // (a + bi)(c + di) = (ac - bd) + (ad + bc)i
        const float real = accReal * inReal - accImag * inImag;
        const float imag = accReal * inImag + accImag * inReal;
        accReal = real;
        accImag = imag;
    }
    sharedValue[ownSlot] = accReal;
    sharedValue[ownSlot + 1] = accImag;
    __syncthreads();

    // Pairwise reduction in shared memory. The upper half of the active range
    // is folded onto the lower half; rounding the half up keeps the middle
    // slot alive when the active count is odd, so any blockDim.x is handled.
    // For power-of-two block sizes the pairing is t <-> t + blockDim.x/2,
    // t <-> t + blockDim.x/4, ... exactly as a plain halving loop.
    for (unsigned int active = blockDim.x; active > 1; )
    {
        const unsigned int half = (active + 1) / 2;
        if (threadIdx.x < active / 2)
        {
            const unsigned int partnerSlot = 2 * (threadIdx.x + half);
            const float real = sharedValue[ownSlot] * sharedValue[partnerSlot] - sharedValue[ownSlot + 1] * sharedValue[partnerSlot + 1];
            const float imag = sharedValue[ownSlot] * sharedValue[partnerSlot + 1] + sharedValue[ownSlot + 1] * sharedValue[partnerSlot];
            sharedValue[ownSlot] = real;
            sharedValue[ownSlot + 1] = imag;
        }
        // Barrier is outside the branch so every thread of the block reaches it.
        __syncthreads();
        active = half;
    }

    if (threadIdx.x == 0)
    {
        aOutputData[2 * blockIdx.x] = sharedValue[0];
        aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
    }
}
/**
 * @brief Product of the elements of a 1-D or 2-D matrix, computed on the GPU.
 *
 * When either of the first two dimensions is 1 the input is treated as a
 * vector and all of its elements are multiplied into a single value.
 * Otherwise one product is produced per column (one kernel block per column,
 * each reducing getDimSize(0) consecutive elements).
 *
 * @param aMatrix input matrix; real or complex, third dimension must be <= 1
 * @return a 1 x N matrix of products with the same value type as the input,
 *         or an empty CudaMatrix when the input is 3-D or a CUDA call fails
 *         (a message is written to std::cerr in both cases).
 */
CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
{
    if (aMatrix.getDimSize(2) > 1 )
    {
        std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
        return CudaMatrix();
    }

    unsigned int row = aMatrix.getDimSize(0);
    unsigned int column = aMatrix.getDimSize(1);
    if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
    {
        // Vector input: reduce every element into one output value.
        column = 1;
        row = aMatrix.getDataSize();
    }

    // getValueType() is used as the number of floats per element
    // (presumably 1 for real and 2 for complex -- confirm against the enum).
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
    if (status != cudaSuccess)
    {
        std::cerr<< "cuda prod() cudaMalloc failed: "<< cudaGetErrorString(status)<< std::endl;
        return CudaMatrix();
    }

    // A grid of zero blocks is an invalid launch configuration, so the launch
    // is skipped when there are no columns to reduce.
    if (column > 0)
    {
        if(aMatrix.isComplex())
        {
            prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
        }
        else
        {
            prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
        }
        // Launch-configuration errors are only reported through cudaGetLastError().
        status = cudaGetLastError();
    }

    // Errors raised while the kernel runs surface at the synchronizing call.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (status == cudaSuccess)
    {
        status = syncStatus;
    }
    if (status != cudaSuccess)
    {
        std::cerr<< "cuda prod() kernel execution failed: "<< cudaGetErrorString(status)<< std::endl;
        // The buffer was never handed to a CudaMatrix, so release it here.
        cudaFree(data);
        return CudaMatrix();
    }

    return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
}
/**
* @brief
*