Add prod and unittest.

This commit is contained in:
sunwen
2023-12-11 15:01:24 +08:00
parent 8b58d05d90
commit bd4a27a17b
3 changed files with 155 additions and 1 deletions

View File

@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
return CudaMatrix::fromRawData(data, 1, column);
}
__global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
__shared__ float sharedValue[THREADS_PER_BLOCK];
sharedValue[threadIdx.x] = 1;
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
{
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
if(indexByRows < aInputRowSize)
{
sharedValue[threadIdx.x] *= aInputData[blockIdx.x*aInputRowSize + indexByRows];
}
}
__syncthreads();
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
{
if(threadIdx.x < i)
{
sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i];
}
__syncthreads();
}
if(threadIdx.x == 0)
{
aOutputData[blockIdx.x] = sharedValue[0];
}
}
__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
__shared__ float sharedValue[THREADS_PER_BLOCK * 2];
unsigned int complexIdx = threadIdx.x * 2;
sharedValue[complexIdx] = 1;
sharedValue[complexIdx + 1] = 0;
for(unsigned int i=0; i<=(aInputRowSize/blockDim.x); ++i)
{
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
if(indexByRows < aInputRowSize)
{
unsigned int index = 2 * (blockIdx.x*aInputRowSize + indexByRows);
float real = sharedValue[complexIdx] * aInputData[index] - sharedValue[complexIdx + 1] * aInputData[index + 1];
float imag = sharedValue[complexIdx] * aInputData[index + 1] + sharedValue[complexIdx + 1] * aInputData[index];
sharedValue[complexIdx] = real;
sharedValue[complexIdx + 1] = imag;
}
}
__syncthreads();
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
{
if(threadIdx.x < i)
{
unsigned int index = 2 * (threadIdx.x + i);
float real = sharedValue[complexIdx] * sharedValue[index] - sharedValue[complexIdx + 1] * sharedValue[index + 1];
float imag = sharedValue[complexIdx] * sharedValue[index + 1] + sharedValue[complexIdx + 1] * sharedValue[index];
sharedValue[complexIdx] = real;
sharedValue[complexIdx + 1] = imag;
}
__syncthreads();
}
if(threadIdx.x == 0)
{
aOutputData[2 * blockIdx.x] = sharedValue[0];
aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
}
}
CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
{
if (aMatrix.getDimSize(2) > 1 )
{
std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
return CudaMatrix();
}
unsigned int row = aMatrix.getDimSize(0);
unsigned int column = aMatrix.getDimSize(1);
if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
{
column = 1;
row = aMatrix.getDataSize();
}
float* data = nullptr;
cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
if(aMatrix.isComplex())
{
prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
}
else
{
prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
}
cudaDeviceSynchronize();
return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
}
/**
* @brief
*

View File

@@ -47,6 +47,13 @@ namespace Aurora
CudaMatrix dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction = Column);
/**
* prod支持到2维
* @param aMatrix
* @return
*/
CudaMatrix prod(const CudaMatrix &aMatrix);
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);

View File

@@ -807,10 +807,55 @@ TEST_F(Function2D_Cuda_Test, dot) {
Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix();
auto result1 = Aurora::dot(matrixHost1, matrixHost2);
auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix();
std::cout<< result1.getDataSize();
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
for (size_t i = 0; i < result1.getDataSize(); i++)
{
EXPECT_FLOAT_AE(result1[i], result2[i]);
}
}
TEST_F(Function2D_Cuda_Test, prod) {
auto matrixHost = Aurora::Matrix::fromRawData(new float[20], 4,5);
for(unsigned int i=0; i<20;++i)
{
matrixHost[i] = i + 1;
}
auto matrixDevice = matrixHost.toDeviceMatrix();
auto result1 = Aurora::prod(matrixHost);
auto result2 = Aurora::prod(matrixDevice).toHostMatrix();
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
for (size_t i = 0; i < result1.getDataSize(); i++)
{
EXPECT_FLOAT_AE(result1[i], result2[i]);
}
matrixHost = Aurora::Matrix::fromRawData(new float[20], 20,1);
for(unsigned int i=0; i<20;++i)
{
matrixHost[i] = i + 1;
}
matrixDevice = matrixHost.toDeviceMatrix();
result1 = Aurora::prod(matrixHost);
result2 = Aurora::prod(matrixDevice).toHostMatrix();
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
for (size_t i = 0; i < result1.getDataSize(); i++)
{
EXPECT_FLOAT_AE(result1[i], result2[i]);
}
auto matrixHostComplex = Aurora::Matrix::fromRawData(new float[40], 4,5, 1,Aurora::Complex);
for(unsigned int i=0; i<40;++i)
{
matrixHost[i] = i + 1;
}
matrixDevice = matrixHostComplex.toDeviceMatrix();
result1 = Aurora::prod(matrixHostComplex);
result2 = Aurora::prod(matrixDevice).toHostMatrix();
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
ASSERT_FLOAT_EQ(result1.getValueType(), result2.getValueType());
for (size_t i = 0; i < result1.getDataSize() * result1.getValueType(); i++)
{
EXPECT_FLOAT_AE(result1[i], result2[i]);
}
}