Add cuda dot and unit test.

This commit is contained in:
sunwen
2023-12-08 16:55:03 +08:00
parent 0229a132b2
commit 0082800112
3 changed files with 126 additions and 0 deletions

View File

@@ -1054,6 +1054,110 @@ CudaMatrix Aurora::inv(const CudaMatrix &aMatrix)
return CudaMatrix::fromRawData(data, n, n);
}
CudaMatrix Aurora::inv(CudaMatrix &&aMatrix)
{
    // Rvalue overload of inv(): computes the inverse of a square 2D float
    // matrix on the GPU via cuBLAS batched LU factorization (batch count 1).
    // On argument errors the input matrix is returned unchanged.
    if (aMatrix.getDims() != 2)
    {
        std::cerr << "Fail! cuda inv args must be 2d matrix!";
        return aMatrix;
    }
    if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1))
    {
        std::cerr << "Fail! cuda inv args must be square matrix!";
        return aMatrix;
    }
    if (aMatrix.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda inv args must be normal value type!";
        return aMatrix;
    }
    unsigned int n = aMatrix.getDimSize(0);
    unsigned int size = aMatrix.getDataSize();
    cublasHandle_t handle;
    cublasCreate(&handle);
    // Output buffer holding the inverse; ownership transfers to the returned
    // CudaMatrix via fromRawData().
    float* data;
    cudaMalloc((void**)&data, sizeof(float) * size);
    // cublasSgetr{f,i}Batched expect *device* arrays of device pointers;
    // invKernel fills these single-entry pointer arrays on the device.
    // (Renamed from the misspelled "deviceInputPinter".)
    float** deviceInputPointer;
    cudaMalloc((void**)&deviceInputPointer, sizeof(float *));
    float** deviceOutputPointer;
    cudaMalloc((void**)&deviceOutputPointer, sizeof(float *));
    invKernel<<<1, 1>>>(deviceInputPointer, aMatrix.getData(), deviceOutputPointer, data);
    cudaDeviceSynchronize();
    int* devicePivotArray;
    cudaMalloc((void**)&devicePivotArray, n * sizeof(int));
    int* deviceInfoArray;
    cudaMalloc((void**)&deviceInfoArray, sizeof(int));
    // LU factorization, then inversion from the LU factors. Check the cuBLAS
    // status codes instead of silently discarding them.
    cublasStatus_t status = cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
    if (status == CUBLAS_STATUS_SUCCESS)
    {
        status = cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1);
    }
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "Fail! cuda inv cublas call failed!";
    }
    // The cuBLAS calls are asynchronous: wait for them to finish before
    // freeing the pointer arrays they read and handing out the result.
    cudaDeviceSynchronize();
    cudaFree(devicePivotArray);
    cudaFree(deviceInfoArray);
    cudaFree(deviceOutputPointer);
    cudaFree(deviceInputPointer);
    cublasDestroy(handle);
    return CudaMatrix::fromRawData(data, n, n);
}
/**
 * @brief One dot product per block: block b reduces
 *        sum over r of aInputData1[b*aInputRowSize + r] * aInputData2[b*aInputRowSize + r]
 *        and writes the scalar to aOutputData[b].
 *
 * Expected launch layout: <<<columnCount, THREADS_PER_BLOCK>>>.
 * The tree reduction assumes blockDim.x (THREADS_PER_BLOCK) is a power of two.
 * Shared memory: THREADS_PER_BLOCK floats (static).
 */
__global__ void dotColumnKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];
    // Each thread accumulates a block-strided partial sum over its column.
    sharedValue[threadIdx.x] = 0;
    for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i)
    {
        unsigned int indexByRows = i * blockDim.x + threadIdx.x;
        if (indexByRows < aInputRowSize)
        {
            sharedValue[threadIdx.x] += aInputData1[blockIdx.x * aInputRowSize + indexByRows]
                                      * aInputData2[blockIdx.x * aInputRowSize + indexByRows];
        }
    }
    __syncthreads();
    // Standard shared-memory tree reduction (power-of-two blockDim).
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1)
    {
        if (threadIdx.x < i)
        {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
        }
        __syncthreads();
    }
    // Fix: only thread 0 publishes the result. Previously every thread in
    // the block issued the same global store — a benign same-value race that
    // wastes bandwidth and is flagged by compute-sanitizer racecheck.
    if (threadIdx.x == 0)
    {
        aOutputData[blockIdx.x] = sharedValue[0];
    }
}
/**
 * @brief Column-wise dot product of two equally-shaped device matrices.
 *
 * Returns a 1 x column matrix where entry c is the dot product of column c
 * of aMatrix with column c of aOther. A vector input (one row or one
 * column) is treated as a single column, yielding a 1x1 result.
 * Complex-valued inputs and non-Column directions are rejected with an
 * error message and an empty matrix.
 *
 * NOTE(review): inputs with more than 2 dims are not explicitly rejected
 * here — only shape equality is enforced via compareShape(); confirm
 * callers never pass >2D matrices.
 */
CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction)
{
    if ( direction != Aurora::Column)
    {
        std::cerr<< "cuda dot() only support column data!"<< std::endl;
        return CudaMatrix();
    }
    if (!aMatrix.compareShape(aOther))
    {
        std::cerr<< "cuda dot() matrix must be same shape!"<< std::endl;
        return CudaMatrix();
    }
    if(aMatrix.getValueType() == Aurora::Complex || aOther.getValueType() == Aurora::Complex)
    {
        std::cerr<< "cuda dot() do not support complex data!"<< std::endl;
        return CudaMatrix();
    }
    unsigned int column = aMatrix.getDimSize(1);
    unsigned int row = aMatrix.getDimSize(0);
    // A row or column vector is collapsed to a single logical column.
    if (aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
    {
        column = 1;
        row = aMatrix.getDataSize();
    }
    // One float per column; ownership transfers to the returned CudaMatrix.
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * column);
    // One block per column; the kernel requires THREADS_PER_BLOCK to be a
    // power of two for its tree reduction.
    dotColumnKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aOther.getData(), data, row);
    // Fix: surface launch-configuration errors, which a bare launch
    // silently swallows.
    cudaError_t launchError = cudaGetLastError();
    if (launchError != cudaSuccess)
    {
        std::cerr << "cuda dot() kernel launch failed: " << cudaGetErrorString(launchError) << std::endl;
    }
    cudaDeviceSynchronize();
    return CudaMatrix::fromRawData(data, 1, column);
}
/**
* @brief

View File

@@ -45,6 +45,8 @@ namespace Aurora
CudaMatrix inv(CudaMatrix &&aMatrix);
CudaMatrix dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction = Column);
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);

View File

@@ -771,4 +771,24 @@ TEST_F(Function2D_Cuda_Test, inv) {
{
EXPECT_FLOAT_AE(result1[i], result2[i]);
}
}
// Verifies the CUDA dot() against the host implementation on a 1000x200
// matrix filled with a deterministic integer ramp.
TEST_F(Function2D_Cuda_Test, dot) {
    auto matrixHost1 = Aurora::Matrix::fromRawData(new float[200000], 1000, 200);
    auto matrixHost2 = Aurora::Matrix::fromRawData(new float[200000], 1000, 200);
    // Values stay below 2^24, so each is exactly representable as float.
    for (unsigned int i = 0; i < 200000; ++i)
    {
        matrixHost1[i] = i;
        matrixHost2[i] = i + 1;
    }
    auto matrixDevice1 = matrixHost1.toDeviceMatrix();
    Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix();
    auto result1 = Aurora::dot(matrixHost1, matrixHost2);
    auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix();
    // Fix: sizes are integral — compare with ASSERT_EQ, not ASSERT_FLOAT_EQ;
    // also dropped a stray debug std::cout of the size.
    ASSERT_EQ(result1.getDataSize(), result2.getDataSize());
    for (size_t i = 0; i < result1.getDataSize(); i++)
    {
        EXPECT_FLOAT_AE(result1[i], result2[i]);
    }
}