Add cuda inv and unittest.
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
#include <Function2D.cuh>
|
||||
#include <cfloat>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include <cmath>
|
||||
@@ -19,6 +20,8 @@
|
||||
#include <thrust/functional.h>
|
||||
#include <thrust/complex.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cusolverDn.h>
|
||||
#include "Function1D.cuh"
|
||||
#include "Matrix.h"
|
||||
|
||||
@@ -984,3 +987,60 @@ CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix)
|
||||
|
||||
return transpose(CudaMatrix::fromRawData(data, rows, columns));
|
||||
}
|
||||
|
||||
__global__ void invKernel(float** aInputPointer, float* aInputData, float** aOutputPointer, float* aOutputData)
|
||||
{
|
||||
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (idx == 0)
|
||||
{
|
||||
aInputPointer[0] = aInputData;
|
||||
aOutputPointer[0] = aOutputData;
|
||||
}
|
||||
}
|
||||
|
||||
CudaMatrix Aurora::inv(const CudaMatrix &aMatrix)
|
||||
{
|
||||
if (aMatrix.getDims() != 2)
|
||||
{
|
||||
std::cerr << "Fail! cuda inv args must be 2d matrix!";
|
||||
return aMatrix;
|
||||
}
|
||||
if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1))
|
||||
{
|
||||
std::cerr << "Fail! cuda inv args must be square matrix!";
|
||||
return aMatrix;
|
||||
}
|
||||
if (aMatrix.getValueType() != Normal)
|
||||
{
|
||||
std::cerr << "Fail! cuda inv args must be normal value type!";
|
||||
return aMatrix;
|
||||
}
|
||||
unsigned int n = aMatrix.getDimSize(0);
|
||||
unsigned int size = aMatrix.getDataSize();
|
||||
cublasHandle_t handle;
|
||||
cublasCreate(&handle);
|
||||
float* data;
|
||||
cudaMalloc((void**)&data, sizeof(float) * size);
|
||||
|
||||
float** deviceInputPinter;
|
||||
cudaMalloc((void**)&deviceInputPinter, sizeof(float *));
|
||||
|
||||
float **deviceOutputPointer;
|
||||
cudaMalloc((void**)&deviceOutputPointer, sizeof(float *));
|
||||
|
||||
invKernel<<<1, 1>>>(deviceInputPinter, aMatrix.getData(), deviceOutputPointer, data);
|
||||
cudaDeviceSynchronize();
|
||||
int* devicePivotArray;
|
||||
cudaMalloc((void**)&devicePivotArray, n * sizeof(int));
|
||||
int* deviceInfoArray;
|
||||
cudaMalloc((void**)&deviceInfoArray, sizeof(int));
|
||||
cublasSgetrfBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceInfoArray, 1);
|
||||
cublasSgetriBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1);
|
||||
cudaFree(devicePivotArray);
|
||||
cudaFree(deviceInfoArray);
|
||||
cudaFree(deviceOutputPointer);
|
||||
cudaFree(deviceInputPinter);
|
||||
cublasDestroy(handle);
|
||||
|
||||
return CudaMatrix::fromRawData(data, n, n);
|
||||
}
|
||||
|
||||
@@ -40,6 +40,10 @@ namespace Aurora
|
||||
*/
|
||||
CudaMatrix sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix);
|
||||
|
||||
CudaMatrix inv(const CudaMatrix &aMatrix);
|
||||
|
||||
CudaMatrix inv(CudaMatrix &&aMatrix);
|
||||
|
||||
}
|
||||
|
||||
#endif // __FUNCTION2D_CUDA_H__
|
||||
@@ -646,3 +646,15 @@ TEST_F(Function2D_Cuda_Test, sortRows) {
|
||||
ASSERT_FLOAT_EQ(result3[i], result4[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(Function2D_Cuda_Test, inv) {
|
||||
auto matrixHost = Aurora::Matrix::fromRawData(new float[16]{4,6,7,8,9,3,7,5,4,3,2,1,2,3,4,5}, 4,4);
|
||||
auto matrixDevice = matrixHost.toDeviceMatrix();
|
||||
auto result1 = Aurora::inv(matrixHost);
|
||||
auto result2 = Aurora::inv(matrixDevice).toHostMatrix();
|
||||
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
|
||||
for (size_t i = 0; i < result1.getDataSize(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_AE(result1[i], result2[i]);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user