/**
 * @brief Rvalue overload of inv(): inverts a square, Normal-valued 2-D matrix
 *        on the GPU using cuBLAS batched LU factorization (Sgetrf) followed by
 *        batched inversion (Sgetri), with a batch count of 1.
 *
 * Preconditions (checked, logged to std::cerr on failure): the input must be
 * 2-D, square, and of Normal (non-complex) value type; on violation the
 * (moved-from) input is handed straight back.
 *
 * @param aMatrix  square input matrix (consumed; rvalue overload).
 * @return a new CudaMatrix owning the inverse, or the unchanged input on
 *         argument errors.
 */
CudaMatrix Aurora::inv(CudaMatrix &&aMatrix)
{
    if (aMatrix.getDims() != 2)
    {
        std::cerr << "Fail! cuda inv args must be 2d matrix!";
        return std::move(aMatrix); // avoid a deep copy of the rvalue parameter
    }
    if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1))
    {
        std::cerr << "Fail! cuda inv args must be square matrix!";
        return std::move(aMatrix);
    }
    if (aMatrix.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda inv args must be normal value type!";
        return std::move(aMatrix);
    }
    unsigned int n = aMatrix.getDimSize(0);
    unsigned int size = aMatrix.getDataSize();
    cublasHandle_t handle;
    cublasCreate(&handle);

    // Output buffer; ownership is transferred to the returned CudaMatrix.
    float* data;
    cudaMalloc((void**)&data, sizeof(float) * size);

    // Batched cuBLAS APIs take device arrays of pointers (batch size 1 here);
    // invKernel fills them with aMatrix.getData() / data respectively.
    float** deviceInputPointer;
    cudaMalloc((void**)&deviceInputPointer, sizeof(float *));
    float** deviceOutputPointer;
    cudaMalloc((void**)&deviceOutputPointer, sizeof(float *));

    invKernel<<<1, 1>>>(deviceInputPointer, aMatrix.getData(), deviceOutputPointer, data);
    cudaDeviceSynchronize();

    int* devicePivotArray;
    cudaMalloc((void**)&devicePivotArray, n * sizeof(int));
    int* deviceInfoArray;
    cudaMalloc((void**)&deviceInfoArray, sizeof(int));

    // LU factorization in place, then inversion into the output buffer.
    cublasStatus_t status = cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "Fail! cuda inv cublasSgetrfBatched error: " << status << std::endl;
    }
    status = cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        std::cerr << "Fail! cuda inv cublasSgetriBatched error: " << status << std::endl;
    }

    cudaFree(devicePivotArray);
    cudaFree(deviceInfoArray);
    cudaFree(deviceOutputPointer);
    cudaFree(deviceInputPointer);
    cublasDestroy(handle);

    return CudaMatrix::fromRawData(data, n, n);
}

/**
 * @brief Per-column dot product of two equally shaped matrices stored with
 *        contiguous columns of length aInputRowSize.
 *
 * Launch layout: one block per column (gridDim.x == column count), with
 * blockDim.x == THREADS_PER_BLOCK threads performing a strided accumulation
 * followed by a shared-memory tree reduction. blockDim.x must be a power of
 * two (the halving reduction assumes it).
 *
 * @param aInputData1   first input, device pointer.
 * @param aInputData2   second input, device pointer.
 * @param aOutputData   output, one float per column (per block).
 * @param aInputRowSize number of elements in each column.
 */
__global__ void dotColumnKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];
    sharedValue[threadIdx.x] = 0;

    // Each thread accumulates a strided slice of this block's column.
    for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i)
    {
        unsigned int indexByRows = i * blockDim.x + threadIdx.x;
        if (indexByRows < aInputRowSize)
        {
            sharedValue[threadIdx.x] += aInputData1[blockIdx.x * aInputRowSize + indexByRows]
                                      * aInputData2[blockIdx.x * aInputRowSize + indexByRows];
        }
    }
    __syncthreads();

    // Tree reduction in shared memory; barrier is outside the divergent branch.
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1)
    {
        if (threadIdx.x < i)
        {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
        }
        __syncthreads();
    }
    // Only one thread publishes the result (avoids a redundant racy write by
    // every thread of the block).
    if (threadIdx.x == 0)
    {
        aOutputData[blockIdx.x] = sharedValue[0];
    }
}

/**
 * @brief Column-wise dot product of two same-shaped, non-complex matrices on
 *        the GPU. Declared in Function2D.cuh as:
 *        CudaMatrix dot(const CudaMatrix&, const CudaMatrix&, FunctionDirection = Column);
 *
 * A pure row or column vector (either dimension == 1) is treated as a single
 * column, yielding a scalar result.
 *
 * @param aMatrix   first operand.
 * @param aOther    second operand, same shape as aMatrix.
 * @param direction only Aurora::Column is supported.
 * @return 1 x column matrix of per-column dot products, or an empty
 *         CudaMatrix on argument errors (logged to std::cerr).
 */
CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction)
{
    if (direction != Aurora::Column)
    {
        std::cerr << "cuda dot() only support column data!" << std::endl;
        return CudaMatrix();
    }
    if (!aMatrix.compareShape(aOther))
    {
        std::cerr << "cuda dot() matrix must be same shape!" << std::endl;
        return CudaMatrix();
    }
    if (aMatrix.getValueType() == Aurora::Complex || aOther.getValueType() == Aurora::Complex)
    {
        std::cerr << "cuda dot() do not support complex data!" << std::endl;
        return CudaMatrix();
    }

    unsigned int column = aMatrix.getDimSize(1);
    unsigned int row = aMatrix.getDimSize(0);
    if (aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
    {
        // Vector input: collapse to one column holding all elements.
        column = 1;
        row = aMatrix.getDataSize();
    }
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * column);
    // One block per column; the launch configuration was missing ("<<>>") in
    // the previous revision, which does not compile.
    dotColumnKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aOther.getData(), data, row);
    cudaDeviceSynchronize();

    return CudaMatrix::fromRawData(data, 1, column);
}
a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp index c0329b7..f60eba7 100644 --- a/test/Function2D_Cuda_Test.cpp +++ b/test/Function2D_Cuda_Test.cpp @@ -771,4 +771,24 @@ TEST_F(Function2D_Cuda_Test, inv) { { EXPECT_FLOAT_AE(result1[i], result2[i]); } +} + +TEST_F(Function2D_Cuda_Test, dot) { + auto matrixHost1 = Aurora::Matrix::fromRawData(new float[200000], 1000,200); + auto matrixHost2 = Aurora::Matrix::fromRawData(new float[200000], 1000,200); + for(unsigned int i=0; i<200000;++i) + { + matrixHost1[i] = i; + matrixHost2[i] = i + 1; + } + auto matrixDevice1 = matrixHost1.toDeviceMatrix(); + Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix(); + auto result1 = Aurora::dot(matrixHost1, matrixHost2); + auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix(); + std::cout<< result1.getDataSize(); + ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize()); + for (size_t i = 0; i < result1.getDataSize(); i++) + { + EXPECT_FLOAT_AE(result1[i], result2[i]); + } } \ No newline at end of file