From e3abe9fabe96d544afad609579cf208b0237ecdb Mon Sep 17 00:00:00 2001 From: sunwen Date: Mon, 30 Oct 2023 10:28:24 +0800 Subject: [PATCH] Add CudaMatrix. --- CMakeLists.txt | 20 ++++ src/CudaMatrix.cpp | 238 ++++++++++++++++++++++++++++++++++++++++++ src/CudaMatrix.h | 239 +++++++++++++++++++++++++++++++++++++++++++ src/MatlabReader.cpp | 5 + src/Matrix.cpp | 23 ++--- src/Matrix.h | 5 +- 6 files changed, 510 insertions(+), 20 deletions(-) create mode 100644 src/CudaMatrix.cpp create mode 100644 src/CudaMatrix.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 0136edf..835a70a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,11 @@ project(Aurora) set(CMAKE_CXX_STANDARD 14) set(CMAKE_INCLUDE_CURRENT_DIR ON) + +set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) +enable_language(CUDA) +find_package(CUDAToolkit REQUIRED) + find_package (OpenMP REQUIRED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") @@ -25,6 +30,14 @@ target_include_directories(Aurora PUBLIC $) target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(Aurora PUBLIC matio) + +target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include) +set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +target_compile_options(Aurora PRIVATE $<$: + -arch=sm_75 +>) +target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) + find_package(GTest REQUIRED) INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS}) include_directories(./src/util) @@ -41,5 +54,12 @@ target_link_libraries(Aurora_Test PUBLIC $) target_link_libraries(Aurora_Test PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(Aurora_Test PUBLIC matio) target_link_libraries(Aurora_Test PUBLIC ${GTEST_BOTH_LIBRARIES} ) + +target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include) +set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +target_compile_options(Aurora_Test PRIVATE $<$: + -arch=sm_75 +>) +target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) gtest_discover_tests(Aurora_Test ) #target_link_libraries(CreateMatchedFilter PRIVATE TBB::tbb) \ No newline at end of file diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp new file mode 100644 index 0000000..682da72 --- /dev/null +++ b/src/CudaMatrix.cpp @@ -0,0 +1,238 @@ +#include "CudaMatrix.h" + +#include "Function.h" +#include "Matrix.h" + +#include +#include +#include + +using namespace Aurora; + +CudaMatrix::CudaMatrix(std::shared_ptr aData, std::vector aInfo, ValueType aValueType) + : mValueType(aValueType) + , mData(aData) + , mInfo(aInfo) +{ + size_t infoSize = mInfo.size(); + for(; infoSize<3; ++infoSize) + { + mInfo.push_back(1); + } +} + +bool CudaMatrix::isNull() const +{ + return !mData || mInfo.empty(); +} + +bool CudaMatrix::isNan() const +{ + for(size_t i=0; i1) return false; + if (isScalar()) return false; + return getDimSize(0) == 1 || + getDimSize(1) == 1; +} + +int CudaMatrix::getDims() const +{ + if(mInfo[2] > 1) + { + return 3; + } + return 2; +} + +float *CudaMatrix::getData() const +{ + return mData.get(); +} + +int CudaMatrix::getDimSize(int aIndex) const +{ + if (aIndex >= 0 && aIndex < 3) { + return mInfo.at(aIndex); + } + return 0; +} + +size_t CudaMatrix::getDataSize() const +{ + if (!mData.get())return 0; + size_t ret = 1; + for (auto v: mInfo) { + ret *= v; + } + return ret; +} + +void CudaMatrix::forceReshape(int rows, int columns, int slices) +{ + mInfo = {rows,columns,slices}; +} + +bool CudaMatrix::compareShape(const CudaMatrix &other) const +{ + if (mInfo[2] == 1 && other.mInfo[2] == 1) { + if (mInfo[0]==1 && other.mInfo[1] == 1 && mInfo[1] == other.mInfo[0]) return true; + if (mInfo[1]==1 && other.mInfo[0] == 1 && mInfo[0] == other.mInfo[1]) return true; + } + for (int i = 0; i < mInfo.size(); ++i) { + if (mInfo[i] != other.mInfo[i]) return false; + } + return true; +} + +CudaMatrix CudaMatrix::fromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType) +{ + if (!aData) + { + return CudaMatrix(); + } + std::vector vector{aRows, aCols, aSlices}; + CudaMatrix ret({aData, gpuFree}, vector, aType); + return ret; +} + +CudaMatrix CudaMatrix::copyFromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType) +{ + if (!aData) + { + return CudaMatrix(); + } + float* data = nullptr; + unsigned long long size = aRows * aCols * aSlices * aType; + cudaMalloc((void**)&data, sizeof(float) * size); + cudaMemcpy(data, aData, sizeof(float) * size, cudaMemcpyDeviceToDevice); + std::vector vector{aRows, aCols, aSlices}; + return CudaMatrix({data, gpuFree}, vector, aType); +} + +CudaMatrix CudaMatrix::deepCopy() const +{ + float* data = nullptr; + unsigned long long size = getDataSize() * getValueType(); + cudaMalloc((void**)&data, sizeof(float) * size); + cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToDevice); + return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType()); +} + +Matrix CudaMatrix::toHostMatrix() const +{ + unsigned long long size = getDataSize() * getValueType(); + float* data = new float[size]; + cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost); + return Matrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType()); +} + +CudaMatrix CudaMatrix::block(int aDim,int aBeginIndex, int aEndIndex) const +{ + if(aDim > 2) + { + std::cerr<<"CudaMatrix block only support 1D-3D data!"<1) + { + aDim = 1; + } + + if (aBeginIndex>=getDimSize(aDim) || aBeginIndex<0) + { + std::cerr<<"CudaMatrix block BeginIndx error!BeginIndx:"<=getDimSize(aDim) || aEndIndex<0) + { + std::cerr<<"CudaMatrix block EndIndex error!EndIndex:"<2 ) + { + std::cerr<<"CudaMatrix block only support 1D-3D data!"< aData = std::shared_ptr(), + std::vector aInfo = std::vector(), + ValueType aValueType = Normal); + + /** + * Create from a Raw data(float array). + * Use Raw data which create like new float[size]() as a data source + * and the share_ptr's deleter will be std::default_delete + * @param data + * @param rows + * @param cols + * @param slices + * @param type + * @return + */ + static CudaMatrix fromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal); + + /** + * Create from a Raw data(float array) with copy the data to a new mkl memory. + * @param data + * @param rows + * @param cols + * @param slices + * @param type + * @return + */ + static CudaMatrix copyFromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal); + + /** + * 深拷贝操作 + * @return 深拷贝的Matrix对象 + */ + CudaMatrix deepCopy() const; + + // add + CudaMatrix operator+(float aScalar) const; + friend CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix); + friend CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix); + friend CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar); + CudaMatrix operator+(const CudaMatrix &aMatrix) const; + CudaMatrix operator+(CudaMatrix &&aMatrix) const; + friend CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther); + + // sub + CudaMatrix operator-(float aScalar) const; + friend CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix); + friend CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix); + friend CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar); + CudaMatrix operator-(const CudaMatrix &aMatrix) const; + CudaMatrix operator-(CudaMatrix &&aMatrix) const; + friend CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther); + + //negetive + friend CudaMatrix operator-(CudaMatrix &&aMatrix); + friend CudaMatrix operator-(const CudaMatrix &aMatrix); + + // mul + CudaMatrix operator*(float aScalar) const; + friend CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix); + friend CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix); + friend CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar); + CudaMatrix operator*(const CudaMatrix &aMatrix) const; + CudaMatrix operator*(CudaMatrix &&aMatrix) const; + friend CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther); + + // div + CudaMatrix operator/(float aScalar) const; + friend CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix); + friend CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix); + friend CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar); + CudaMatrix operator/(const CudaMatrix &aMatrix) const; + CudaMatrix operator/(CudaMatrix &&aMatrix) const; + friend CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther); + + // pow + CudaMatrix operator^(int times) const; + friend CudaMatrix operator^(CudaMatrix &&aMatrix,int times); + + //compare + CudaMatrix operator>(float aScalar) const; + friend CudaMatrix operator>(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator>(const CudaMatrix &aMatrix) const; + + CudaMatrix operator<(float aScalar) const; + friend CudaMatrix operator<(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator<(const CudaMatrix &aMatrix) const; + + CudaMatrix operator>=(float aScalar) const; + friend CudaMatrix operator>=(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator>=(const CudaMatrix &aMatrix) const; + + CudaMatrix operator<=(float aScalar) const; + friend CudaMatrix operator<=(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator<=(const CudaMatrix &aMatrix) const; + + CudaMatrix operator==(float aScalar) const; + friend CudaMatrix operator==(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator==(const CudaMatrix &aMatrix) const; + + CudaMatrix operator!=(float aScalar) const; + friend CudaMatrix operator!=(float aScalar, const CudaMatrix &aMatrix); + CudaMatrix operator!=(const CudaMatrix &aMatrix) const; + + // sub + float& operator[](size_t index); + float operator[](size_t index) const; + + /** + * 切块操作 + * + * @param aDim 需要切块的维度, + * @param aBeginIndx 起始索引,包含 + * @param aEndIndex 终止索引,包含 + * @return Matrix 返回矩阵 + */ + CudaMatrix block(int aDim,int aBeginIndx, int aEndIndex) const; + + bool setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value); + bool setBlockComplexValue(int aDim,int aBeginIndx, int aEndIndex,std::complex value); + + + bool setBlock(int aDim,int aBeginIndx, int aEndIndex,const CudaMatrix& src); + + /** + * 矩阵乘法 + * @attention 目前只支持矩阵乘向量 + * @param aOther + * @return Matrix + */ + CudaMatrix mul(const CudaMatrix& aOther) const; + + /** + * 矩阵乘法 + * @attention 目前只支持矩阵乘向量 + * @param aOther + * @return Matrix + */ + CudaMatrix mul(CudaMatrix&& aOther) const; + + /** + * print matrix , only support 2d matrix now + */ + void printf(); + + void printfShape(); + + bool isScalar() const; + + float getScalar() const; + + /** + * Get is the Matrix's data is empty or size is zero. + * @return + */ + bool isNull() const; + + bool isNan() const; + + bool isVector() const; + + bool isMKLAllocated() const; + + /** + * Get dimension count of the matrix + * @return dimension count + */ + int getDims() const; + + float *getData() const; + + int getDimSize(int index) const; + + /** + * Compare matrix shape + * @param other matrix + * @return is identity + */ + bool compareShape(const CudaMatrix& other) const; + + /** + * Get the data size + * @return the unit count of the matrix + */ + size_t getDataSize() const; + + /** + * Get Value type as normal and complex, + * complex use std::complex, + * it's contains two float value. + * @return + */ + ValueType getValueType() const { + return mValueType; + } + + /** + * Set Value type as normal and complex, + * @return + */ + void setValueType(ValueType aValueType) { + mValueType = aValueType; + } + + /** + * return true if the valueType is complex, + * @return + */ + bool isComplex() const { + if(mValueType == Complex) + { + return true; + } + return false; + } + + void forceReshape(int rows, int columns, int slices); + + Matrix toHostMatrix() const; + + + private: + ValueType mValueType = Normal; + std::shared_ptr mData; + std::vector mInfo; + }; +} + +#endif diff --git a/src/MatlabReader.cpp b/src/MatlabReader.cpp index f6fba2c..606fe42 100644 --- a/src/MatlabReader.cpp +++ b/src/MatlabReader.cpp @@ -92,6 +92,11 @@ Aurora::Matrix MatlabReader::read(const std::string& aFieldName) std::copy((float*)matvar->data, (float*)matvar->data + matrixSize, matrixData); break; } + case MAT_T_DOUBLE: + { + std::copy((double*)matvar->data, (double*)matvar->data + matrixSize, matrixData); + break; + } } result = Aurora::Matrix::fromRawData(matrixData,rows,columns,slices); } diff --git a/src/Matrix.cpp b/src/Matrix.cpp index adccf22..cee197a 100644 --- a/src/Matrix.cpp +++ b/src/Matrix.cpp @@ -1,4 +1,5 @@ #include "Matrix.h" +#include "CudaMatrix.h" #include #include @@ -393,24 +394,12 @@ namespace Aurora { return ret; } - Matrix Matrix::NewCuda(float *data, int rows, int cols, int slices, ValueType type) + CudaMatrix Matrix::toDeviceMatrix() const { - if (!data) return Matrix(); - std::vector vector(3); - vector[0]=rows; - vector[1] = (cols > 0?cols:1); - vector[2] = (slices > 0?slices:1); - Matrix ret({data, gpuFree}, vector); - if (type != Normal)ret.setValueType(type); - ret.mCuda_Allocated = true; - return ret; - } - - Matrix Matrix::toHostMatrix() const - { - float* data = new float[getDataSize()]; - cudaMemcpy(data, mData.get(), sizeof(float) * getDataSize() * (mValueType == Normal ? 1 : 2), cudaMemcpyDeviceToHost); - return fromRawData(data, mInfo[0], mInfo[1], mInfo[2], getValueType()); + float* deviceData = nullptr; + cudaMalloc((void**)&deviceData, sizeof(float) * getDataSize() * getValueType() ); + cudaMemcpy(deviceData, mData.get(), sizeof(float) * getDataSize() * getValueType(), cudaMemcpyHostToDevice); + return CudaMatrix::fromRawData(deviceData, mInfo[0], mInfo[1], mInfo[2], getValueType()); } Matrix Matrix::New(float *data, const Matrix &shapeMatrix) { diff --git a/src/Matrix.h b/src/Matrix.h index e2116e6..1a59098 100644 --- a/src/Matrix.h +++ b/src/Matrix.h @@ -12,6 +12,7 @@ namespace Aurora { Complex }; const int $ = -1; + class CudaMatrix; class Matrix { public: @@ -87,7 +88,6 @@ namespace Aurora { */ static Matrix New(float *data, const Matrix &shapeMatrix); - static Matrix NewCuda(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal); /** * New a mkl calculate based Matrix * @attention Memory are allocate by Aurora:malloc function @@ -288,7 +288,7 @@ namespace Aurora { void forceReshape(int rows, int columns, int slices); - Matrix toHostMatrix() const; + CudaMatrix toDeviceMatrix() const; private: @@ -296,7 +296,6 @@ namespace Aurora { std::shared_ptr mData; std::vector mInfo; bool mMKL_Allocated = false; - bool mCuda_Allocated = false; }; }