From e3abe9fabe96d544afad609579cf208b0237ecdb Mon Sep 17 00:00:00 2001
From: sunwen <sunwen@realcan.com>
Date: Mon, 30 Oct 2023 10:28:24 +0800
Subject: [PATCH] Add CudaMatrix.

---
 CMakeLists.txt       |  20 ++++
 src/CudaMatrix.cpp   | 238 ++++++++++++++++++++++++++++++++++++++++++
 src/CudaMatrix.h     | 239 +++++++++++++++++++++++++++++++++++++++++++
 src/MatlabReader.cpp |   5 +
 src/Matrix.cpp       |  23 ++---
 src/Matrix.h         |   5 +-
 6 files changed, 510 insertions(+), 20 deletions(-)
 create mode 100644 src/CudaMatrix.cpp
 create mode 100644 src/CudaMatrix.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0136edf..835a70a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,11 @@ project(Aurora)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_INCLUDE_CURRENT_DIR ON)
+
+set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
+enable_language(CUDA)
+find_package(CUDAToolkit REQUIRED)
+
 find_package (OpenMP REQUIRED)
 set(CMAKE_C_FLAGS  "${CMAKE_C_FLAGS}  -march=native")
 set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -march=native")
@@ -25,6 +30,14 @@ target_include_directories(Aurora PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_IN
 target_link_libraries(Aurora PUBLIC $<LINK_ONLY:MKL::MKL>)
 target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX)
 target_link_libraries(Aurora PUBLIC matio)
+
+target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include) 
+set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+    -arch=sm_75
+>)
+target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
+
 find_package(GTest  REQUIRED)
 INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
 include_directories(./src/util)
@@ -41,5 +54,12 @@ target_link_libraries(Aurora_Test PUBLIC $<LINK_ONLY:MKL::MKL>)
 target_link_libraries(Aurora_Test PUBLIC OpenMP::OpenMP_CXX)
 target_link_libraries(Aurora_Test PUBLIC matio)
 target_link_libraries(Aurora_Test PUBLIC ${GTEST_BOTH_LIBRARIES}  )
+
+target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include) 
+set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+target_compile_options(Aurora_Test PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
+    -arch=sm_75
+>)
+target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
 gtest_discover_tests(Aurora_Test )
 #target_link_libraries(CreateMatchedFilter PRIVATE TBB::tbb)
\ No newline at end of file
diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp
new file mode 100644
index 0000000..682da72
--- /dev/null
+++ b/src/CudaMatrix.cpp
@@ -0,0 +1,238 @@
+#include "CudaMatrix.h"
+
+#include "Function.h"
+#include "Matrix.h"
+
+#include <iostream>
+#include <cstddef>
+#include <cuda_runtime.h>
+
+using namespace Aurora;
+
+CudaMatrix::CudaMatrix(std::shared_ptr<float> aData, std::vector<int> aInfo, ValueType aValueType)
+    : mValueType(aValueType)
+    , mData(aData)
+    , mInfo(aInfo)
+{
+    size_t infoSize = mInfo.size();
+    for(; infoSize<3; ++infoSize)
+    {
+        mInfo.push_back(1);
+    }
+}
+
+bool CudaMatrix::isNull() const
+{
+    return !mData || mInfo.empty();
+}
+
+bool CudaMatrix::isNan() const
+{
+    for(size_t i=0; i<getDataSize(); ++i)
+    {
+        if(mData.get()[i] == mData.get()[i])
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool CudaMatrix::isScalar() const
+{
+    return (getDimSize(0) == 1 &&
+            getDimSize(1) == 1 &&
+            getDimSize(2) < 2);
+}
+
+float CudaMatrix::getScalar() const
+{
+    if (isNull()) return 0.0;
+    if (isNull()) return 0.0;
+    return getData()[0];
+}
+
+bool CudaMatrix::isVector() const
+{
+    if (getDimSize(2)>1) return false;
+    if (isScalar()) return false;
+    return getDimSize(0) == 1 ||
+           getDimSize(1) == 1;
+}
+
+int CudaMatrix::getDims() const
+{
+    if(mInfo[2] > 1)
+    {
+        return 3;
+    }
+    return 2;
+}
+
+float *CudaMatrix::getData() const
+{
+    return mData.get();
+}
+
+int CudaMatrix::getDimSize(int aIndex) const
+{
+    if (aIndex >= 0 && aIndex < 3) {
+        return mInfo.at(aIndex);
+    }
+    return 0;
+}
+
+size_t CudaMatrix::getDataSize() const
+{
+    if (!mData.get())return 0;
+    size_t ret = 1;
+    for (auto v: mInfo) {
+        ret *= v;
+    }
+    return ret;
+}
+
+void CudaMatrix::forceReshape(int rows, int columns, int slices)
+{
+    mInfo = {rows,columns,slices};
+}
+
+bool CudaMatrix::compareShape(const CudaMatrix &other) const
+{
+    if (mInfo[2] == 1 && other.mInfo[2] == 1) {
+        if (mInfo[0]==1 && other.mInfo[1] == 1 && mInfo[1] == other.mInfo[0]) return true;
+        if (mInfo[1]==1 && other.mInfo[0] == 1 && mInfo[0] == other.mInfo[1]) return true;
+    }
+    for (int i = 0; i < mInfo.size(); ++i) {
+        if (mInfo[i] != other.mInfo[i]) return false;
+    }
+    return true;
+}
+
+CudaMatrix CudaMatrix::fromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
+{
+    if (!aData)
+    {
+        return CudaMatrix();
+    }
+    std::vector<int> vector{aRows, aCols, aSlices};
+    CudaMatrix ret({aData, gpuFree}, vector, aType);
+    return ret;
+}
+
+CudaMatrix CudaMatrix::copyFromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
+{
+    if (!aData)
+    {
+        return CudaMatrix();
+    }
+    float* data = nullptr;
+    unsigned long long size =  aRows * aCols * aSlices * aType;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    cudaMemcpy(data, aData, sizeof(float) * size, cudaMemcpyDeviceToDevice);
+    std::vector<int> vector{aRows, aCols, aSlices};
+    return CudaMatrix({data, gpuFree}, vector, aType);
+}
+
+CudaMatrix CudaMatrix::deepCopy() const
+{
+    float* data = nullptr;
+    unsigned long long size =  getDataSize() * getValueType();
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToDevice);
+    return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+}
+
+Matrix CudaMatrix::toHostMatrix() const
+{
+    unsigned long long size = getDataSize() * getValueType();
+    float* data = new float[size];
+    cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost);
+    return Matrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+}
+
+CudaMatrix CudaMatrix::block(int aDim,int aBeginIndex, int aEndIndex) const
+{
+    if(aDim > 2)
+    {
+        std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
+        return CudaMatrix();
+    }
+    if (isVector() && aDim == 0 && getDimSize(1)>1)
+    {
+        aDim = 1;
+    }
+
+    if (aBeginIndex>=getDimSize(aDim) || aBeginIndex<0)
+    {
+        std::cerr<<"CudaMatrix block BeginIndx error!BeginIndx:"<<aBeginIndex<<std::endl;
+        return CudaMatrix();
+    }
+
+    if (aEndIndex>=getDimSize(aDim) || aEndIndex<0)
+    {
+        std::cerr<<"CudaMatrix block EndIndex error!EndIndex:"<<aEndIndex<<std::endl;
+        return CudaMatrix();
+    }
+
+    if (aEndIndex < aBeginIndex)
+    {
+        std::cerr<<"CudaMatrix block EndIndex can not less than BeginIndex ! BeginIndex:"<<aBeginIndex <<", EndIndex:"<<aEndIndex<<std::endl;
+        return CudaMatrix();
+    }
+
+    int dimLength = aEndIndex - aBeginIndex + 1;
+    int dataSize = getDataSize()/getDimSize(aDim)*dimLength;
+    float * dataOutput = nullptr;
+    cudaMalloc((void**)&dataOutput, sizeof(float) * dataSize * getValueType());
+    int colStride = getDimSize(0);
+    int sliceStride = getDimSize(0)*getDimSize(1);
+
+    switch (aDim)
+    {
+        case 0:
+        {
+            int colStride2 = dimLength;
+            int sliceStride2 = dimLength*getDimSize(1);
+            for (size_t i = 0; i < getDimSize(2); i++)
+            {
+                for (size_t j = 0; j < getDimSize(1); j++)
+                {
+                    cudaMemcpy(dataOutput + (colStride2 * j + i * sliceStride2)*getValueType(), 
+                               mData.get()+ (aBeginIndex + j * colStride + i * sliceStride)*getValueType(),
+                               sizeof(float) * colStride2*getValueType(), cudaMemcpyDeviceToDevice);
+                }
+            }
+            return CudaMatrix::fromRawData(dataOutput,dimLength,getDimSize(1),getDimSize(2),getValueType());
+        }
+        case 1:
+        {
+            int colStride2 = getDimSize(0);
+            int copySize = dimLength*getDimSize(0);
+            for (size_t i = 0; i < getDimSize(2); i++)
+            {
+                cudaMemcpy(dataOutput +  getValueType()*(i * copySize), 
+                           mData.get() + getValueType()*(aBeginIndex * colStride + i * sliceStride),
+                           sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);
+            }
+            return CudaMatrix::fromRawData(dataOutput,getDimSize(0),dimLength,getDimSize(2),getValueType());
+        }
+        case 2:
+        {
+            int copySize = dimLength*sliceStride;
+            cudaMemcpy(dataOutput, 
+                       mData.get() + aBeginIndex * sliceStride*getValueType(),
+                       sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);
+            return CudaMatrix::fromRawData(dataOutput,getDimSize(0),getDimSize(1),dimLength,getValueType());
+        }
+    }
+}
+
+bool CudaMatrix::setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value)
+{
+    if(aDim>2 )
+    {
+        std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
+        return false;
+    }
+}
\ No newline at end of file
diff --git a/src/CudaMatrix.h b/src/CudaMatrix.h
new file mode 100644
index 0000000..9dc4a09
--- /dev/null
+++ b/src/CudaMatrix.h
@@ -0,0 +1,239 @@
+#ifndef CUDAMATRIX_H
+#define CUDAMATRIX_H
+
+#include "Matrix.h"
+
+
+namespace Aurora
+{   
+    class CudaMatrix
+    {
+    public:
+        explicit CudaMatrix(std::shared_ptr<float> aData = std::shared_ptr<float>(),
+                        std::vector<int> aInfo = std::vector<int>(),
+                        ValueType aValueType = Normal);
+
+        /**
+         * Create from a Raw data(float array).
+         * Use Raw data which create like new float[size]() as a data source
+         * and the share_ptr's deleter will be std::default_delete<float[]>
+         * @param data
+         * @param rows
+         * @param cols
+         * @param slices
+         * @param type
+         * @return
+         */
+        static CudaMatrix fromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
+
+        /**
+         * Create from a Raw data(float array) with copy the data to a new mkl memory.
+         * @param data
+         * @param rows
+         * @param cols
+         * @param slices
+         * @param type
+         * @return
+         */
+        static CudaMatrix copyFromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
+
+        /**
+         * 深拷贝操作
+         * @return 深拷贝的Matrix对象
+         */
+        CudaMatrix deepCopy() const;
+
+        // add
+        CudaMatrix operator+(float aScalar) const;
+        friend CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator+(const CudaMatrix &aMatrix) const;
+        CudaMatrix operator+(CudaMatrix &&aMatrix) const;
+        friend CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther);
+
+        // sub
+        CudaMatrix operator-(float aScalar) const;
+        friend CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator-(const CudaMatrix &aMatrix) const;
+        CudaMatrix operator-(CudaMatrix &&aMatrix) const;
+        friend CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther);
+
+        //negetive
+        friend CudaMatrix operator-(CudaMatrix &&aMatrix);
+        friend CudaMatrix operator-(const CudaMatrix &aMatrix);
+
+        // mul
+        CudaMatrix operator*(float aScalar) const;
+        friend CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator*(const CudaMatrix &aMatrix) const;
+        CudaMatrix operator*(CudaMatrix &&aMatrix) const;
+        friend CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther);
+
+        // div
+        CudaMatrix operator/(float aScalar) const;
+        friend CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator/(const CudaMatrix &aMatrix) const;
+        CudaMatrix operator/(CudaMatrix &&aMatrix) const;
+        friend CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther);
+
+        // pow
+        CudaMatrix operator^(int times) const;
+        friend CudaMatrix operator^(CudaMatrix &&aMatrix,int times);
+
+        //compare
+        CudaMatrix operator>(float aScalar) const;
+        friend CudaMatrix operator>(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator>(const CudaMatrix &aMatrix) const;
+
+        CudaMatrix operator<(float aScalar) const;
+        friend CudaMatrix operator<(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator<(const CudaMatrix &aMatrix) const;
+
+        CudaMatrix operator>=(float aScalar) const;
+        friend CudaMatrix operator>=(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator>=(const CudaMatrix &aMatrix) const;
+
+        CudaMatrix operator<=(float aScalar) const;
+        friend CudaMatrix operator<=(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator<=(const CudaMatrix &aMatrix) const;
+
+        CudaMatrix operator==(float aScalar) const;
+        friend CudaMatrix operator==(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator==(const CudaMatrix &aMatrix) const;
+
+        CudaMatrix operator!=(float aScalar) const;
+        friend CudaMatrix operator!=(float aScalar, const CudaMatrix &aMatrix);
+        CudaMatrix operator!=(const CudaMatrix &aMatrix) const;
+
+        // sub
+        float& operator[](size_t index);
+        float operator[](size_t index) const;
+
+        /**
+         * 切块操作
+         * 
+         * @param aDim 需要切块的维度，
+         * @param aBeginIndx 起始索引，包含
+         * @param aEndIndex 终止索引，包含
+         * @return Matrix 返回矩阵
+         */
+        CudaMatrix block(int aDim,int aBeginIndx, int aEndIndex) const;
+
+        bool setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value);
+        bool setBlockComplexValue(int aDim,int aBeginIndx, int aEndIndex,std::complex<float> value);
+
+
+        bool setBlock(int aDim,int aBeginIndx, int aEndIndex,const CudaMatrix& src);
+
+        /**
+         * 矩阵乘法
+         * @attention 目前只支持矩阵乘向量
+         * @param aOther 
+         * @return Matrix 
+         */
+        CudaMatrix mul(const CudaMatrix& aOther) const;
+        
+        /**
+         * 矩阵乘法
+         * @attention 目前只支持矩阵乘向量
+         * @param aOther 
+         * @return Matrix 
+         */
+        CudaMatrix mul(CudaMatrix&& aOther) const;
+
+        /**
+         * print matrix , only support 2d matrix now
+         */
+        void printf();
+
+        void printfShape();
+
+        bool isScalar() const;
+
+        float getScalar() const;
+
+        /**
+         * Get is the Matrix's data is empty or size is zero.
+         * @return
+         */
+        bool isNull() const;
+
+        bool isNan() const;
+
+        bool isVector() const;
+
+        bool isMKLAllocated() const;
+
+        /**
+         * Get dimension count of the matrix
+         * @return  dimension count
+         */
+        int getDims() const;
+
+        float *getData() const;
+
+        int getDimSize(int index) const;
+
+        /**
+         * Compare matrix shape
+         * @param other matrix
+         * @return is identity
+         */
+        bool compareShape(const CudaMatrix& other) const;
+
+        /**
+         * Get the data size
+         * @return the unit count of the matrix
+         */
+        size_t getDataSize() const;
+
+        /**
+         * Get Value type as normal and complex,
+         * complex use std::complex<float>,
+         * it's contains two float value.
+         * @return
+         */
+        ValueType getValueType() const {
+            return mValueType;
+        }
+
+        /**
+         * Set Value type as normal and complex,
+         * @return
+         */
+        void setValueType(ValueType aValueType) {
+            mValueType = aValueType;
+        }
+
+        /**
+         * return true if the valueType is complex,
+         * @return
+         */
+        bool isComplex() const {
+            if(mValueType == Complex)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        void forceReshape(int rows, int columns, int slices);
+
+        Matrix toHostMatrix() const;
+
+
+    private:
+        ValueType mValueType = Normal;
+        std::shared_ptr<float> mData;
+        std::vector<int> mInfo;
+    };
+}
+
+#endif
diff --git a/src/MatlabReader.cpp b/src/MatlabReader.cpp
index f6fba2c..606fe42 100644
--- a/src/MatlabReader.cpp
+++ b/src/MatlabReader.cpp
@@ -92,6 +92,11 @@ Aurora::Matrix MatlabReader::read(const std::string& aFieldName)
                 std::copy((float*)matvar->data, (float*)matvar->data + matrixSize, matrixData);
                 break;
             }
+            case MAT_T_DOUBLE:
+            {
+                std::copy((double*)matvar->data, (double*)matvar->data + matrixSize, matrixData);
+                break;
+            }
         }
         result = Aurora::Matrix::fromRawData(matrixData,rows,columns,slices);
     }
diff --git a/src/Matrix.cpp b/src/Matrix.cpp
index adccf22..cee197a 100644
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -1,4 +1,5 @@
 #include "Matrix.h"
+#include "CudaMatrix.h"
 
 #include <cmath>
 #include <cstddef>
@@ -393,24 +394,12 @@ namespace Aurora {
         return ret;
     }
 
-    Matrix Matrix::NewCuda(float *data, int rows, int cols, int slices, ValueType type)
+    CudaMatrix Matrix::toDeviceMatrix() const
     {
-        if (!data) return Matrix();
-        std::vector<int> vector(3);
-        vector[0]=rows;
-        vector[1] = (cols > 0?cols:1);
-        vector[2] = (slices > 0?slices:1);
-        Matrix ret({data, gpuFree}, vector);
-        if (type != Normal)ret.setValueType(type);
-        ret.mCuda_Allocated = true;
-        return ret;
-    }
-
-    Matrix Matrix::toHostMatrix() const
-    {
-        float* data = new float[getDataSize()];
-        cudaMemcpy(data, mData.get(), sizeof(float) * getDataSize() * (mValueType == Normal ? 1 : 2), cudaMemcpyDeviceToHost);
-        return fromRawData(data, mInfo[0], mInfo[1], mInfo[2], getValueType());
+        float* deviceData = nullptr;
+        cudaMalloc((void**)&deviceData, sizeof(float) * getDataSize() * getValueType() );
+        cudaMemcpy(deviceData, mData.get(), sizeof(float) * getDataSize() * getValueType(), cudaMemcpyHostToDevice);
+        return CudaMatrix::fromRawData(deviceData, mInfo[0], mInfo[1], mInfo[2], getValueType());
     }
 
     Matrix Matrix::New(float *data, const Matrix &shapeMatrix) {
diff --git a/src/Matrix.h b/src/Matrix.h
index e2116e6..1a59098 100644
--- a/src/Matrix.h
+++ b/src/Matrix.h
@@ -12,6 +12,7 @@ namespace Aurora {
         Complex
     };
     const int $ = -1;
+    class CudaMatrix;
     
     class Matrix {
     public:
@@ -87,7 +88,6 @@ namespace Aurora {
          */
         static Matrix New(float *data, const Matrix &shapeMatrix);
 
-        static Matrix NewCuda(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
         /**
          * New a mkl calculate based Matrix
          * @attention  Memory are allocate by Aurora:malloc function
@@ -288,7 +288,7 @@ namespace Aurora {
 
         void forceReshape(int rows, int columns, int slices);
 
-        Matrix toHostMatrix() const;
+        CudaMatrix toDeviceMatrix() const;
 
 
     private:
@@ -296,7 +296,6 @@ namespace Aurora {
         std::shared_ptr<float> mData;
         std::vector<int> mInfo;
         bool mMKL_Allocated = false;
-        bool mCuda_Allocated = false;
     };
 }