Aurora/src/CudaMatrix.cpp

#ifdef USE_CUDA
#include "CudaMatrix.h"

#include "Function.h"
#include "Matrix.h"

#include <iostream>
#include <cstddef>
#include <cuda_runtime.h>
#include "CudaMatrixPrivate.cuh"

namespace Aurora{

// Builds a matrix around an already-owned buffer.
//   aData      - shared ownership handle of the element buffer
//   aInfo      - extent of each dimension; fewer than three entries are allowed
//   aValueType - value type tag stored in mValueType
// The shape is always stored with at least three entries: missing trailing
// dimensions are filled in with an extent of 1.
CudaMatrix::CudaMatrix(std::shared_ptr<float> aData, std::vector<int> aInfo, ValueType aValueType)
    : mValueType(aValueType)
    , mData(aData)
    , mInfo(aInfo)
{
    if (mInfo.size() < 3)
    {
        // Pad the shape up to (rows, columns, slices).
        mInfo.resize(3, 1);
    }
}

// A matrix is "null" when it has no buffer or no shape information.
bool CudaMatrix::isNull() const
{
    const bool hasBuffer = static_cast<bool>(mData);
    const bool hasShape = !mInfo.empty();
    return !(hasBuffer && hasShape);
}

bool CudaMatrix::isNan() const
{
    for(size_t i=0; i<getDataSize(); ++i)
    {
        if(mData.get()[i] == mData.get()[i])
        {
            return false;
        }
    }
    return true;
}

// True when the first two dimensions are exactly 1 and the third is below 2.
bool CudaMatrix::isScalar() const
{
    if (getDimSize(0) != 1)
    {
        return false;
    }
    if (getDimSize(1) != 1)
    {
        return false;
    }
    return getDimSize(2) < 2;
}

float CudaMatrix::getScalar() const
{
    if (isNull()) return 0.0;
    if (isNull()) return 0.0;
    return getData()[0];
}

// True for a single-slice matrix that is not a scalar and has exactly one
// row or exactly one column.
bool CudaMatrix::isVector() const
{
    if (getDimSize(2) > 1 || isScalar())
    {
        return false;
    }
    const bool singleRow = (getDimSize(0) == 1);
    return singleRow ? true : (getDimSize(1) == 1);
}

// Reports 3 when the third dimension holds more than one slice, otherwise 2.
int CudaMatrix::getDims() const
{
    const bool hasSlices = (mInfo[2] > 1);
    return hasSlices ? 3 : 2;
}

// Exposes the raw pointer held by mData; ownership stays with the matrix.
float *CudaMatrix::getData() const
{
    float *rawPointer = mData.get();
    return rawPointer;
}

// Extent of dimension aIndex (0, 1 or 2). Any other index yields 0.
// The lookup uses the bounds-checked vector accessor.
int CudaMatrix::getDimSize(int aIndex) const
{
    const bool outOfRange = (aIndex < 0) || (aIndex >= 3);
    if (outOfRange)
    {
        return 0;
    }
    return mInfo.at(aIndex);
}

// Number of elements described by the shape: the product of every entry of
// mInfo. Returns 0 when there is no buffer.
size_t CudaMatrix::getDataSize() const
{
    if (mData.get() == nullptr)
    {
        return 0;
    }
    size_t count = 1;
    for (std::vector<int>::const_iterator it = mInfo.begin(); it != mInfo.end(); ++it)
    {
        count *= *it;
    }
    return count;
}

// Overwrites the stored shape with (rows, columns, slices).
// No check is made that the new shape matches the buffer, and the buffer
// itself is not touched.
void CudaMatrix::forceReshape(int rows, int columns, int slices)
{
    mInfo.assign({rows, columns, slices});
}

// Returns true when both matrices have compatible shapes.
// Two shapes are compatible when every dimension matches, or when both are
// single-slice and one is a 1xN row while the other is an Nx1 column.
// Shape vectors of different length never match. This also keeps the
// element-wise comparison inside the bounds of both vectors, which matters
// because a shape vector can be empty (isNull() tests for that).
bool CudaMatrix::compareShape(const CudaMatrix &other) const
{
    const std::vector<int> &lhs = mInfo;
    const std::vector<int> &rhs = other.mInfo;

    if (lhs.size() != rhs.size())
    {
        return false;
    }

    // Row-vector vs. column-vector of the same length counts as equal.
    if (lhs.size() >= 3 && lhs[2] == 1 && rhs[2] == 1) {
        if (lhs[0]==1 && rhs[1] == 1 && lhs[1] == rhs[0]) return true;
        if (lhs[1]==1 && rhs[0] == 1 && lhs[0] == rhs[1]) return true;
    }

    for (size_t i = 0; i < lhs.size(); ++i) {
        if (lhs[i] != rhs[i]) return false;
    }
    return true;
}

// Wraps an existing buffer without copying it.
// The returned matrix takes ownership of aData and releases it through
// gpuFree. A null pointer produces a default-constructed matrix.
CudaMatrix CudaMatrix::fromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
{
    if (aData == nullptr)
    {
        return CudaMatrix();
    }
    std::shared_ptr<float> owner(aData, gpuFree);
    return CudaMatrix(owner, std::vector<int>{aRows, aCols, aSlices}, aType);
}

// Creates a matrix that owns a fresh device copy of aData.
//   aData  - source buffer; copied with cudaMemcpyDeviceToDevice, so it must
//            be a device pointer holding aRows*aCols*aSlices*aType floats
//   aType  - value type; its numeric value is used as the float count per element
// Returns a default-constructed matrix when aData is null or when the
// allocation or the copy fails (the failure is logged to std::cerr).
CudaMatrix CudaMatrix::copyFromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
{
    if (!aData)
    {
        return CudaMatrix();
    }

    // Widen before multiplying so large shapes do not overflow int.
    const unsigned long long size = static_cast<unsigned long long>(aRows) * aCols * aSlices * aType;

    float* data = nullptr;
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix copyFromRawData allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }

    status = cudaMemcpy(data, aData, sizeof(float) * size, cudaMemcpyDeviceToDevice);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix copyFromRawData copy failed: "<<cudaGetErrorString(status)<<std::endl;
        cudaFree(data);
        return CudaMatrix();
    }

    std::vector<int> vector{aRows, aCols, aSlices};
    return CudaMatrix({data, gpuFree}, vector, aType);
}

// Returns a matrix with the same shape and value type that owns its own
// device copy of the data.
// A null matrix, or a failed allocation/copy (logged to std::cerr), yields a
// default-constructed matrix.
CudaMatrix CudaMatrix::deepCopy() const
{
    // Guard first: the shape accessors below are bounds-checked and would
    // throw on a matrix without shape information.
    if (isNull())
    {
        return CudaMatrix();
    }

    float* data = nullptr;
    unsigned long long size =  getDataSize() * getValueType();
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix deepCopy allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }

    status = cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToDevice);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix deepCopy copy failed: "<<cudaGetErrorString(status)<<std::endl;
        cudaFree(data);
        return CudaMatrix();
    }
    return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
}

// Copies the device buffer into a newly allocated host array and hands that
// array to Matrix::fromRawData together with the shape and value type.
// A failed device-to-host copy is logged to std::cerr; the host matrix is
// still returned, in which case its contents are unspecified.
// NOTE(review): the array is allocated with new[]; confirm that
// Matrix::fromRawData releases it with a matching deallocator.
Matrix CudaMatrix::toHostMatrix() const
{
    unsigned long long size = getDataSize() * getValueType();
    float* data = new float[size];
    const cudaError_t status = cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix toHostMatrix copy to host failed: "<<cudaGetErrorString(status)<<std::endl;
    }
    return Matrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
}

// Extracts the sub-matrix covering indices [aBeginIndex, aEndIndex]
// (both inclusive) along dimension aDim into a newly allocated matrix.
//   aDim        - 0 (rows), 1 (columns) or 2 (slices). For a row vector a
//                 request along dimension 0 is redirected to dimension 1.
//   aBeginIndex - first index to keep, 0 <= aBeginIndex < extent
//   aEndIndex   - last index to keep, aBeginIndex <= aEndIndex < extent
// Returns a default-constructed matrix (after logging to std::cerr) when the
// arguments are invalid, the matrix is null, or a CUDA call fails.
//
// Every case reduces to the same operation: copy `chunkCount` runs of
// `chunkLength` consecutive elements, whose starts are `srcStride` elements
// apart in the source, into a densely packed destination. Offsets are scaled
// by the value type because each element occupies that many floats.
CudaMatrix CudaMatrix::block(int aDim,int aBeginIndex, int aEndIndex) const
{
    if(aDim > 2 || aDim < 0)
    {
        std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
        return CudaMatrix();
    }
    // Must come before any shape query: those are bounds-checked and would
    // throw on a matrix without shape information.
    if (isNull())
    {
        std::cerr<<"CudaMatrix block can not be applied to a null matrix!"<<std::endl;
        return CudaMatrix();
    }
    if (isVector() && aDim == 0 && getDimSize(1)>1)
    {
        aDim = 1;
    }

    if (aBeginIndex>=getDimSize(aDim) || aBeginIndex<0)
    {
        std::cerr<<"CudaMatrix block BeginIndx error!BeginIndx:"<<aBeginIndex<<std::endl;
        return CudaMatrix();
    }

    if (aEndIndex>=getDimSize(aDim) || aEndIndex<0)
    {
        std::cerr<<"CudaMatrix block EndIndex error!EndIndex:"<<aEndIndex<<std::endl;
        return CudaMatrix();
    }

    if (aEndIndex < aBeginIndex)
    {
        std::cerr<<"CudaMatrix block EndIndex can not less than BeginIndex ! BeginIndex:"<<aBeginIndex <<", EndIndex:"<<aEndIndex<<std::endl;
        return CudaMatrix();
    }

    // All size arithmetic is done in size_t so large matrices do not overflow int.
    const size_t valueType = static_cast<size_t>(getValueType());
    const size_t rows = static_cast<size_t>(getDimSize(0));
    const size_t cols = static_cast<size_t>(getDimSize(1));
    const size_t slices = static_cast<size_t>(getDimSize(2));
    const size_t begin = static_cast<size_t>(aBeginIndex);
    const int dimLength = aEndIndex - aBeginIndex + 1;
    const size_t length = static_cast<size_t>(dimLength);
    const size_t sliceStride = rows * cols;

    size_t chunkLength = 0;   // consecutive elements per copy
    size_t chunkCount = 0;    // number of copies
    size_t srcStride = 0;     // distance between chunk starts in the source
    size_t srcOffset = 0;     // start of the first chunk in the source
    int outRows = getDimSize(0);
    int outCols = getDimSize(1);
    int outSlices = getDimSize(2);

    switch (aDim)
    {
        case 0:
            // One run per column; columns of consecutive slices are
            // themselves `rows` elements apart, so a single stride covers all.
            chunkLength = length;
            chunkCount = cols * slices;
            srcStride = rows;
            srcOffset = begin;
            outRows = dimLength;
            break;
        case 1:
            // The selected columns of one slice are contiguous: one run per slice.
            chunkLength = length * rows;
            chunkCount = slices;
            srcStride = sliceStride;
            srcOffset = begin * rows;
            outCols = dimLength;
            break;
        case 2:
        default:
            // The selected slices are one contiguous range.
            chunkLength = length * sliceStride;
            chunkCount = 1;
            srcStride = 0;
            srcOffset = begin * sliceStride;
            outSlices = dimLength;
            break;
    }

    float * dataOutput = nullptr;
    cudaError_t status = cudaMalloc((void**)&dataOutput, sizeof(float) * chunkLength * chunkCount * valueType);
    if (status != cudaSuccess)
    {
        std::cerr<<"CudaMatrix block allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    if (!dataOutput)
    {
        // Nothing to copy (some other dimension has extent 0).
        return CudaMatrix();
    }

    for (size_t chunk = 0; chunk < chunkCount; ++chunk)
    {
        status = cudaMemcpy(dataOutput + chunk * chunkLength * valueType,
                            mData.get() + (srcOffset + chunk * srcStride) * valueType,
                            sizeof(float) * chunkLength * valueType, cudaMemcpyDeviceToDevice);
        if (status != cudaSuccess)
        {
            std::cerr<<"CudaMatrix block copy failed: "<<cudaGetErrorString(status)<<std::endl;
            cudaFree(dataOutput);
            return CudaMatrix();
        }
    }
    return CudaMatrix::fromRawData(dataOutput,outRows,outCols,outSlices,getValueType());
}

// Validates the dimension argument only: returns false (after logging) when
// aDim is greater than 2 and true otherwise.
// TODO: aBeginIndx, aEndIndex and value are currently unused - no element of
// the matrix is modified by this function yet.
bool CudaMatrix::setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value)
{
    const bool dimSupported = !(aDim > 2);
    if (!dimSupported)
    {
        std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
    }
    return dimSupported;
}


// Element-wise matrix + scalar into a newly allocated matrix of the same shape.
// Complex matrices are rejected. A default-constructed matrix is returned on
// rejection or when the result buffer cannot be allocated; in the latter case
// the add routine is no longer invoked with a null output pointer.
CudaMatrix CudaMatrix::operator+(float aScalar) const{
    if (isComplex())
    {
        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
        return  CudaMatrix();
    }
    float* data = nullptr;
    unsigned long long size =  getDataSize() * getValueType();
    const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr<<"operator+(float aScalar) allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return  CudaMatrix();
    }
    if (!data)
    {
        // Zero-sized request: there is nothing to compute.
        return  CudaMatrix();
    }
    auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
    unaryAdd(getData(),aScalar,out.getData(),getDataSize());
    return out;
}

// Element-wise scalar + matrix into a newly allocated matrix of the same shape.
// Complex matrices are rejected. A default-constructed matrix is returned on
// rejection or when the result buffer cannot be allocated; in the latter case
// the add routine is no longer invoked with a null output pointer.
CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix){
    if (aMatrix.isComplex())
    {
        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
        return  CudaMatrix();
    }
    float* data = nullptr;
    unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
    const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr<<"operator+(float aScalar) allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return  CudaMatrix();
    }
    if (!data)
    {
        // Zero-sized request: there is nothing to compute.
        return  CudaMatrix();
    }
    auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
    unaryAdd(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
    return out;
}

// In-place scalar + matrix for an expiring matrix: the sum overwrites the
// operand's own buffer and the operand itself is returned by reference.
// A complex operand is left untouched and an error is logged.
// The returned reference designates the caller's object, so it must not be
// kept beyond that object's lifetime.
CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix){
    if (!aMatrix.isComplex())
    {
        unaryAdd(aMatrix.getData(), aScalar, aMatrix.getData(), aMatrix.getDataSize());
    }
    else
    {
        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
    }
    return aMatrix;
}

// In-place matrix + scalar for an expiring matrix: the sum overwrites the
// operand's own buffer and the operand itself is returned by reference.
// A complex operand is left untouched and an error is logged.
// The returned reference designates the caller's object, so it must not be
// kept beyond that object's lifetime.
CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar){
    const bool supported = !aMatrix.isComplex();
    if (supported)
    {
        unaryAdd(aMatrix.getData(), aScalar, aMatrix.getData(), aMatrix.getDataSize());
        return aMatrix;
    }
    std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
    return aMatrix;
}

// Element-wise matrix + matrix into a newly allocated matrix shaped like *this.
// Both operands must have the same element count and the same real/complex
// kind; otherwise an error is logged and a default-constructed matrix is
// returned. The same is returned when the result buffer cannot be allocated.
// The add runs over getDataSize() * getValueType() floats so that every
// stored float of a complex operand is summed - the buffers are sized with
// that factor throughout this file.
CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const{
    if (this->getDataSize() != aMatrix.getDataSize()) {
        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
        <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
        return CudaMatrix();
    }
    if (this->isComplex() != aMatrix.isComplex()) {
        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
        <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
        return CudaMatrix();
    }
    float* data = nullptr;
    unsigned long long size =  getDataSize() * getValueType();
    const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr<<"operator+ allocation failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    if (!data)
    {
        // Zero-sized request: there is nothing to compute.
        return CudaMatrix();
    }
    auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
    unaryAdd(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize() * getValueType());
    return out;
}

// Element-wise matrix + matrix where the right operand is expiring: the sum
// is written into the right operand's buffer and a matrix sharing that
// buffer is returned, so no allocation takes place.
// Both operands must have the same element count and the same real/complex
// kind; otherwise an error is logged and a default-constructed matrix is
// returned.
// The add runs over getDataSize() * getValueType() floats so that every
// stored float of a complex operand is summed.
CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const{
    if (this->getDataSize() != aMatrix.getDataSize()) {
        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
        <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
        return CudaMatrix();
    }
    if (this->isComplex() != aMatrix.isComplex()) {
        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
        <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
        return CudaMatrix();
    }
    unaryAdd(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize() * getValueType());
    return aMatrix;
}


// Element-wise matrix + matrix where the left operand is expiring: the sum
// is written into the left operand's buffer and a matrix sharing that buffer
// is returned, so no allocation takes place.
// Both operands must have the same element count and the same real/complex
// kind; otherwise an error is logged and a default-constructed matrix is
// returned.
// The add runs over getDataSize() * getValueType() floats so that every
// stored float of a complex operand is summed.
CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
    if (aOther.getDataSize() != aMatrix.getDataSize()) {
        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
        <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
        return CudaMatrix();
    }
    if (aOther.isComplex() != aMatrix.isComplex()) {
        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
        <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
        return CudaMatrix();
    }
    unaryAdd(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize() * aOther.getValueType());
    return aMatrix;
}

        // mul
    // Element-wise matrix * scalar into a newly allocated matrix of the same shape.
    // Complex matrices are rejected. A default-constructed matrix is returned on
    // rejection or when the result buffer cannot be allocated; in the latter case
    // the multiply routine is no longer invoked with a null output pointer.
    CudaMatrix CudaMatrix::operator*(float aScalar) const{
        if (isComplex())
        {
            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
            return  CudaMatrix();
        }
        float* data = nullptr;
        unsigned long long size =  getDataSize() * getValueType();
        const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
        if (status != cudaSuccess)
        {
            std::cerr<<"operator*(float aScalar) allocation failed: "<<cudaGetErrorString(status)<<std::endl;
            return  CudaMatrix();
        }
        if (!data)
        {
            // Zero-sized request: there is nothing to compute.
            return  CudaMatrix();
        }
        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
        unaryMul(getData(),aScalar,out.getData(),getDataSize());
        return out;
    }
    // Element-wise scalar * matrix into a newly allocated matrix of the same shape.
    // Complex matrices are rejected. A default-constructed matrix is returned on
    // rejection or when the result buffer cannot be allocated; in the latter case
    // the multiply routine is no longer invoked with a null output pointer.
    CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix){
        if (aMatrix.isComplex())
        {
            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
            return  CudaMatrix();
        }
        float* data = nullptr;
        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
        const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
        if (status != cudaSuccess)
        {
            std::cerr<<"operator*(float aScalar) allocation failed: "<<cudaGetErrorString(status)<<std::endl;
            return  CudaMatrix();
        }
        if (!data)
        {
            // Zero-sized request: there is nothing to compute.
            return  CudaMatrix();
        }
        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
        unaryMul(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
        return out;
    }
    // In-place scalar * matrix for an expiring matrix: the product overwrites
    // the operand's own buffer and the operand itself is returned by reference.
    // A complex operand is left untouched and an error is logged.
    // The returned reference designates the caller's object, so it must not be
    // kept beyond that object's lifetime.
    CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix){
        if (aMatrix.isComplex())
        {
            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
            return  aMatrix;
        }
        unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
        return aMatrix;
    }
    // In-place matrix * scalar for an expiring matrix: the product overwrites
    // the operand's own buffer and the operand itself is returned by reference.
    // A complex operand is left untouched and an error is logged.
    // The returned reference designates the caller's object, so it must not be
    // kept beyond that object's lifetime.
    CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar){
        if (aMatrix.isComplex())
        {
            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
            return  aMatrix;
        }
        unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
        return aMatrix;
    }
    // Element-wise matrix * matrix into a newly allocated matrix shaped like *this.
    // Both operands must have the same element count and the same real/complex
    // kind; otherwise an error is logged and a default-constructed matrix is
    // returned. The same is returned when the result buffer cannot be allocated.
    // NOTE(review): for complex operands only the first getDataSize() floats are
    // passed to the multiply routine, although the buffers hold
    // getDataSize() * getValueType() floats - confirm the intended complex
    // semantics before relying on this path.
    CudaMatrix CudaMatrix::operator*(const CudaMatrix &aMatrix) const{
        if (this->getDataSize() != aMatrix.getDataSize()) {
            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
            return CudaMatrix();
        }
        if (this->isComplex() != aMatrix.isComplex()) {
            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
            return CudaMatrix();
        }
        float* data = nullptr;
        unsigned long long size =  getDataSize() * getValueType();
        const cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
        if (status != cudaSuccess)
        {
            std::cerr<<"operator* allocation failed: "<<cudaGetErrorString(status)<<std::endl;
            return CudaMatrix();
        }
        if (!data)
        {
            // Zero-sized request: there is nothing to compute.
            return CudaMatrix();
        }
        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
        unaryMul(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());
        return out;
    }
    // Element-wise matrix * matrix where the right operand is expiring: the
    // product is written into the right operand's buffer and a matrix sharing
    // that buffer is returned, so no allocation takes place.
    // Both operands must have the same element count and the same real/complex
    // kind; otherwise an error is logged and a default-constructed matrix is
    // returned.
    // NOTE(review): for complex operands only the first getDataSize() floats are
    // passed to the multiply routine - confirm the intended complex semantics.
    CudaMatrix CudaMatrix::operator*(CudaMatrix &&aMatrix) const{
        if (this->getDataSize() != aMatrix.getDataSize()) {
            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
            return CudaMatrix();
        }
        if (this->isComplex() != aMatrix.isComplex()) {
            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
            return CudaMatrix();
        }
        unaryMul(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
        return aMatrix;
    }
    // Element-wise matrix * matrix where the left operand is expiring: the
    // product is written into the left operand's buffer and a matrix sharing
    // that buffer is returned, so no allocation takes place.
    // Both operands must have the same element count and the same real/complex
    // kind; otherwise an error is logged and a default-constructed matrix is
    // returned.
    // NOTE(review): for complex operands only the first getDataSize() floats are
    // passed to the multiply routine - confirm the intended complex semantics.
    CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther){
        if (aOther.getDataSize() != aMatrix.getDataSize()) {
            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
            return CudaMatrix();
        }
        if (aOther.isComplex() != aMatrix.isComplex()) {
            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
            <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
            return CudaMatrix();
        }
        unaryMul(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
        return aMatrix;
    }
}
#endif // USE_CUDA
Add cuda compile define and cmake setting 2023-10-30 13:34:51 +08:00			`#ifdef USE_CUDA`
Add CudaMatrix. 2023-10-30 10:28:24 +08:00			`#include "CudaMatrix.h"`

			`#include "Function.h"`
			`#include "Matrix.h"`

			`#include <iostream>`
			`#include <cstddef>`
			`#include <cuda_runtime.h>`
CudaMatrix Operator logic patch1 2023-10-31 14:35:29 +08:00			`#include "CudaMatrixPrivate.cuh"`
Add CudaMatrix. 2023-10-30 10:28:24 +08:00
Fix UnitTest add cudamatrix add and mul 2023-11-01 14:31:29 +08:00			`namespace Aurora{`
Add CudaMatrix. 2023-10-30 10:28:24 +08:00
			`CudaMatrix::CudaMatrix(std::shared_ptr<float> aData, std::vector<int> aInfo, ValueType aValueType)`
			`: mValueType(aValueType)`
			`, mData(aData)`
			`, mInfo(aInfo)`
			`{`
			`size_t infoSize = mInfo.size();`
			`for(; infoSize<3; ++infoSize)`
			`{`
			`mInfo.push_back(1);`
			`}`
			`}`

			`bool CudaMatrix::isNull() const`
			`{`
			`return !mData \|\| mInfo.empty();`
			`}`

			`bool CudaMatrix::isNan() const`
			`{`
			`for(size_t i=0; i<getDataSize(); ++i)`
			`{`
			`if(mData.get()[i] == mData.get()[i])`
			`{`
			`return false;`
			`}`
			`}`
			`return true;`
			`}`

			`bool CudaMatrix::isScalar() const`
			`{`
			`return (getDimSize(0) == 1 &&`
			`getDimSize(1) == 1 &&`
			`getDimSize(2) < 2);`
			`}`

			`float CudaMatrix::getScalar() const`
			`{`
			`if (isNull()) return 0.0;`
			`if (isNull()) return 0.0;`
			`return getData()[0];`
			`}`

			`bool CudaMatrix::isVector() const`
			`{`
			`if (getDimSize(2)>1) return false;`
			`if (isScalar()) return false;`
			`return getDimSize(0) == 1 \|\|`
			`getDimSize(1) == 1;`
			`}`

			`int CudaMatrix::getDims() const`
			`{`
			`if(mInfo[2] > 1)`
			`{`
			`return 3;`
			`}`
			`return 2;`
			`}`

			`float *CudaMatrix::getData() const`
			`{`
			`return mData.get();`
			`}`

			`int CudaMatrix::getDimSize(int aIndex) const`
			`{`
			`if (aIndex >= 0 && aIndex < 3) {`
			`return mInfo.at(aIndex);`
			`}`
			`return 0;`
			`}`

			`size_t CudaMatrix::getDataSize() const`
			`{`
			`if (!mData.get())return 0;`
			`size_t ret = 1;`
			`for (auto v: mInfo) {`
			`ret *= v;`
			`}`
			`return ret;`
			`}`

			`void CudaMatrix::forceReshape(int rows, int columns, int slices)`
			`{`
			`mInfo = {rows,columns,slices};`
			`}`

			`bool CudaMatrix::compareShape(const CudaMatrix &other) const`
			`{`
			`if (mInfo[2] == 1 && other.mInfo[2] == 1) {`
			`if (mInfo[0]==1 && other.mInfo[1] == 1 && mInfo[1] == other.mInfo[0]) return true;`
			`if (mInfo[1]==1 && other.mInfo[0] == 1 && mInfo[0] == other.mInfo[1]) return true;`
			`}`
			`for (int i = 0; i < mInfo.size(); ++i) {`
			`if (mInfo[i] != other.mInfo[i]) return false;`
			`}`
			`return true;`
			`}`

			`CudaMatrix CudaMatrix::fromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)`
			`{`
			`if (!aData)`
			`{`
			`return CudaMatrix();`
			`}`
			`std::vector<int> vector{aRows, aCols, aSlices};`
			`CudaMatrix ret({aData, gpuFree}, vector, aType);`
			`return ret;`
			`}`

			`CudaMatrix CudaMatrix::copyFromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)`
			`{`
			`if (!aData)`
			`{`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = aRows * aCols * aSlices * aType;`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`cudaMemcpy(data, aData, sizeof(float) * size, cudaMemcpyDeviceToDevice);`
			`std::vector<int> vector{aRows, aCols, aSlices};`
			`return CudaMatrix({data, gpuFree}, vector, aType);`
			`}`

			`CudaMatrix CudaMatrix::deepCopy() const`
			`{`
			`float* data = nullptr;`
			`unsigned long long size = getDataSize() * getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToDevice);`
			`return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`}`

			`Matrix CudaMatrix::toHostMatrix() const`
			`{`
			`unsigned long long size = getDataSize() * getValueType();`
			`float* data = new float[size];`
			`cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost);`
			`return Matrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`}`

			`CudaMatrix CudaMatrix::block(int aDim,int aBeginIndex, int aEndIndex) const`
			`{`
			`if(aDim > 2)`
			`{`
			`std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (isVector() && aDim == 0 && getDimSize(1)>1)`
			`{`
			`aDim = 1;`
			`}`

			`if (aBeginIndex>=getDimSize(aDim) \|\| aBeginIndex<0)`
			`{`
			`std::cerr<<"CudaMatrix block BeginIndx error!BeginIndx:"<<aBeginIndex<<std::endl;`
			`return CudaMatrix();`
			`}`

			`if (aEndIndex>=getDimSize(aDim) \|\| aEndIndex<0)`
			`{`
			`std::cerr<<"CudaMatrix block EndIndex error!EndIndex:"<<aEndIndex<<std::endl;`
			`return CudaMatrix();`
			`}`

			`if (aEndIndex < aBeginIndex)`
			`{`
			`std::cerr<<"CudaMatrix block EndIndex can not less than BeginIndex ! BeginIndex:"<<aBeginIndex <<", EndIndex:"<<aEndIndex<<std::endl;`
			`return CudaMatrix();`
			`}`

			`int dimLength = aEndIndex - aBeginIndex + 1;`
			`int dataSize = getDataSize()/getDimSize(aDim)*dimLength;`
			`float * dataOutput = nullptr;`
			`cudaMalloc((void*)&dataOutput, sizeof(float) dataSize * getValueType());`
			`int colStride = getDimSize(0);`
			`int sliceStride = getDimSize(0)*getDimSize(1);`

			`switch (aDim)`
			`{`
			`case 0:`
			`{`
			`int colStride2 = dimLength;`
			`int sliceStride2 = dimLength*getDimSize(1);`
			`for (size_t i = 0; i < getDimSize(2); i++)`
			`{`
			`for (size_t j = 0; j < getDimSize(1); j++)`
			`{`
			`cudaMemcpy(dataOutput + (colStride2 * j + i * sliceStride2)*getValueType(),`
			`mData.get()+ (aBeginIndex + j * colStride + i * sliceStride)*getValueType(),`
			`sizeof(float) * colStride2*getValueType(), cudaMemcpyDeviceToDevice);`
			`}`
			`}`
			`return CudaMatrix::fromRawData(dataOutput,dimLength,getDimSize(1),getDimSize(2),getValueType());`
			`}`
			`case 1:`
			`{`
			`int colStride2 = getDimSize(0);`
			`int copySize = dimLength*getDimSize(0);`
			`for (size_t i = 0; i < getDimSize(2); i++)`
			`{`
			`cudaMemcpy(dataOutput + getValueType()(i copySize),`
			`mData.get() + getValueType()(aBeginIndex colStride + i * sliceStride),`
			`sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);`
			`}`
			`return CudaMatrix::fromRawData(dataOutput,getDimSize(0),dimLength,getDimSize(2),getValueType());`
			`}`
			`case 2:`
Add cuda compile define and cmake setting 2023-10-30 13:34:51 +08:00			`default:`
Add CudaMatrix. 2023-10-30 10:28:24 +08:00			`{`
			`int copySize = dimLength*sliceStride;`
			`cudaMemcpy(dataOutput,`
			`mData.get() + aBeginIndex * sliceStride*getValueType(),`
			`sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);`
			`return CudaMatrix::fromRawData(dataOutput,getDimSize(0),getDimSize(1),dimLength,getValueType());`
			`}`
			`}`
			`}`

			`bool CudaMatrix::setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value)`
			`{`
			`if(aDim>2 )`
			`{`
			`std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;`
			`return false;`
			`}`
Add cuda compile define and cmake setting 2023-10-30 13:34:51 +08:00			`return true;`
			`}`
CudaMatrix Operator logic patch1 2023-10-31 14:35:29 +08:00
Fix UnitTest add cudamatrix add and mul 2023-11-01 14:31:29 +08:00
			`CudaMatrix CudaMatrix::operator+(float aScalar) const{`
			`if (isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = getDataSize() * getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`unaryAdd(getData(),aScalar,out.getData(),getDataSize());`
			`return out;`
			`}`

			`CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = aMatrix.getDataSize() * aMatrix.getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());`
			`unaryAdd(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());`
			`return out;`
			`}`

			`CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return aMatrix;`
			`}`
			`unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());`
			`return aMatrix;`
			`}`

			`CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return aMatrix;`
			`}`
			`unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());`
			`return aMatrix;`
			`}`

CudaMatrix Operator logic patch1 2023-10-31 14:35:29 +08:00			`CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const{`
Fix UnitTest add cudamatrix add and mul 2023-11-01 14:31:29 +08:00			`if (this->getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()`
			`<<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (this->isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
CudaMatrix Operator logic patch1 2023-10-31 14:35:29 +08:00			`float* data = nullptr;`
			`unsigned long long size = getDataSize() * getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`unaryAdd(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());`
			`return out;`
			`}`
Fix UnitTest add cudamatrix add and mul 2023-11-01 14:31:29 +08:00
			`CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const{`
			`if (this->getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()`
			`<<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (this->isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
			`unaryAdd(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());`
			`return aMatrix;`
			`}`


			`CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){`
			`if (aOther.getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()`
			`<<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (aOther.isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
			`unaryAdd(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());`
			`return aMatrix;`
			`}`

			`// mul`
			`CudaMatrix CudaMatrix::operator*(float aScalar) const{`
			`if (isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = getDataSize() * getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`unaryMul(getData(),aScalar,out.getData(),getDataSize());`
			`return out;`
			`}`
			`CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = aMatrix.getDataSize() * aMatrix.getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());`
			`unaryMul(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());`
			`return out;`
			`}`
			`CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return aMatrix;`
			`}`
			`unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());`
			`return aMatrix;`
			`}`
			`CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar){`
			`if (aMatrix.isComplex())`
			`{`
			`std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;`
			`return aMatrix;`
			`}`
			`unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());`
			`return aMatrix;`
			`}`
			`CudaMatrix CudaMatrix::operator*(const CudaMatrix &aMatrix) const{`
			`if (this->getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()`
			`<<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (this->isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
			`float* data = nullptr;`
			`unsigned long long size = getDataSize() * getValueType();`
			`cudaMalloc((void*)&data, sizeof(float) size);`
			`auto out = CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());`
			`unaryMul(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());`
			`return out;`
			`}`
			`CudaMatrix CudaMatrix::operator*(CudaMatrix &&aMatrix) const{`
			`if (this->getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()`
			`<<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (this->isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
			`unaryMul(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());`
			`return aMatrix;`
			`}`
			`CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther){`
			`if (aOther.getDataSize() != aMatrix.getDataSize()) {`
			`std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()`
			`<<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;`
			`return CudaMatrix();`
			`}`
			`if (aOther.isComplex() != aMatrix.isComplex()) {`
			`std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")`
			`<<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;`
			`return CudaMatrix();`
			`}`
			`unaryMul(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());`
			`return aMatrix;`
			`}`
			`}`
CudaMatrix Operator logic patch1 2023-10-31 14:35:29 +08:00			`#endif // USE_CUDA`