diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp
index d19e3c5..3d1cf5c 100644
--- a/src/CudaMatrix.cpp
+++ b/src/CudaMatrix.cpp
@@ -1,3 +1,7 @@
+#include "AuroraDefs.h"
+#include "Function1D.cuh"
+#include <complex>
+#include <utility>
 #ifdef USE_CUDA
 #include "CudaMatrix.h"
 
@@ -5,10 +9,19 @@
 #include "Matrix.h"
 
 #include <iostream>
+#include <string>
+
 #include <cstddef>
 #include <cuda_runtime.h>
 #include "CudaMatrixPrivate.cuh"
-
+namespace {
+    std::string getMatrixShapeStr(const Aurora::CudaMatrix &aMatrix) {
+    std::string ret;
+    return "(" + std::to_string(aMatrix.getDimSize(0)) + "," +
+            std::to_string(aMatrix.getDimSize(1)) + "," +
+            std::to_string(aMatrix.getDimSize(1)) + "," + ")";
+    }
+}
 namespace Aurora{
 
 CudaMatrix::CudaMatrix(std::shared_ptr<float> aData, std::vector<int> aInfo, ValueType aValueType)
@@ -145,6 +158,15 @@ CudaMatrix CudaMatrix::deepCopy() const
     return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
 }
 
+CudaMatrix CudaMatrix::deepCopyShape(bool complex) const
+{
+    float* data = nullptr;
+    ValueType newType = complex==true?Complex:getValueType();
+    unsigned long long size =  getDataSize() * newType;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), newType);
+}
+
 Matrix CudaMatrix::toHostMatrix() const
 {
     if(!mData.get())
@@ -494,419 +516,891 @@ bool CudaMatrix::setBlock(int aDim,int aBeginIndex, int aEndIndex, const CudaMat
         }
     }    
 }
-    //--Add----------------------------------------------------------------
-    CudaMatrix CudaMatrix::operator+(float aScalar) const{
-        if (isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        unaryAdd(getData(),aScalar,out.getData(),getDataSize());
-        return out;
-    }
-
-    CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        unaryAdd(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
-        return out;
-    }
-
-    CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-            return  aMatrix;
-        }
-        unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-
-    CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-            return  aMatrix;
-        }
-        unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-
-    CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const{
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        unaryAdd(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());
-        return out;
-    }
-
-    CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const{
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        unaryAdd(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
-        return aMatrix;
-    }
-
-
-    CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
-        if (aOther.getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
-            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (aOther.isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        unaryAdd(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
-        return aMatrix;
-    }
-
-    //--mul-----------------------------------------------------
-    CudaMatrix CudaMatrix::operator*(float aScalar) const{
-        if (isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        unaryMul(getData(),aScalar,out.getData(),getDataSize());
-        return out;
-    }
-    CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator*(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        unaryMul(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
-        return out;
-    }
-    CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator*(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
-            return  aMatrix;
-        }
-        unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-    CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
-            return  aMatrix;
-        }
-        unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-    CudaMatrix CudaMatrix::operator*(const CudaMatrix &aMatrix) const{
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        if (isComplex()){
-            unaryMulc(aMatrix.getData(),getData(),data,aMatrix.getDataSize());
-        }
-        else{
-            unaryMul(aMatrix.getData(),getData(),data,aMatrix.getDataSize());
-        } 
-        return out;
-    }
-    CudaMatrix CudaMatrix::operator*(CudaMatrix &&aMatrix) const{
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (isComplex()){
-            unaryMulc(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
-        }
-        else{
-            unaryMul(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
-        } 
-        return aMatrix;
-    }
-    CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther){
-        if (aOther.getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
-            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        if (aOther.isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (aMatrix.isComplex()){
-            unaryMulc(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
-        }
-        else{
-            unaryMul(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
-        } 
-        return aMatrix;
-    }
-    
-    //--Sub-----------------------------------------------------------------
-    CudaMatrix CudaMatrix::operator-(float aScalar) const{
-        if (isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator-(float aScalar)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        unarySub(getData(),aScalar,out.getData(),getDataSize());
+//--Add----------------------------------------------------------------
+CudaMatrix CudaMatrix::operator+(float aScalar) const{
+    if (isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return (*this)+v;
+    } else {
+        auto out = deepCopyShape();
+        unaryAdd(getData(), aScalar, out.getData(), getDataSize());
         return out;
     }
     
-    CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator-(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        unarySub(aScalar,aMatrix.getData(),out.getData(),aMatrix.getDataSize());
-        return out;
-    }
+}
 
-    CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator-(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
-            return  aMatrix;
-        }
-        unarySub(aScalar,aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-    CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator-(CudaMatrix &&aMatrix, float aScalar)"<<std::endl;
-            return  aMatrix;
-        }
-        unarySub(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-    CudaMatrix CudaMatrix::operator-(const CudaMatrix &aMatrix) const{
-        if (aMatrix.isComplex()!=this->isComplex())
-        {
-            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        unarySub(this->getData(),aMatrix.getData(),out.getData(),aMatrix.getDataSize());
-        return out;
-    }
-    CudaMatrix CudaMatrix::operator-(CudaMatrix &&aMatrix) const{
-        if (aMatrix.isComplex()!=this->isComplex())
-        {
-            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        unarySub(this->getData(),aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }
-    CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther){
-        if (aMatrix.isComplex()!=aOther.isComplex())
-        {
-            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aOther.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (aOther.getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
-            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        unarySub(aMatrix.getData(),aOther.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
-    }  
+CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix) {
+    return aMatrix + aScalar;
+}
 
-    // div
-    CudaMatrix CudaMatrix::operator/(float aScalar) const{
-        if (isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator/(float aScalar)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  getDataSize() * getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-        unaryDiv(getData(),aScalar,out.getData(),getDataSize());
+CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix) {
+    return std::forward<CudaMatrix &&>(aMatrix) + aScalar;
+}
+
+CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar) {
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return std::forward<CudaMatrix &&>(aMatrix)+v;
+    } else {
+        unaryAdd(aMatrix.getData(), aScalar, aMatrix.getData(),
+                     aMatrix.getDataSize());
+        return aMatrix;
+    }
+    
+}
+
+CudaMatrix CudaMatrix::operator+(std::complex<float> aScalar) const{
+    
+    if (isComplex()) {
+        auto out = deepCopyShape();
+        unaryAddc((std::complex<float>*)getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(*this);
+        unaryAddc((std::complex<float>*)out.getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
         return out;
     }
-    CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator/(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
-            return  CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        unaryDiv(aScalar,aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+}
+
+CudaMatrix operator+(std::complex<float> aScalar, const CudaMatrix &aMatrix) {
+    return aMatrix + aScalar;
+}
+
+CudaMatrix &operator+(std::complex<float> aScalar, CudaMatrix &&aMatrix) {
+    return std::forward<CudaMatrix &&>(aMatrix) + aScalar;
+}
+
+CudaMatrix &operator+(CudaMatrix &&aMatrix, std::complex<float> aScalar) {
+    if (aMatrix.isComplex()) {
+        unaryAddc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unaryAddc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const {
+    auto thisMatrix = *this;
+    if (thisMatrix.isScalar()) {
+        return thisMatrix.getScalar() + aMatrix;
+    }
+    if (aMatrix.isScalar()) {
+        return thisMatrix + aMatrix.getScalar();
+    }
+    if (thisMatrix.compareShape(aMatrix)) {
+        //输出需考虑是否需要装变为complex
+        auto out = thisMatrix.deepCopyShape(aMatrix.isComplex()||thisMatrix.isComplex());
+        unaryAddmm(aMatrix.getData(), thisMatrix.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, thisMatrix.mValueType);
         return out;
     }
-    CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator/(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
-            return  aMatrix;
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != thisMatrix.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : (thisMatrix);
+        auto mat = aMatrix.isVector() ? (thisMatrix) : aMatrix;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryAddmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryAddmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M + M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(thisMatrix)<< std::endl;
         }
-        unaryDiv(aScalar,aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
     }
-    CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar){
-        if (aMatrix.isComplex())
-        {
-            std::cerr<<"Complex matrix not support operator/(CudaMatrix &&aMatrix, float aScalar)"<<std::endl;
-            return  aMatrix;
-        }
-        unaryDiv(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-        return aMatrix;
+    return CudaMatrix();
+}
+
+CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const {
+    return std::forward<CudaMatrix&&>(aMatrix)+ *this;
+}
+
+CudaMatrix operator+(CudaMatrix &&aMatrix, CudaMatrix &aOther) {
+    if (aOther.isScalar()) {
+        return aOther.getScalar() + std::forward<CudaMatrix &&>(aMatrix);
     }
-    CudaMatrix CudaMatrix::operator/(const CudaMatrix &aMatrix) const{
-        if (aMatrix.isComplex()!=this->isComplex())
-        {
-            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
-        }
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
-        }
-        float* data = nullptr;
-        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-        cudaMalloc((void**)&data, sizeof(float) * size);
-        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-        if (isComplex()){
-            unaryDivc(getData(),aMatrix.getData(),data,aMatrix.getDataSize());
-        }
-        else{
-            unaryDiv(getData(),aMatrix.getData(),data,aMatrix.getDataSize());
-        } 
+    if (aMatrix.isScalar()) {
+        return aOther + aMatrix.getScalar();
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unaryAddmm(aMatrix.getData(), aOther.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, aOther.mValueType);
         return out;
     }
-    CudaMatrix CudaMatrix::operator/(CudaMatrix &&aMatrix) const{
-        if (aMatrix.isComplex()!=this->isComplex())
-        {
-            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryAddmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row);
+            return out;
         }
-        if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-            return CudaMatrix();
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryAddmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType);
+            return out;
         }
-        if (aMatrix.isComplex()){
-            unaryDivc(this->getData(),aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M + M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
         }
-        else{
-            unaryDiv(this->getData(),aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        }  
+    }
+    return CudaMatrix();
+}
+
+//--mul-----------------------------------------------------
+CudaMatrix CudaMatrix::operator*(float aScalar) const{
+    if (isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return (*this)*v;
+    } else {
+        auto out = deepCopyShape();
+        unaryMul(getData(), aScalar, out.getData(), getDataSize());
+        return out;
+    }
+    
+}
+
+CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix) {
+    return aMatrix * aScalar;
+}
+
+CudaMatrix &operator*(float aScalar, CudaMatrix &&aMatrix) {
+    return std::forward<CudaMatrix &&>(aMatrix) * aScalar;
+}
+
+CudaMatrix &operator*(CudaMatrix &&aMatrix, float aScalar) {
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return std::forward<CudaMatrix &&>(aMatrix)*v;
+    } else {
+        unaryMul(aMatrix.getData(), aScalar, aMatrix.getData(),
+                     aMatrix.getDataSize());
         return aMatrix;
     }
-    CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther){
-        if (aMatrix.isComplex()!=aOther.isComplex())
-        {
-            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
-            <<" and the matrix1 type is "<<(aOther.isComplex()?"Complex":"Real")<<std::endl;
-            return CudaMatrix();
+    
+}
+
+CudaMatrix CudaMatrix::operator*(std::complex<float> aScalar) const{
+    
+    if (isComplex()) {
+        auto out = deepCopyShape();
+        unaryMulc((std::complex<float>*)getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(*this);
+        unaryMulc((std::complex<float>*)out.getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    }
+}
+
+CudaMatrix operator*(std::complex<float> aScalar, const CudaMatrix &aMatrix) {
+    return aMatrix * aScalar;
+}
+
+CudaMatrix &operator*(std::complex<float> aScalar, CudaMatrix &&aMatrix) {
+    return std::forward<CudaMatrix &&>(aMatrix) * aScalar;
+}
+
+CudaMatrix &operator*(CudaMatrix &&aMatrix, std::complex<float> aScalar) {
+    if (aMatrix.isComplex()) {
+        unaryMulc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unaryMulc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix CudaMatrix::operator*(const CudaMatrix &aMatrix) const {
+    auto thisMatrix = *this;
+    if (thisMatrix.isScalar()) {
+        return thisMatrix.getScalar() * aMatrix;
+    }
+    if (aMatrix.isScalar()) {
+        return thisMatrix * aMatrix.getScalar();
+    }
+    if (thisMatrix.compareShape(aMatrix)) {
+        //输出需考虑是否需要装变为complex
+        auto out = thisMatrix.deepCopyShape(aMatrix.isComplex()||thisMatrix.isComplex());
+        unaryMulmm(aMatrix.getData(), thisMatrix.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, thisMatrix.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != thisMatrix.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : (thisMatrix);
+        auto mat = aMatrix.isVector() ? (thisMatrix) : aMatrix;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryMulmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row);
+            return out;
         }
-        if (aOther.getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
-            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
-            return CudaMatrix();
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryMulmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType);
+            return out;
         }
-        if (aMatrix.isComplex()){
-            unaryDivc(aMatrix.getData(),aOther.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M * M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(thisMatrix)<< std::endl;
         }
-        else{
-            unaryDiv(aMatrix.getData(),aOther.getData(),aMatrix.getData(),aMatrix.getDataSize());
-        }  
-        
+    }
+    return CudaMatrix();
+}
+
+CudaMatrix CudaMatrix::operator*(CudaMatrix &&aMatrix) const {
+    return std::forward<CudaMatrix&&>(aMatrix)* (*this);
+}
+
+CudaMatrix operator*(CudaMatrix &&aMatrix, CudaMatrix &aOther) {
+    if (aOther.isScalar()) {
+        return aOther.getScalar() * std::forward<CudaMatrix &&>(aMatrix);
+    }
+    if (aMatrix.isScalar()) {
+        return aOther * aMatrix.getScalar();
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unaryMulmm(aMatrix.getData(), aOther.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, aOther.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryMulmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryMulmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M * M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+    
+//--Sub-----------------------------------------------------------------
+CudaMatrix CudaMatrix::operator-(float aScalar) const{
+    if (isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return (*this)-v;
+    } else {
+        auto out = deepCopyShape();
+        unarySub(getData(), aScalar, out.getData(), getDataSize());
+        return out;
+    }
+    
+}
+
+CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix) {
+    auto out = aMatrix.deepCopyShape();
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        unarySubc(v, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)out.getData(),
+                  aMatrix.getDataSize());
+        return out;
+    } else {
+        unarySub(aScalar, out.getData(),out.getData(),
+                  out.getDataSize());  
+        return out;
+    }
+}
+
+CudaMatrix &operator-(float aScalar, CudaMatrix &&aMatrix) {
+    if (aMatrix.isComplex()) {
+        unarySubc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        unarySub(aScalar,aMatrix.getData(),aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix &operator-(CudaMatrix &&aMatrix, float aScalar) {
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return std::forward<CudaMatrix &&>(aMatrix)-v;
+    } else {
+        unarySub(aMatrix.getData(), aScalar, aMatrix.getData(),
+                     aMatrix.getDataSize());
         return aMatrix;
     }
+    
+}
+
+CudaMatrix CudaMatrix::operator-(std::complex<float> aScalar) const{
+    
+    if (isComplex()) {
+        auto out = deepCopyShape();
+        unarySubc((std::complex<float>*)getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(*this);
+        unarySubc((std::complex<float>*)out.getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    }
+}
+
+CudaMatrix operator-(std::complex<float> aScalar, const CudaMatrix &aMatrix) {
+    if (aMatrix.isComplex()) {
+        auto out = aMatrix.deepCopyShape();
+        unarySubc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)out.getData(),
+                  aMatrix.getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(aMatrix);
+        unarySubc(aScalar, (std::complex<float> *)out.getData(),
+                  (std::complex<float> *)out.getData(),
+                  out.getDataSize());  
+        return out;
+    }
+}
+
+CudaMatrix &operator-(std::complex<float> aScalar, CudaMatrix &&aMatrix) {
+    if (aMatrix.isComplex()) {
+        unarySubc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unarySubc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix &operator-(CudaMatrix &&aMatrix, std::complex<float> aScalar) {
+    if (aMatrix.isComplex()) {
+        unarySubc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unarySubc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix CudaMatrix::operator-(const CudaMatrix &aMatrix) const {
+    auto thisMatrix = *this;
+    if (thisMatrix.isScalar()) {
+        return thisMatrix.getScalar() - aMatrix;
+    }
+    if (aMatrix.isScalar()) {
+        return thisMatrix - aMatrix.getScalar();
+    }
+    if (thisMatrix.compareShape(aMatrix)) {
+        //输出需考虑是否需要装变为complex
+        auto out = thisMatrix.deepCopyShape(aMatrix.isComplex()||thisMatrix.isComplex());
+        unarySubmm(thisMatrix.getData(), aMatrix.getData(), out.getData(),
+                   out.getDataSize(), thisMatrix.mValueType, aMatrix.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != thisMatrix.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : (thisMatrix);
+        auto mat = aMatrix.isVector() ? (thisMatrix) : aMatrix;
+        int direction = thisMatrix.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row, direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Column, direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(thisMatrix)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+
+CudaMatrix CudaMatrix::operator-(CudaMatrix &&aMatrix) const {
+    auto aOther = *this;
+    if (isScalar()) {
+        return getScalar() - std::forward<CudaMatrix &&>(aMatrix);
+    }
+    if (aMatrix.isScalar()) {
+        return aOther - aMatrix.getScalar();
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unarySubmm(aOther.getData(), aMatrix.getData(), out.getData(),
+                   out.getDataSize(), aOther.mValueType, aMatrix.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        int direction = aOther.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row,direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType,Column,direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+
+CudaMatrix operator-(CudaMatrix &&aMatrix, CudaMatrix &aOther) {
+    if (aOther.isScalar()) {
+        return std::forward<CudaMatrix &&>(aMatrix) - aOther.getScalar();
+    }
+    if (aMatrix.isScalar()) {
+        return aMatrix.getScalar() - aOther;
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unarySubmm(aMatrix.getData(), aOther.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, aOther.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        int direction = aMatrix.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row,direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unarySubmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType,Column,direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+
+// div
+CudaMatrix CudaMatrix::operator/(float aScalar) const{
+    if (isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return (*this)/v;
+    } else {
+        auto out = deepCopyShape();
+        unaryDiv(getData(), aScalar, out.getData(), getDataSize());
+        return out;
+    }
+    
+}
+
+CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix) {
+    auto out = aMatrix.deepCopyShape();
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        unaryDivc(v, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)out.getData(),
+                  aMatrix.getDataSize());
+        return out;
+    } else {
+        unaryDiv(aScalar, out.getData(),out.getData(),
+                  out.getDataSize());  
+        return out;
+    }
+}
+
+CudaMatrix &operator/(float aScalar, CudaMatrix &&aMatrix) {
+    if (aMatrix.isComplex()) {
+        unaryDivc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        unaryDiv(aScalar,aMatrix.getData(),aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix &operator/(CudaMatrix &&aMatrix, float aScalar) {
+    if (aMatrix.isComplex()) {
+        std::complex<float> v(aScalar,0);
+        return std::forward<CudaMatrix &&>(aMatrix)/v;
+    } else {
+        unaryDiv(aMatrix.getData(), aScalar, aMatrix.getData(),
+                     aMatrix.getDataSize());
+        return aMatrix;
+    }
+    
+}
+
+CudaMatrix CudaMatrix::operator/(std::complex<float> aScalar) const{
+    
+    if (isComplex()) {
+        auto out = deepCopyShape();
+        unaryDivc((std::complex<float>*)getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(*this);
+        unaryDivc((std::complex<float>*)out.getData(), aScalar, (std::complex<float>*)out.getData(), getDataSize());
+        return out;
+    }
+}
+
+CudaMatrix operator/(std::complex<float> aScalar, const CudaMatrix &aMatrix) {
+    if (aMatrix.isComplex()) {
+        auto out = aMatrix.deepCopyShape();
+        unaryDivc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)out.getData(),
+                  aMatrix.getDataSize());
+        return out;
+    } else {
+        auto out = Aurora::complex(aMatrix);
+        unaryDivc(aScalar, (std::complex<float> *)out.getData(),
+                  (std::complex<float> *)out.getData(),
+                  out.getDataSize());  
+        return out;
+    }
+}
+
+CudaMatrix &operator/(std::complex<float> aScalar, CudaMatrix &&aMatrix) {
+    if (aMatrix.isComplex()) {
+        unaryDivc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unaryDivc(aScalar, (std::complex<float> *)aMatrix.getData(),
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix &operator/(CudaMatrix &&aMatrix, std::complex<float> aScalar) {
+    if (aMatrix.isComplex()) {
+        unaryDivc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());
+    } else {
+        aMatrix = Aurora::complex(aMatrix);
+        unaryDivc((std::complex<float> *)aMatrix.getData(), aScalar,
+                  (std::complex<float> *)aMatrix.getData(),
+                  aMatrix.getDataSize());  
+    }
+    return aMatrix;
+}
+
+CudaMatrix CudaMatrix::operator/(const CudaMatrix &aMatrix) const {
+    auto thisMatrix = *this;
+    if (thisMatrix.isScalar()) {
+        return thisMatrix.getScalar() / aMatrix;
+    }
+    if (aMatrix.isScalar()) {
+        return thisMatrix / aMatrix.getScalar();
+    }
+    if (thisMatrix.compareShape(aMatrix)) {
+        //输出需考虑是否需要装变为complex
+        auto out = thisMatrix.deepCopyShape(aMatrix.isComplex()||thisMatrix.isComplex());
+        unaryDivmm(thisMatrix.getData(), aMatrix.getData(), out.getData(),
+                   out.getDataSize(), thisMatrix.mValueType, aMatrix.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != thisMatrix.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : (thisMatrix);
+        auto mat = aMatrix.isVector() ? (thisMatrix) : aMatrix;
+        int direction = thisMatrix.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row, direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            //输出需考虑是否需要装变为complex
+            auto out =  mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Column, direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(thisMatrix)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+
+CudaMatrix CudaMatrix::operator/(CudaMatrix &&aMatrix) const {
+    auto aOther = *this;
+    if (isScalar()) {
+        return getScalar() / std::forward<CudaMatrix &&>(aMatrix);
+    }
+    if (aMatrix.isScalar()) {
+        return aOther / aMatrix.getScalar();
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unaryDivmm(aOther.getData(), aMatrix.getData(), out.getData(),
+                   out.getDataSize(), aOther.mValueType, aMatrix.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        int direction = aOther.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row,direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType,Column,direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
+
+CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther) {
+    if (aOther.isScalar()) {
+        return std::forward<CudaMatrix &&>(aMatrix) / aOther.getScalar();
+    }
+    if (aMatrix.isScalar()) {
+        return aMatrix.getScalar() / aOther;
+    }
+    if (aOther.compareShape(aMatrix)) {
+        //aMatrix的数据类型与aOther一致或者体积比aOther大时，仍旧用CudaMatrix &&aMatrix作为输出
+        auto out = aMatrix.mValueType >= aOther.mValueType? aMatrix : aOther.deepCopyShape();
+        unaryDivmm(aMatrix.getData(), aOther.getData(), out.getData(),
+                   out.getDataSize(), aMatrix.mValueType, aOther.mValueType);
+        return out;
+    }
+    // 一个是向量一个是矩阵
+    else if (aMatrix.isVector() != aOther.isVector()) {
+        auto vec = aMatrix.isVector() ? aMatrix : aOther;
+        auto mat = aMatrix.isVector() ? aOther : aMatrix;
+        int direction = aMatrix.isVector()?1:0;
+        // 行向量
+        if (vec.getDimSize(0) == 1 && vec.getDimSize(1) == mat.getDimSize(1)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType, Aurora::Row,direction);
+            return out;
+        }
+        // 列向量
+        else if (vec.getDimSize(1) == 1 &&
+                 vec.getDimSize(0) == mat.getDimSize(0)) {
+            // 判定aMatrix是矩阵，且数据类型与输出结果一致时，继续使用CudaMatrix
+            // &&aMatrix作为输出
+            auto out = mat.deepCopyShape(vec.isComplex() || mat.isComplex());
+            unaryDivmv(mat.getData(), vec.getData(), out.getData(),
+                       out.getDataSize(), vec.getDataSize(), mat.mValueType,
+                       vec.mValueType,Column,direction);
+            return out;
+        }
+        // 尺寸不匹配
+        else {
+            std::cerr
+                << "M - M fail, matrix shape not match the operation rule!"
+                << "with matrix1 shape:"
+                << getMatrixShapeStr(aMatrix)
+                << " and matrix2 shape:"
+                << getMatrixShapeStr(aOther)<< std::endl;
+        }
+    }
+    return CudaMatrix();
+}
 
     // -----------pow-------------------------------------------------------
     CudaMatrix CudaMatrix::operator^(int times) const{
diff --git a/src/CudaMatrix.h b/src/CudaMatrix.h
index 32a1fbb..73b08b5 100644
--- a/src/CudaMatrix.h
+++ b/src/CudaMatrix.h
@@ -43,11 +43,23 @@ namespace Aurora
          */
         CudaMatrix deepCopy() const;
 
+        /**
+         * @brief 深拷贝操作,但不复制数据
+         * 
+         * @return CudaMatrix 
+         */
+        CudaMatrix deepCopyShape(bool complex = false) const;
+
+
         // add
         CudaMatrix operator+(float aScalar) const;
         friend CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix);
         friend CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix);
         friend CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator+(std::complex<float> aScalar) const;
+        friend CudaMatrix operator+(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator+(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator+(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
         CudaMatrix operator+(const CudaMatrix &aMatrix) const;
         CudaMatrix operator+(CudaMatrix &&aMatrix) const;
         friend CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -57,6 +69,10 @@ namespace Aurora
         friend CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix);
         friend CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix);
         friend CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator-(std::complex<float> aScalar) const;
+        friend CudaMatrix operator-(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator-(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator-(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
         CudaMatrix operator-(const CudaMatrix &aMatrix) const;
         CudaMatrix operator-(CudaMatrix &&aMatrix) const;
         friend CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -70,6 +86,10 @@ namespace Aurora
         friend CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix);
         friend CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix);
         friend CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator*(std::complex<float> aScalar) const;
+        friend CudaMatrix operator*(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator*(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator*(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
         CudaMatrix operator*(const CudaMatrix &aMatrix) const;
         CudaMatrix operator*(CudaMatrix &&aMatrix) const;
         friend CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -79,9 +99,13 @@ namespace Aurora
         friend CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix);
         friend CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix);
         friend CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator/(std::complex<float> aScalar) const;
+        friend CudaMatrix operator/(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator/(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator/(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
         CudaMatrix operator/(const CudaMatrix &aMatrix) const;
         CudaMatrix operator/(CudaMatrix &&aMatrix) const;
-        friend CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther);
+        friend CudaMatrix operator/(CudaMatrix &&aMatrix,CudaMatrix &aOther);
 
         // pow
         CudaMatrix operator^(int times) const;
diff --git a/src/CudaMatrixPrivate.cu b/src/CudaMatrixPrivate.cu
index 1dbf25e..e5c4d95 100644
--- a/src/CudaMatrixPrivate.cu
+++ b/src/CudaMatrixPrivate.cu
@@ -4,7 +4,9 @@
 #include <thrust/complex.h>
 #include <thrust/functional.h>
 #include <thrust/execution_policy.h>
+
 #include "AuroraDefs.h"
+#include "AuroraThrustIterator.cuh"
 using namespace thrust::placeholders;
 
 struct PowOp: public thrust::unary_function<float, float>{
@@ -127,36 +129,224 @@ struct CompareANEOp{
     }
 };
 
+typedef thrust::complex<float> complexf;
 
-void unaryAdd(float* in1, float* in2, float* out, unsigned long  length)
+
+
+void unaryAddmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+              unsigned long aLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2)
 {
-    thrust::plus<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
+    if (aValType1 == aValType2)
+    {
+        thrust::plus<float> op;
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()+y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()+y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn2,(complexf*)aMatrixIn2+aLength,aMatrixIn1,(complexf*)aMatrixOut,lambda);
+    }
+
 }
 
-void unaryAdd(float* in1, const float& in2, float* out, unsigned long  length)
+void unaryAddmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2,
+              Aurora::FunctionDirection aVectype)
 {
-    thrust::transform(thrust::device,in1,in1+length,out,in2 + _1);
+    if (aVectype == Aurora::Column){
+        if(aValType1 == aValType2){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            thrust::plus<float> op;
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,op);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()+y,x.imag());
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()+x,y.imag());
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            thrust::plus<float> op;
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,op);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()+y,x.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()+x,y.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
 }
 
-void unaryMul(float* in1, float* in2, float* out, unsigned long  length)
+void unaryAdd(float* aMatrixIn, const float& aVectorIn, float* aMatrixOut, unsigned long  aLength)
 {
-    thrust::multiplies<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
+    thrust::transform(thrust::device,aMatrixIn,aMatrixIn+aLength,aMatrixOut,aVectorIn + _1);
 }
 
-void unaryMulc(float* in1, float* in2, float* out, unsigned long  length)
+void unaryAddc(std::complex<float>* aMatrixIn, const std::complex<float>& aScalarIn, 
+                std::complex<float>* aMatrixOut, unsigned long  aLength)
 {
-    thrust::complex<float>* _in1 = (thrust::complex<float>*)in1;
-    thrust::complex<float>* _in2 = (thrust::complex<float>*)in2; 
-    thrust::complex<float>* _out = (thrust::complex<float>*)out;;
-    thrust::multiplies<thrust::complex<float>> op;
-    thrust::transform(thrust::device,_in1,_in1+length,_in2,_out,op);
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device,(complexf*)aMatrixIn,
+                    (complexf*)aMatrixIn + aLength,
+                    (complexf*)aMatrixOut, value + _1);
 }
 
-void unaryMul(float* in1, const float& in2, float* out, unsigned long  length)
+void unaryMul(float* aMatrixIn, const float& aScalarIn, float* aMatrixOut, unsigned long  aLength)
 {
-    thrust::transform(thrust::device,in1, in1+length, out, in2 * _1);
+    thrust::transform(thrust::device,aMatrixIn,aMatrixIn+aLength,aMatrixOut,aScalarIn * _1);
+}
+
+void unaryMulc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength) 
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value * _1);
+}
+
+void unaryMulmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+              unsigned long aLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2)
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::multiplies<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::multiplies<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()*y,x.imag()*y);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()*y,x.imag()*y);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn2,(complexf*)aMatrixIn2+aLength,aMatrixIn1,(complexf*)aMatrixOut,lambda);
+    }
+
+}
+
+void unaryMulmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2,
+              Aurora::FunctionDirection aVectype)
+{
+    if (aVectype == Aurora::Column){
+        if(aValType1 == aValType2){
+            if(aValType1 == Aurora::Normal){
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                thrust::multiplies<float> op;
+                thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,op);
+            }
+            else{
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                thrust::multiplies<complexf> op;
+                thrust::transform(thrust::device, (complexf*)aMatrixIn1,
+                            (complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,
+                            (complexf*)aMatrixOut,op);
+            }
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()*y,x.imag()*y);
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()*x,y.imag()*x);
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            if(aValType1 == Aurora::Normal){
+                Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                thrust::multiplies<float> op;
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,op);
+            }
+            else{
+                Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                thrust::multiplies<complexf> op;
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,op);
+            }
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()*y,x.imag()*y);
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()*x,y.imag()*x);
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
 }
 
 void unaryNeg(float* in1, float* out, unsigned long length){
@@ -164,42 +354,280 @@ void unaryNeg(float* in1, float* out, unsigned long length){
     thrust::transform(thrust::device,in1,in1+length,out,op);
 }
 
-void unarySub(float* in1, float* in2, float* out, unsigned long length){
-    thrust::minus<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
-}
-
-void unaryDiv(float* in1, float* in2, float* out, unsigned long length){
-    thrust::divides<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
-}
-
-void unaryDivc(float* in1, float* in2, float* out, unsigned long  length)
-{
-    thrust::complex<float>* _in1 = (thrust::complex<float>*)in1;
-    thrust::complex<float>* _in2 = (thrust::complex<float>*)in2; 
-    thrust::complex<float>* _out = (thrust::complex<float>*)out;;
-    thrust::divides<thrust::complex<float>> op;
-    thrust::transform(thrust::device,_in1,_in1+length,_in2,_out,op);
-}
-
-void unarySub(const float& in1, float* in2, float* out, unsigned long length){
-    thrust::transform(thrust::device,in2,in2+length,out,in1-_1);
-}
-
-void unaryDiv(const float& in1, float* in2, float* out, unsigned long length){
-    thrust::transform(thrust::device,in2,in2+length,out,in1/_1);
-
-}
-
 void unarySub(float* in1, const float& in2, float* out, unsigned long length){
     thrust::transform(thrust::device,in1,in1+length,out,_1-in2);
 }
 
+void unarySub(const float& in2, float* in1, float* out, unsigned long length){
+    thrust::transform(thrust::device,in1,in1+length,out,in2-_1);
+}
+
+void unarySubc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      _1 - value);
+}
+
+void unarySubc(const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value - _1);
+}
+
+void unarySubmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1,
+                Aurora::ValueType aValType2 )
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::minus<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::minus<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()-y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const float& x, const complexf& y){
+            return complexf(x-y.real(),0-y.imag());
+        };
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength,(complexf*)aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+}
+
+void unarySubmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype,int direction)
+{
+    if (aVectype == Aurora::Column){
+        //类型一样时，逐元素处理即可
+        if(aValType1 == aValType2){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const float& y){
+                return direction==0?x-y:y-x;
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,lambda);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return  direction==0?complexf(x.real()-y,x.imag()):complexf(y-x.real(),-x.imag());
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return  direction==0?complexf(x-y.real(),-y.imag()):complexf(y.real()-x,y.imag());
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const float& y){
+                return direction==0?x-y:y-x;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,lambda);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return  direction==0?complexf(x.real()-y,x.imag()):complexf(y-x.real(),-x.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return  direction==0?complexf(x-y.real(),-y.imag()):complexf(y.real()-x,y.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+}
+
 void unaryDiv(float* in1, const float& in2, float* out, unsigned long length){
     thrust::transform(thrust::device,in1,in1+length,out,_1/in2);
 }
 
+void unaryDiv(const float& in2, float* in1, float* out, unsigned long length){
+    thrust::transform(thrust::device,in1,in1+length,out,in2/_1);
+}
+
+void unaryDivc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      _1 / value);
+}
+
+void unaryDivc(const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value / _1);
+}
+
+void unaryDivmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1,
+                Aurora::ValueType aValType2 )
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::divides<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::divides<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return x/complexf(y,0);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const float& x, const complexf& y){
+            return complexf(x,0)/y;
+        };
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength,(complexf*)aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+}
+
+void unaryDivmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype,int direction)
+{
+    if (aVectype == Aurora::Column){
+        //类型一样时，逐元素处理即可
+        if(aValType1 == aValType2){
+            if (aValType1==Aurora::Normal){
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const float& x, const float& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,aMatrixOut,lambda);
+            }
+            else{
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const complexf& x, const complexf& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, (complexf*)aMatrixIn1,
+                    (complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+            }
+
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                complexf v (y,0);
+                return  direction==0?x/v:v/x;
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                complexf v (x,0);
+                return  direction==0?v/y:y/v;
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            if (aValType1 == Aurora::Normal)
+            {
+                Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const float& x, const float& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,lambda);
+            }
+            else{
+                Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const complexf& x, const complexf& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda); 
+            }
+
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                complexf v (y,0);
+                return  direction==0?x/v:v/x;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                complexf v (x,0);
+                return  direction==0?v/y:y/v;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+}
+
+
+
 void unaryPow(float* in1, float N,float* out, unsigned long length){
     if (N == 0.0f)
     {
@@ -305,3 +733,4 @@ void thrustFill(float* aBegin, float* aEnd, std::complex<float> aValue)
     thrust::fill(thrust::device, (std::complex<float>*)aBegin, (std::complex<float>*)aEnd, aValue);
 }
 
+
diff --git a/src/CudaMatrixPrivate.cuh b/src/CudaMatrixPrivate.cuh
index 89b0fdc..6577a66 100644
--- a/src/CudaMatrixPrivate.cuh
+++ b/src/CudaMatrixPrivate.cuh
@@ -3,32 +3,290 @@
 #define __CUDAMATRIX_CUH__
 
 #include <complex>
+#include "AuroraDefs.h"
 namespace{
     enum CompareType
     {
         G,GE,E,NE,LE,L
     };
+    enum VecType
+    {
+        COL,ROW
+    };
 }
-void unaryAdd(float* in1, float* in2, float* out, unsigned long length);
-void unaryAdd(float* in1, const float& in2, float* out, unsigned long length);
-void unaryMul(float* in1, float* in2, float* out, unsigned long length);
-void unaryMulc(float* in1, float* in2, float* out, unsigned long length);
 
-void unaryMul(float* in1, const float& in2, float* out, unsigned long length);
+/**
+ * @brief 矩阵逐元素加一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryAdd(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素加一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryAddc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相加
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryAddmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相加
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ */
+void unaryAddmv(float *aMatrixIn, float *aVectorIn, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aMatValType, Aurora::ValueType aVecValType, 
+              Aurora::FunctionDirection aFuncType = Aurora::FunctionDirection::Column);
+
+/**
+ * @brief 矩阵逐元素乘一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryMul(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素乘一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryMulc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相乘
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryMulmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相乘
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ */
+void unaryMulmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column);
+
+/**
+ * @brief 矩阵逐元素减一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySub(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个float值减
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySub(const float &aValue,float *aMatrixIn,  float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素减一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySubc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个complex<float>值减
+ * 
+ * @param aValue complex<float>值
+  * @param aMatrixIn 矩阵数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySubc(const std::complex<float> &aValue,
+                std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相减
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unarySubmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相减
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ * @param direction 减法执行方向, 0 mat - vec, 1 vec- mat
+ */
+void unarySubmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column,
+              int direction = 0);
+
+/**
+ * @brief 矩阵逐元素除以一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDiv(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个float值除
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDiv(const float &aValue,float *aMatrixIn,  float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素除以一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDivc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个complex<float>值除
+ * 
+ * @param aValue complex<float>值
+  * @param aMatrixIn 矩阵数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDivc(const std::complex<float> &aValue,
+                std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相除
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryDivmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相除
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ * @param direction 减法执行方向, 0 mat - vec, 1 vec- mat
+ */
+void unaryDivmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column,
+              int direction = 0);
 
 void unaryNeg(float* in1, float* out, unsigned long length);
 void unaryPow(float* in1, float N,float* out, unsigned long length);
 
-void unarySub(float* in1, float* in2, float* out, unsigned long length);
-void unaryDiv(float* in1, float* in2, float* out, unsigned long length);
-void unaryDivc(float* in1, float* in2, float* out, unsigned long length);
-
-void unarySub(const float& in1, float* in2, float* out, unsigned long length);
-void unaryDiv(const float& in1, float* in2, float* out, unsigned long length);
-void unarySub(float* in1, const float& in2, float* out, unsigned long length);
-void unaryDiv(float* in1, const float& in2, float* out, unsigned long length);
-
-
 void unaryCompare(float* in1, const float& in2, float* out, unsigned long length,int type);
 void unaryCompare(const float& in1, float* in2, float* out, unsigned long length, int type);
 void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type);
diff --git a/src/Matrix.cpp b/src/Matrix.cpp
index b480b58..475cd9d 100644
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -2,8 +2,10 @@
 #include "CudaMatrix.h"
 
 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <mkl_cblas.h>
+#include <mkl_vml_functions.h>
 #include <string>
 #include <cstring>
 #include <iostream>
@@ -16,6 +18,7 @@
 #include "Eigen/src/Core/Map.h"
 #include "Eigen/src/Core/Matrix.h"
 #include "Function.h"
+#include "Function1D.h"
 
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
@@ -130,7 +133,7 @@ namespace Aurora{
         }
         else{
             float zero = 0.0;
-            aFuncD(size, yC+ 1, 2, &zero, 0, output + 1, 2);
+            aFuncD(size,  &zero, 0, yC+ 1, 2, output + 1, 2);
         }
     }
 
@@ -560,7 +563,14 @@ namespace Aurora {
     }
     //operation /
     Matrix Matrix::operator/(float aScalar) const { return operatorMxA(&vsDivI, aScalar, *this);}
-    Matrix operator/(float aScalar, const Matrix &matrix) {return operatorAxM(&vsDivI, aScalar, matrix);}
+    Matrix operator/(float aScalar, const Matrix &aMatrix) {
+        if(!aMatrix.isComplex())return operatorAxM(&vsDivI, aScalar, aMatrix);
+        float *output = malloc(aMatrix.getDataSize(), true);
+        std::complex<float> v(aScalar,0);
+        vcDivI(aMatrix.getDataSize(),  &v, 0,(std::complex<float>*)aMatrix.getData(), 1, (std::complex<float>*)output, 1);
+        return Matrix::New(output, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2),
+                                   aMatrix.getValueType());
+    }
     Matrix Matrix::operator/(const Matrix &aMatrix) const {
         if (isScalar()){
             return getScalar()/aMatrix;
@@ -568,6 +578,9 @@ namespace Aurora {
         if (aMatrix.isScalar()){
             return (*this)/aMatrix.getScalar();
         }
+        if(aMatrix.isComplex()&& !isComplex()){
+            return operatorMxM_RR(&vsDivI,&vcDivI,aMatrix,complex(*this),true);
+        }
         return operatorMxM(vsDivI, vcDivI, *this, aMatrix);
     }
     Matrix &operator/(float aScalar, Matrix &&matrix) {
@@ -583,7 +596,8 @@ namespace Aurora {
         if (aMatrix.isScalar()){
             return (*this)/aMatrix.getScalar();
         }
-        return operatorMxM_RR(&vsDivI,&vcDivI,*this,std::forward<Matrix&&>(aMatrix));
+        Matrix leftMat = (aMatrix.isComplex()&& !isComplex())? complex(*this):(*this);
+        return operatorMxM_RR(&vsDivI,&vcDivI,leftMat,std::forward<Matrix&&>(aMatrix));
     }
 
     Matrix operator/(Matrix &&aMatrix, Matrix &aOther) {
@@ -593,7 +607,12 @@ namespace Aurora {
         if (aMatrix.isScalar()){
             return aMatrix.getScalar()/aOther;
         }
+        if( aOther.isComplex()&& !aMatrix.isComplex())
+        {
+            return operatorMxM_RR(&vsDivI,&vcDivI,aOther,std::forward<Matrix&&>(complex(aMatrix)),true);
+        }
         return operatorMxM_RR(&vsDivI,&vcDivI,aOther,std::forward<Matrix&&>(aMatrix),true);
+     
     }
     //operator ^ (pow)
     Matrix Matrix::operator^(int times) const { return operatorMxA(&vsPowI, times, *this);}
diff --git a/test/CudaMatrix_Test.cpp b/test/CudaMatrix_Test.cpp
index 31dabf6..e84baf8 100644
--- a/test/CudaMatrix_Test.cpp
+++ b/test/CudaMatrix_Test.cpp
@@ -1,4 +1,5 @@
 #include <complex>
+#include <cstdio>
 #include <gtest/gtest.h>
 #include "AuroraDefs.h"
 #include "Matrix.h"
@@ -26,332 +27,2181 @@ protected:
 };
 
 TEST_F(CudaMatrix_Test, MatrixAdd) {
-    auto A = Aurora::zeros(1000,1,1);
-    auto B = Aurora::zeros(1000,1,1);
-    for (size_t i = 0; i < 1000; i++)
     {
-        A[i] = -1;
-        B[i] =  i;
-    }
-    printf("Test CudaMatrix operator+(const CudaMatrix &aMatrix) const \r\n");
-    //CudaMatrix operator+(const CudaMatrix &aMatrix) const
-    auto C = A+B;
-    auto dA = A.toDeviceMatrix();
-    auto dB = B.toDeviceMatrix();
-    auto dC = (dA+dB);
-    auto dhC = dC.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(C[i],dhC[i]);
-    }
-    printf("Test CudaMatrix operator+(float aScalar) const \r\n");
-    //CudaMatrix operator+(float aScalar) const
-    auto D = C+0.5;
-    auto dD = dC+0.5;
-    auto dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix) \r\n");
-    // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
-    dD = 0.5 + dC;
-    dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix) \r\n");
-    // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
-    {
-        auto dD2 = 0.5 + (dA+dB);
-        dhD = dD2.toHostMatrix();
+        auto A = Aurora::zeros(1000,1,1);
+        auto B = Aurora::zeros(1000,1,1);
+        for (size_t i = 0; i < 1000; i++)
+        {
+            A[i] = -1;
+            B[i] =  i;
+        }
+        printf("Test CudaMatrix operator+(const CudaMatrix &aMatrix) const \r\n");
+        //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+        auto C = A+B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        auto dC = (dA+dB);
+        auto dhC = dC.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dhC[i]);
+        }
+        printf("Test CudaMatrix operator+(float aScalar) const \r\n");
+        //CudaMatrix operator+(float aScalar) const
+        auto D = C+0.5;
+        auto dD = dC+0.5;
+        auto dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    printf("Test CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar) \r\n");
-    // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
-    {
-        
-        auto dD2 =  (dA+dB)+0.5;
-        dhD = dD2.toHostMatrix();
+        printf("Test CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix) \r\n");
+        // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+        dD = 0.5 + dC;
+        dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix) const
-    printf("Test CudaMatrix operator+(CudaMatrix &&aMatrix) const \r\n");
-    {
-        auto D = A+C;
-        auto dD2 = dA+(dA+dB);
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        printf("Test CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix) \r\n");
+        // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            auto dD2 = 0.5 + (dA+dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        printf("Test CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar) \r\n");
+        // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+        {
+            
+            auto dD2 =  (dA+dB)+0.5;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+        printf("Test CudaMatrix operator+(CudaMatrix &&aMatrix) const \r\n");
+        {
+            auto D = A+C;
+            auto dD2 = dA+(dA+dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+        printf("Test CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+        {
+            auto D = A+C;
+            auto dD2 = (dA+dB)+dA;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
         }
     }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
-    printf("Test CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
     {
-        auto D = A+C;
-        auto dD2 = (dA+dB)+dA;
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        float *dataA = new float[10000];
+        float *dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            dataA[i] = 8;
+            dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,100,1,Aurora::Complex);
+        auto C = A+B;
+        std::complex<float> scalar(-1,-1);
+        auto dA = A.toDeviceMatrix();
+
+        auto dC = (dA+scalar);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = A.toDeviceMatrix()+scalar;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = scalar+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = A+1;
+        dC = (dA+1);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index"<<i;
+        }
+
+        dC = A.toDeviceMatrix()+1;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = 1+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixAddmm) {
+    //real
+    {
+        auto A = Aurora::zeros(100,100,1);
+        auto B = Aurora::zeros(100,100,1);
+        for (size_t i = 0; i < 10000; i++)
+        {
+            A[i] = -1;
+            B[i] =  8;
+        }
+        auto C = B-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        float *dataA = Aurora::random(100*100*2);
+        float *dataB = Aurora::random(100*100*2);
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,100,1,Aurora::Complex);
+
+        auto C = B+A;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, 
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&,
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, 
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixAddmv) {
+    //real
+    {
+        auto A = Aurora::zeros(100,100,1);
+        auto B = Aurora::ones(100,1,1);
+        auto C = A+1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+
+        //Matrix& + Matrix&, row mode
+        dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataC[i] = -1;
+            if (i<100)dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto C = A+D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        B.forceReshape(1, 50, 1);
+        A.forceReshape(100, 50, 1);
+        dA.forceReshape(100, 50, 1);
+
+        //Matrix& + Matrix&, row mode
+        dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complexM - realv
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            if (i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,1,1);
+
+        auto C = A-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + vec&, col mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec& + Matrix&&, col mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&, col mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        dA.forceReshape(50, 100, 1);
+        A.forceReshape(50, 100, 1);
+        B.forceReshape(1, 100, 1);
+
+        //Matrix& + vec&, row mode
+        dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + vec&, row mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec& + Matrix&&, row mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA+B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&, row mode
+        dC = B.toDeviceMatrix()+dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //realM - complexV
+    {
+        float *dataA = new float[5000];
+        float *dataC = new float[10000];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            if (i<5000)dataA[i] = 8;
+            dataC[i] = -1;
+            if (i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+
+
+        auto C = A+D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        dA.forceReshape(100, 50, 1);
+        A.forceReshape(100, 50, 1);
+        B.forceReshape(1, 50, 1);
+        //Matrix& + Matrix&, row mode
+        dC = (dA+dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()+dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB+A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
 }
 
 TEST_F(CudaMatrix_Test, MatrixMul) {
-    auto A = Aurora::zeros(1000,1,1);
-    auto B = Aurora::zeros(1000,1,1);
-    for (size_t i = 0; i < 1000; i++)
     {
-        A[i] = -1;
-        B[i] =  i;
+        auto A = Aurora::zeros(1000,1,1);
+        auto B = Aurora::zeros(1000,1,1);
+        for (size_t i = 0; i < 1000; i++)
+        {
+            A[i] = -1;
+            B[i] =  i;
+        }
+        printf("Test CudaMatrix operator*(const CudaMatrix &aMatrix) const \r\n");
+        //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+        auto C = A*B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        auto dC = (dA*dB);
+        auto dhC = dC.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dhC[i]);
+        }
+        printf("Test CudaMatrix operator*(float aScalar) const \r\n");
+        //CudaMatrix operator+(float aScalar) const
+        auto D = C*0.5;
+        auto dD = dC*0.5;
+        auto dhD = dD.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+        printf("Test CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix) \r\n");
+        // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+        dD = 0.5 * dC;
+        dhD = dD.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+        printf("Test CudaMatrix &operator*(float aScalar, CudaMatrix &&aMatrix) \r\n");
+        // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
+        {
+            auto dD2 = 0.5 * (dA*dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        printf("Test CudaMatrix &operator*(CudaMatrix &&aMatrix, float aScalar) \r\n");
+        // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+        {
+            
+            auto dD2 =  (dA*dB)*0.5;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+        printf("Test CudaMatrix operator*(CudaMatrix &&aMatrix) const \r\n");
+        {
+            auto D = A*C;
+            auto dD2 = dA*(dA*dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+        printf("Test CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+        {
+            auto D = A*C;
+            auto dD2 = (dA*dB)*dA;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
     }
-    printf("Test CudaMatrix operator*(const CudaMatrix &aMatrix) const \r\n");
-    //CudaMatrix operator*(const CudaMatrix &aMatrix) const
-    auto C = A*B;
-    auto dA = A.toDeviceMatrix();
-    auto dB = B.toDeviceMatrix();
-    auto dC = (dA*dB);
-    auto dhC = dC.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
     {
-        ASSERT_FLOAT_EQ(C[i],dhC[i]);
-    }
-    printf("Test CudaMatrix operator*(float aScalar) const \r\n");
-    //CudaMatrix operator*(float aScalar) const
-    auto D = C*0.5;
-    auto dD = dC*0.5;
-    auto dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
+        float *dataA = new float[100];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 100; i++)
+        {
+            dataA[i] = 8;
+            dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 10,5,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 10,5,1,Aurora::Complex);
+        auto C = A*B;
+        std::complex<float> scalar(-1,-1);
+        auto dA = A.toDeviceMatrix();
 
-    printf("Test CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix) \r\n");
-    // CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix)
-    dD = 0.5 * dC;
-    dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix &operator*(float aScalar, CudaMatrix &&aMatrix) \r\n");
-    // CudaMatrix &operator*(float aScalar, CudaMatrix &&aMatrix)
-    {
-        auto dD2 = 0.5 * (dA*dB);
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        auto dC = (dA*scalar);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = A.toDeviceMatrix()*scalar;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = scalar*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = A*2;
+        dC = (dA*2);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index"<<i;
+        }
+
+        dC = A.toDeviceMatrix()*2;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = 2*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
-    printf("Test CudaMatrix &operator*(CudaMatrix &&aMatrix, float aScalar) \r\n");
-    // CudaMatrix &operator*(CudaMatrix &&aMatrix, float aScalar)
+}
+
+TEST_F(CudaMatrix_Test, MatrixMulmm) {
+    //real
     {
-        
-        auto dD2 =  (dA*dB)*0.5;
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        auto A = Aurora::zeros(100,100,1);
+        auto B = Aurora::zeros(100,100,1);
+        for (size_t i = 0; i < 10000; i++)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            A[i] = -1;
+            B[i] =  8;
+        }
+        auto C = B*-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& * Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& * Matrix&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& * Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
-    //CudaMatrix operator*(CudaMatrix &&aMatrix) const
-    printf("Test CudaMatrix operator*(CudaMatrix &&aMatrix) const \r\n");
+    //complex
     {
-        auto D1 = A*C;
-        auto dD2 = dA*(dA*dB);
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        float *dataA = new float[10000];
+        float *dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
         {
-            ASSERT_FLOAT_EQ(D1[i],dhD[i]);
+            dataA[i] = 8;
+            dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,50,1,Aurora::Complex);
+
+        auto C = B*A;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
-    //CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther)
-    printf("Test CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+}
+
+TEST_F(CudaMatrix_Test, MatrixMulmv) {
+    //real
     {
-        auto D1 = A*C;
-        auto dD2 = (dA*dB)*dA;
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 10000; i++)
         {
-            ASSERT_FLOAT_EQ(D1[i],dhD[i]);
+            dataA[i] = 8;
+            if(i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,100,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,1,1);
+        auto C = A*(-1);
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+
+        //Matrix& + Matrix&, row mode
+        dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataC[i] = -1;
+            if (i<100)dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto C = A*D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        B.forceReshape(1, 50, 1);
+        A.forceReshape(100, 50, 1);
+        dA.forceReshape(100, 50, 1);
+
+        //Matrix& + Matrix&, row mode
+        dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complexM - realv
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            if (i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,1,1);
+
+        auto C = A*(-1);
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + vec&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec& * Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&, col mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        dA.forceReshape(50, 100, 1);
+        A.forceReshape(50, 100, 1);
+        B.forceReshape(1, 100, 1);
+
+        //Matrix& + vec&, row mode
+        dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + vec&, row mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec& + Matrix&&, row mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA*B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&, row mode
+        dC = B.toDeviceMatrix()*dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //realM - complexV
+    {
+        float *dataA = new float[5000];
+        float *dataC = new float[10000];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            if (i<5000)dataA[i] = 8;
+            dataC[i] = -1;
+            if (i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+
+
+        auto C = A*D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, col mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        dA.forceReshape(100, 50, 1);
+        A.forceReshape(100, 50, 1);
+        B.forceReshape(1, 50, 1);
+        //Matrix& + Matrix&, row mode
+        dC = (dA*dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()*dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + Matrix&&, row mode
+        dC = dB*A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
 }
 
 TEST_F(CudaMatrix_Test, MatrixSub) {
-    auto A = Aurora::zeros(1000,1,1);
-    auto B = Aurora::zeros(1000,1,1);
-    for (size_t i = 0; i < 1000; i++)
     {
-        A[i] = -1;
-        B[i] =  i;
-    }
-    printf("Test CudaMatrix operator-(const CudaMatrix &aMatrix) const \r\n");
-    //CudaMatrix operator+(const CudaMatrix &aMatrix) const
-    auto C = A-B;
-    auto dA = A.toDeviceMatrix();
-    auto dB = B.toDeviceMatrix();
-    auto dC = (dA-dB);
-    auto dhC = dC.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(C[i],dhC[i]);
-    }
-    printf("Test CudaMatrix operator-(float aScalar) const \r\n");
-    //CudaMatrix operator+(float aScalar) const
-    auto D = C-0.5;
-    auto dD = dC-0.5;
-    auto dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix) \r\n");
-    // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
-    D = 0.5-C;
-    dD = 0.5 - dC;
-    dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix &operator-(float aScalar, CudaMatrix &&aMatrix) \r\n");
-    // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
-    {
-        auto dD2 = 0.5 - (dA-dB);
-        dhD = dD2.toHostMatrix();
+        auto A = Aurora::zeros(1000,1,1);
+        auto B = Aurora::zeros(1000,1,1);
+        for (size_t i = 0; i < 1000; i++)
+        {
+            A[i] = -1;
+            B[i] =  i;
+        }
+        printf("Test CudaMatrix operator-(const CudaMatrix &aMatrix) const \r\n");
+        //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+        auto C = A-B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        auto dC = (dA-dB);
+        auto dhC = dC.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dhC[i]);
+        }
+        printf("Test CudaMatrix operator-(float aScalar) const \r\n");
+        //CudaMatrix operator+(float aScalar) const
+        auto D = C-0.5;
+        auto dD = dC-0.5;
+        auto dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    printf("Test CudaMatrix &operator-(CudaMatrix &&aMatrix, float aScalar) \r\n");
-    // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
-    {
-        auto E = C-0.5;
-        auto dE2 =  (dA-dB)-0.5;
-        auto dhE = dE2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
-        {
-            ASSERT_FLOAT_EQ(E[i],dhE[i]);
-        }
-    }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix) const
-    printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix) const \r\n");
-    {
-        auto D = A-C;
-        auto dD2 = dA-(dA-dB);
-        dhD = dD2.toHostMatrix();
+        printf("Test CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix) \r\n");
+        // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+        D = 0.5 - C;
+        dD = 0.5 - C.toDeviceMatrix();
+        dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
-    printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
-    {
-        auto D = C-A;
-        auto dD2 = (dA-dB)-dA;
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        printf("Test CudaMatrix &operator-(float aScalar, CudaMatrix &&aMatrix) \r\n");
+        // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            auto dD2 = 0.5 - (dA-dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        D =  C - 0.5;
+        printf("Test CudaMatrix &operator-(CudaMatrix &&aMatrix, float aScalar) \r\n");
+        // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+        {
+            
+            auto dD2 =  (dA-dB)-0.5;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+        printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix) const \r\n");
+        {
+            auto D = A-C;
+            auto dD2 = dA-(dA-dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+        printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+        {
+            auto D = A-C;
+            auto dD2 = dA- (dA-dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+    }
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,100,1,Aurora::Complex);
+        auto C = A-B;
+        std::complex<float> scalar(-1,-1);
+        auto dA = A.toDeviceMatrix();
+
+        auto dC = (dA-scalar);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = A.toDeviceMatrix()-scalar;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B-A;
+        dC = scalar-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = A-1;
+        dC = (dA-1);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index"<<i;
+        }
+
+        dC = A.toDeviceMatrix()-1;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 1-A;
+        dC = 1-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixSubmm) {
+    //real
+    {
+        auto A = Aurora::zeros(100,100,1);
+        auto B = Aurora::zeros(100,100,1);
+        for (size_t i = 0; i < 10000; i++)
+        {
+            A[i] = 1;
+            B[i] =  8;
+        }
+        auto C = B-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+
+        auto dC = (dB-dA);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = dB - A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        auto dataA = new float[10000];
+        auto dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 1;
+            dataB[i] = 8;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,50,1,Aurora::Complex);
+
+        auto C = A-B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B-A;
+        //Matrix& + Matrix&&,
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //real & complex
+    {
+        auto dataA = new float[10000];
+        auto dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 1;
+            dataB[i] = 8;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,50,1,Aurora::Complex);
+
+        auto C = A-B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix&(real) - Matrix&(complex)
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i<<", A:"<<A[i]<<", B:"<<B[i/2];
+        }
+        //Matrix&&(real) - Matrix&(complex)
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&(real) - Matrix&&(complex)
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B-A;
+        //Matrix&(complex) - Matrix&&(real)
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&&(complex) - Matrix&(real)
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&(complex) - Matrix&(real)
+        dC = dB-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixSubmv) {
+    //real
+    {
+        auto A = Aurora::ones(100,100,1)*8;
+        auto B = Aurora::ones(100,1,1);
+        auto C = A-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 1- A;
+        //Matrix& + Matrix&&, col mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        C = A-1;
+        //Matrix& - Matrix&, row mode
+        dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& - Matrix&, row mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& - vec&&, row mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 1- A;
+        //Matrix& + Matrix&&, row mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataC[i] = -1;
+            if (i<100)dataB[i] = -1;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto C = A-D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = D-A;
+        //Matrix& + Matrix&&, col mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        B.forceReshape(1, 50, 1);
+        A.forceReshape(100, 50, 1);
+        D.forceReshape(100, 50, 1);
+        dA.forceReshape(100, 50, 1);
+
+        C= A-D;
+        //Matrix& - Matrix&, row mode
+        dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& - Matrix&, row mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& - vec&&, row mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        C= D - A;
+        //Matrix& + Matrix&&, row mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complexM - realv
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            if (i<100)dataB[i] = 1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,1,1);
+
+        auto C = A-1;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + vec&, col mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C=1-A;
+        //vec& + Matrix&&, col mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& + Matrix&, col mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        dA.forceReshape(50, 100, 1);
+        A.forceReshape(50, 100, 1);
+        B.forceReshape(1, 100, 1);
+
+        C = A-1;
+        //Matrix& + vec&, row mode
+        dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + vec&, row mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA-B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        C = 1-A;
+        //vec& + Matrix&&, row mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& + Matrix&, row mode
+        dC = B.toDeviceMatrix()-dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //realM - complexV
+    {
+        float *dataA = new float[5000];
+        float *dataC = new float[10000];
+        float *dataD = new float[10000];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            if (i<5000)dataA[i] = 8;
+            dataC[i] = -1;
+            dataD[i] = i%2==0?8:0;
+            if (i<100)dataB[i] = -1;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto E = Aurora::Matrix::fromRawData(dataD, 50,100,1,Aurora::Complex);
+
+
+
+        auto C = E-D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C= D-E;
+        //Matrix& + Matrix&&, col mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        dA.forceReshape(100, 50, 1);
+        A.forceReshape(100, 50, 1);
+        B.forceReshape(1, 50, 1);
+        C= E-D;
+        //Matrix& + Matrix&, row mode
+        dC = (dA-dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()-dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C= D-E;
+        //Matrix& + Matrix&&, row mode
+        dC = dB-A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
         }
     }
 }
 
 TEST_F(CudaMatrix_Test, MatrixDiv) {
-    auto A = Aurora::zeros(1000,1,1);
-    auto B = Aurora::zeros(1000,1,1);
-    for (size_t i = 0; i < 1000; i++)
+    //real
     {
-        A[i] = -1;
-        B[i] =  i;
-    }
-    printf("Test CudaMatrix operator/(const CudaMatrix &aMatrix) const \r\n");
-    //CudaMatrix operator+(const CudaMatrix &aMatrix) const
-    auto C = A/B;
-    auto dA = A.toDeviceMatrix();
-    auto dB = B.toDeviceMatrix();
-    auto dC = (dA/dB);
-    auto dhC = dC.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(C[i],dhC[i]);
-    }
-    printf("Test CudaMatrix operator/(float aScalar) const \r\n");
-    //CudaMatrix operator+(float aScalar) const
-    auto D = C/0.5;
-    auto dD = dC/0.5;
-    auto dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix) \r\n");
-    // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
-    D = 0.5/C;
-    dD = 0.5 / dC;
-    dhD = dD.toHostMatrix();
-    for (size_t i = 0; i < 1000; i++)
-    {
-        ASSERT_FLOAT_EQ(D[i],dhD[i]);
-    }
-    printf("Test CudaMatrix &operator/(float aScalar, CudaMatrix &&aMatrix) \r\n");
-    // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
-    {
-        auto dD2 = 0.5 / (dA/dB);
-        dhD = dD2.toHostMatrix();
+        auto A = Aurora::zeros(1000,1,1);
+        auto B = Aurora::zeros(1000,1,1);
+        for (size_t i = 0; i < 1000; i++)
+        {
+            A[i] = -2.0;
+            B[i] =  i;
+        }
+        printf("Test CudaMatrix operator/(const CudaMatrix &aMatrix) const \r\n");
+        //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+        auto C = A/B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        auto dC = (dA/dB);
+        auto dhC = dC.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dhC[i]);
+        }
+        printf("Test CudaMatrix operator/(float aScalar) const \r\n");
+        //CudaMatrix operator+(float aScalar) const
+        auto D = C-0.5;
+        auto dD = dC-0.5;
+        auto dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    printf("Test CudaMatrix &operator/(CudaMatrix &&aMatrix, float aScalar) \r\n");
-    // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
-    {
-        auto E = C/0.5;
-        auto dE2 =  (dA/dB)/0.5;
-        auto dhE = dE2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
-        {
-            ASSERT_FLOAT_EQ(E[i],dhE[i]);
-        }
-    }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix) const
-    printf("Test CudaMatrix operator/(CudaMatrix &&aMatrix) const \r\n");
-    {
-        auto D = A/C;
-        auto dD2 = dA/(dA/dB);
-        dhD = dD2.toHostMatrix();
+        printf("Test CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix) \r\n");
+        // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+        D = 0.5 / C;
+        dD = 0.5 / C.toDeviceMatrix();
+        dhD = dD.toHostMatrix();
         for (size_t i = 0; i < 1000; i++)
         {
             ASSERT_FLOAT_EQ(D[i],dhD[i]);
         }
-    }
-    //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
-    printf("Test CudaMatrix operator/(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
-    {
-        auto D = C/A;
-        auto dD2 = (dA/dB)/dA;
-        dhD = dD2.toHostMatrix();
-        for (size_t i = 0; i < 1000; i++)
+        printf("Test CudaMatrix &operator/(float aScalar, CudaMatrix &&aMatrix) \r\n");
+        // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
         {
-            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            auto dD2 = 0.5 / (dA/dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        D =  C / 0.5;
+        printf("Test CudaMatrix &operator/(CudaMatrix &&aMatrix, float aScalar) \r\n");
+        // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+        {
+            
+            auto dD2 =  (dA/dB)/0.5;
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+        printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix) const \r\n");
+        {
+            auto D = A/C;
+            auto dD2 = dA/(dA/dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+        //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+        printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+        {
+            auto D = A/C;
+            auto dD2 = dA/ (dA/dB);
+            dhD = dD2.toHostMatrix();
+            for (size_t i = 0; i < 1000; i++)
+            {
+                ASSERT_FLOAT_EQ(D[i],dhD[i]);
+            }
+        }
+    }
+    //complex
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataB[i] = -2;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,100,1,Aurora::Complex);
+        auto C = A/B;
+        std::complex<float> scalar(-2,-2);
+        auto dA = A.toDeviceMatrix();
+
+        auto dC = (dA/scalar);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = A.toDeviceMatrix()/scalar;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B/A;
+        dC = scalar/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = A/2.0;
+        dC = (dA/2.0);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index"<<i;
+        }
+
+        dC = A.toDeviceMatrix()/2.0;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 2.0/A;
+        dC = 2.0/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
         }
     }
 }
 
+TEST_F(CudaMatrix_Test, MatrixDivmm) {
+    //real
+    {
+        auto A = Aurora::zeros(100,100,1);
+        auto B = Aurora::zeros(100,100,1);
+        for (size_t i = 0; i < 10000; i++)
+        {
+            A[i] = 2;
+            B[i] =  8;
+        }
+        auto C = B/2.0;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+
+        auto dC = (dB/dA);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = dB / A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        auto dataA = new float[10000];
+        auto dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 2;
+            dataB[i] = 8;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,50,1,Aurora::Complex);
+
+        auto C = A/B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& / Matrix&
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& / Matrix&
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B/A;
+        //Matrix& / Matrix&&,
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //real & complex
+    {
+        auto dataA = new float[10000];
+        auto dataB = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 2;
+            dataB[i] = 8;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,50,1,Aurora::Complex);
+
+        auto C = A/B;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix&(real) / Matrix&(complex)
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i<<", A:"<<A[i]<<", B:"<<B[i/2];
+        }
+        //Matrix&&(real) / Matrix&(complex)
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&(real) / Matrix&&(complex)
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = B/A;
+        //Matrix&(complex) / Matrix&&(real)
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&&(complex) / Matrix&(real)
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&(complex) / Matrix&(real)
+        dC = dB/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixDivmv) {
+    //real
+    {
+        auto A = Aurora::ones(100,100,1)*8;
+        auto B = Aurora::ones(100,1,1)*2;
+        auto C = A/2;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 2/A;
+        //Matrix& + Matrix&&, col mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        C = A/2;
+        //Matrix& - Matrix&, row mode
+        dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& - Matrix&, row mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& - vec&&, row mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = 2/ A;
+        //Matrix& + Matrix&&, row mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //vec&& + Matrix&&, row mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complex
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            dataC[i] = -2;
+            if (i<100)dataB[i] = -2;
+        }
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto C = A/D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, col mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C = D/A;
+        //Matrix& + Matrix&&, col mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //vec&& + Matrix&&, col mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        B.forceReshape(1, 50, 1);
+        A.forceReshape(100, 50, 1);
+        D.forceReshape(100, 50, 1);
+        dA.forceReshape(100, 50, 1);
+
+        C= A/D;
+        //Matrix& / Matrix&, row mode
+        dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& / Matrix&, row mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            EXPECT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix& / vec&&, row mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        C= D / A;
+        //Matrix& / Matrix&&, row mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& / Matrix&&, row mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize(); i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //complexM - realv
+    {
+        float *dataA = new float[10000];
+        float *dataB = new float[100];
+        float *dataC = new float[10000];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            dataA[i] = 8;
+            if (i<100)dataB[i] = 2;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 100,50,1,Aurora::Complex);
+
+        auto B = Aurora::Matrix::fromRawData(dataB, 100,1,1);
+
+        auto C = A/2;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& / Matrix&, col mode
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix&& / vec&, col mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& / vec&&, col mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C=2/A;
+        //vec& + Matrix&&, col mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& + Matrix&, col mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 100, 1);
+        dA.forceReshape(50, 100, 1);
+        A.forceReshape(50, 100, 1);
+        B.forceReshape(1, 100, 1);
+
+        C = A/2;
+        //Matrix& + vec&, row mode
+        dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + vec&, row mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        //Matrix& + vec&&, row mode
+        dC = dA/B.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        C = 2/A;
+        //vec& + Matrix&&, row mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+
+        //vec&& + Matrix&, row mode
+        dC = B.toDeviceMatrix()/dA;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+    }
+    //realM - complexV
+    {
+        float *dataA = new float[5000];
+        float *dataC = new float[10000];
+        float *dataD = new float[10000];
+        float *dataB = new float[100];
+        for (size_t i = 0; i < 10000; i++)
+        {
+            if (i<5000)dataA[i] = 8;
+            dataC[i] = -2;
+            dataD[i] = i%2==0?8:0;
+            if (i<100)dataB[i] = -2;
+        }
+
+        auto A = Aurora::Matrix::fromRawData(dataA, 50,100,1);
+        auto B = Aurora::Matrix::fromRawData(dataB, 50,1,1,Aurora::Complex);
+        auto D = Aurora::Matrix::fromRawData(dataC, 50,100,1,Aurora::Complex);
+        auto E = Aurora::Matrix::fromRawData(dataD, 50,100,1,Aurora::Complex);
+
+
+
+        auto C = E/D;
+        auto dA = A.toDeviceMatrix();
+        auto dB = B.toDeviceMatrix();
+        //Matrix& + Matrix&, col mode
+        auto dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+        //Matrix&& + Matrix&, col mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C= D/E;
+        //Matrix& + Matrix&&, col mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        dB.forceReshape(1, 50, 1);
+        dA.forceReshape(100, 50, 1);
+        A.forceReshape(100, 50, 1);
+        B.forceReshape(1, 50, 1);
+        C= E/D;
+        //Matrix& + Matrix&, row mode
+        dC = (dA/dB);
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+
+        //Matrix&& + Matrix&, row mode
+        dC = A.toDeviceMatrix()/dB;
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i));
+        }
+        C= D/E;
+        //Matrix& + Matrix&&, row mode
+        dC = dB/A.toDeviceMatrix();
+        for (size_t i = 0; i < C.getDataSize()*2; i++)
+        {
+            ASSERT_FLOAT_EQ(C[i],dC.getValue(i))<<"index:"<<i;
+        }
+    }
+}
+
+
 TEST_F(CudaMatrix_Test, MatrixComplexMulAndDiv){
     float * dataA = new float[100];
     float * dataB = new float[100];