diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp
index 092a2a7..8df61c0 100644
--- a/src/CudaMatrix.cpp
+++ b/src/CudaMatrix.cpp
@@ -490,107 +490,8 @@ bool CudaMatrix::setBlock(int aDim,int aBeginIndex, int aEndIndex, const CudaMat
         }
     }    
 }
-
-CudaMatrix CudaMatrix::operator+(float aScalar) const{
-    if (isComplex())
-    {
-        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-        return  CudaMatrix();
-    }
-    float* data = nullptr;
-    unsigned long long size =  getDataSize() * getValueType();
-    cudaMalloc((void**)&data, sizeof(float) * size);
-    auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-    unaryAdd(getData(),aScalar,out.getData(),getDataSize());
-    return out;
-}
-
-CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix){
-    if (aMatrix.isComplex())
-    {
-        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-        return  CudaMatrix();
-    }
-    float* data = nullptr;
-    unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
-    cudaMalloc((void**)&data, sizeof(float) * size);
-    auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
-    unaryAdd(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
-    return out;
-}
-
-CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix){
-    if (aMatrix.isComplex())
-    {
-        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-        return  aMatrix;
-    }
-    unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-    return aMatrix;
-}
-
-CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar){
-    if (aMatrix.isComplex())
-    {
-        std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
-        return  aMatrix;
-    }
-    unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
-    return aMatrix;
-}
-
-CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const{
-    if (this->getDataSize() != aMatrix.getDataSize()) {
-        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-        <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-        return CudaMatrix();
-    }
-    if (this->isComplex() != aMatrix.isComplex()) {
-        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-        <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
-        return CudaMatrix();
-    }
-    float* data = nullptr;
-    unsigned long long size =  getDataSize() * getValueType();
-    cudaMalloc((void**)&data, sizeof(float) * size);
-    auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
-    unaryAdd(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());
-    return out;
-}
-
-CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const{
-    if (this->getDataSize() != aMatrix.getDataSize()) {
-        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
-        <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
-        return CudaMatrix();
-    }
-    if (this->isComplex() != aMatrix.isComplex()) {
-        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
-        <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
-        return CudaMatrix();
-    }
-    unaryAdd(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
-    return aMatrix;
-}
-
-
-CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
-    if (aOther.getDataSize() != aMatrix.getDataSize()) {
-        std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
-        <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
-        return CudaMatrix();
-    }
-    if (aOther.isComplex() != aMatrix.isComplex()) {
-        std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
-        <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
-        return CudaMatrix();
-    }
-    unaryAdd(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
-    return aMatrix;
-}
-
-        // mul
-    CudaMatrix CudaMatrix::operator*(float aScalar) const{
+    //--Add----------------------------------------------------------------
+    CudaMatrix CudaMatrix::operator+(float aScalar) const{
         if (isComplex())
         {
             std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
@@ -600,13 +501,112 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
         unsigned long long size =  getDataSize() * getValueType();
         cudaMalloc((void**)&data, sizeof(float) * size);
         auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+        unaryAdd(getData(),aScalar,out.getData(),getDataSize());
+        return out;
+    }
+
+    CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unaryAdd(aMatrix.getData(),aScalar,out.getData(),aMatrix.getDataSize());
+        return out;
+    }
+
+    CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            return  aMatrix;
+        }
+        unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+
+    CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            return  aMatrix;
+        }
+        unaryAdd(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+
+    CudaMatrix CudaMatrix::operator+(const CudaMatrix &aMatrix) const{
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->isComplex() != aMatrix.isComplex()) {
+            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  getDataSize() * getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+        unaryAdd(this->getData(),aMatrix.getData(),out.getData(),this->getDataSize());
+        return out;
+    }
+
+    CudaMatrix CudaMatrix::operator+(CudaMatrix &&aMatrix) const{
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->isComplex() != aMatrix.isComplex()) {
+            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        unaryAdd(this->getData(),aMatrix.getData(),aMatrix.getData(),this->getDataSize());
+        return aMatrix;
+    }
+
+
+    CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
+        if (aOther.getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
+            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        if (aOther.isComplex() != aMatrix.isComplex()) {
+            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        unaryAdd(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
+        return aMatrix;
+    }
+
+    //--mul-----------------------------------------------------
+    CudaMatrix CudaMatrix::operator*(float aScalar) const{
+        if (isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  getDataSize() * getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
         unaryMul(getData(),aScalar,out.getData(),getDataSize());
         return out;
     }
     CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix){
         if (aMatrix.isComplex())
         {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            std::cerr<<"Complex matrix not support operator*(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
             return  CudaMatrix();
         }
         float* data = nullptr;
@@ -619,7 +619,7 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
     CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix){
         if (aMatrix.isComplex())
         {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            std::cerr<<"Complex matrix not support operator*(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
             return  aMatrix;
         }
         unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
@@ -628,7 +628,7 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
     CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar){
         if (aMatrix.isComplex())
         {
-            std::cerr<<"Complex matrix not support operator+(float aScalar)"<<std::endl;
+            std::cerr<<"Complex matrix not support operator*(float aScalar)"<<std::endl;
             return  aMatrix;
         }
         unaryMul(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
@@ -636,12 +636,12 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
     }
     CudaMatrix CudaMatrix::operator*(const CudaMatrix &aMatrix) const{
         if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
             <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
             return CudaMatrix();
         }
         if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
             <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
             return CudaMatrix();
         }
@@ -654,12 +654,12 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
     }
     CudaMatrix CudaMatrix::operator*(CudaMatrix &&aMatrix) const{
         if (this->getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
             <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
             return CudaMatrix();
         }
         if (this->isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
             <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Comples":"Real")<<std::endl;
             return CudaMatrix();
         }
@@ -668,17 +668,249 @@ CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther){
     }
     CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther){
         if (aOther.getDataSize() != aMatrix.getDataSize()) {
-            std::cerr<<"operator+ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
+            std::cerr<<"operator* must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
             <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
             return CudaMatrix();
         }
         if (aOther.isComplex() != aMatrix.isComplex()) {
-            std::cerr<<"operator+ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
+            std::cerr<<"operator* must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
             <<" and the matrix1 type is "<<(aOther.isComplex()?"Comples":"Real")<<std::endl;
             return CudaMatrix();
         }
         unaryMul(aOther.getData(),aMatrix.getData(),aMatrix.getData(),aOther.getDataSize());
         return aMatrix;
     }
+    
+    //--Sub-----------------------------------------------------------------
+    CudaMatrix CudaMatrix::operator-(float aScalar) const{
+        if (isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator-(float aScalar)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  getDataSize() * getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+        unarySub(getData(),aScalar,out.getData(),getDataSize());
+        return out;
+    }
+    
+    CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator-(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unarySub(aScalar,aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+        return out;
+    }
+
+    CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator-(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
+            return  aMatrix;
+        }
+        unarySub(aScalar,aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator-(CudaMatrix &&aMatrix, float aScalar)"<<std::endl;
+            return  aMatrix;
+        }
+        unarySub(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix CudaMatrix::operator-(const CudaMatrix &aMatrix) const{
+        if (aMatrix.isComplex()!=this->isComplex())
+        {
+            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unarySub(this->getData(),aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+        return out;
+    }
+    CudaMatrix CudaMatrix::operator-(CudaMatrix &&aMatrix) const{
+        if (aMatrix.isComplex()!=this->isComplex())
+        {
+            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        unarySub(this->getData(),aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther){
+        if (aMatrix.isComplex()!=aOther.isComplex())
+        {
+            std::cerr<<"operator- must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aOther.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (aOther.getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator- must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
+            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        unarySub(aMatrix.getData(),aOther.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }  
+
+    // div
+    CudaMatrix CudaMatrix::operator/(float aScalar) const{
+        if (isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator/(float aScalar)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  getDataSize() * getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+        unaryDiv(getData(),aScalar,out.getData(),getDataSize());
+        return out;
+    }
+    CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator/(float aScalar, const CudaMatrix &aMatrix)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unaryDiv(aScalar,aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+        return out;
+    }
+    CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator/(float aScalar, CudaMatrix &&aMatrix)"<<std::endl;
+            return  aMatrix;
+        }
+        unaryDiv(aScalar,aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator/(CudaMatrix &&aMatrix, float aScalar)"<<std::endl;
+            return  aMatrix;
+        }
+        unaryDiv(aMatrix.getData(),aScalar,aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix CudaMatrix::operator/(const CudaMatrix &aMatrix) const{
+        if (aMatrix.isComplex()!=this->isComplex())
+        {
+            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unaryDiv(this->getData(),aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+        return out;
+    }
+    CudaMatrix CudaMatrix::operator/(CudaMatrix &&aMatrix) const{
+        if (aMatrix.isComplex()!=this->isComplex())
+        {
+            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(this->isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aMatrix.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (this->getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<this->getDataSize()
+            <<" and the matrix1 size is "<<aMatrix.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        unaryDiv(this->getData(),aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther){
+        if (aMatrix.isComplex()!=aOther.isComplex())
+        {
+            std::cerr<<"operator/ must with Data type, now the matrix0 type is "<<(aMatrix.isComplex()?"Comples":"Real")
+            <<" and the matrix1 type is "<<(aOther.isComplex()?"Complex":"Real")<<std::endl;
+            return CudaMatrix();
+        }
+        if (aOther.getDataSize() != aMatrix.getDataSize()) {
+            std::cerr<<"operator/ must with Same DataSize, now the matrix0 size is "<<aMatrix.getDataSize()
+            <<" and the matrix1 size is "<<aOther.getDataSize()<<std::endl;
+            return CudaMatrix();
+        }
+        unaryDiv(aMatrix.getData(),aOther.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+
+    // -----------pow-------------------------------------------------------
+    CudaMatrix CudaMatrix::operator^(int times) const{
+        if (isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator^(int times)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float scalar = (float)times;
+        float* data = nullptr;
+        unsigned long long size =  getDataSize() * getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
+        unaryPow(getData(),scalar,out.getData(),getDataSize());
+        return out;
+    }
+    CudaMatrix operator^(CudaMatrix &&aMatrix,int times){
+        if (aMatrix.isComplex())
+        {
+            std::cerr<<"Complex matrix not support operator^(int times)"<<std::endl;
+            return  CudaMatrix();
+        }
+        float scalar = (float)times;
+        unaryPow(aMatrix.getData(),scalar,aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+
+    //----------negetive------------------------------------------------
+    CudaMatrix operator-(CudaMatrix &&aMatrix){
+        unaryNeg(aMatrix.getData(),aMatrix.getData(),aMatrix.getDataSize());
+        return aMatrix;
+    }
+    CudaMatrix operator-(const CudaMatrix &aMatrix){
+        float* data = nullptr;
+        unsigned long long size =  aMatrix.getDataSize() * aMatrix.getValueType();
+        cudaMalloc((void**)&data, sizeof(float) * size);
+        auto out =  CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+        unaryNeg(aMatrix.getData(),out.getData(),aMatrix.getDataSize());
+        return out;
+    }
 }
 #endif // USE_CUDA
diff --git a/src/CudaMatrixPrivate.cu b/src/CudaMatrixPrivate.cu
index 82a791e..8feb8b7 100644
--- a/src/CudaMatrixPrivate.cu
+++ b/src/CudaMatrixPrivate.cu
@@ -11,6 +11,7 @@ struct PowOperator: public thrust::unary_function<float, float>{
     void setExponent(float v){
         exponent = v;
     }
+    
     __host__ __device__
     float operator()(const float& x)  {
         return powf(x, exponent);
@@ -74,12 +75,12 @@ void unaryDiv(float* in1, const float& in2, float* out, unsigned long length){
 void unaryPow(float* in1, float N,float* out, unsigned long length){
     if (N == 0.0f)
     {
-            thrust::fill(out,out+length,0);
+            thrust::fill(thrust::device,out,out+length,1);
             return;
     }
     if (N == 1.0f)
     {
-        thrust::copy(in1,in1+length,out);
+        thrust::copy(thrust::device,in1,in1+length,out);
         return;
     }
     if (N == 2.0f){
diff --git a/test/CudaMatrix_Test.cpp b/test/CudaMatrix_Test.cpp
index ab0972d..6321dce 100644
--- a/test/CudaMatrix_Test.cpp
+++ b/test/CudaMatrix_Test.cpp
@@ -186,6 +186,246 @@ TEST_F(CudaMatrix_Test, MatrixMul) {
     }
 }
 
+TEST_F(CudaMatrix_Test, MatrixSub) {
+    auto A = Aurora::zeros(1000,1,1);
+    auto B = Aurora::zeros(1000,1,1);
+    for (size_t i = 0; i < 1000; i++)
+    {
+        A[i] = -1;
+        B[i] =  i;
+    }
+    printf("Test CudaMatrix operator-(const CudaMatrix &aMatrix) const \r\n");
+    //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+    auto C = A-B;
+    auto dA = A.toDeviceMatrix();
+    auto dB = B.toDeviceMatrix();
+    auto dC = (dA-dB);
+    auto dhC = dC.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(C[i],dhC[i]);
+    }
+    printf("Test CudaMatrix operator-(float aScalar) const \r\n");
+    //CudaMatrix operator+(float aScalar) const
+    auto D = C-0.5;
+    auto dD = dC-0.5;
+    auto dhD = dD.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(D[i],dhD[i]);
+    }
+    printf("Test CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix) \r\n");
+    // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+    D = 0.5-C;
+    dD = 0.5 - dC;
+    dhD = dD.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(D[i],dhD[i]);
+    }
+    printf("Test CudaMatrix &operator-(float aScalar, CudaMatrix &&aMatrix) \r\n");
+    // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
+    {
+        auto dD2 = 0.5 - (dA-dB);
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+    printf("Test CudaMatrix &operator-(CudaMatrix &&aMatrix, float aScalar) \r\n");
+    // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+    {
+        auto E = C-0.5;
+        auto dE2 =  (dA-dB)-0.5;
+        auto dhE = dE2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(E[i],dhE[i]);
+        }
+    }
+    //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+    printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix) const \r\n");
+    {
+        auto D = A-C;
+        auto dD2 = dA-(dA-dB);
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+    //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+    printf("Test CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+    {
+        auto D = C-A;
+        auto dD2 = (dA-dB)-dA;
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixDiv) {
+    auto A = Aurora::zeros(1000,1,1);
+    auto B = Aurora::zeros(1000,1,1);
+    for (size_t i = 0; i < 1000; i++)
+    {
+        A[i] = -1;
+        B[i] =  i;
+    }
+    printf("Test CudaMatrix operator/(const CudaMatrix &aMatrix) const \r\n");
+    //CudaMatrix operator+(const CudaMatrix &aMatrix) const
+    auto C = A/B;
+    auto dA = A.toDeviceMatrix();
+    auto dB = B.toDeviceMatrix();
+    auto dC = (dA/dB);
+    auto dhC = dC.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(C[i],dhC[i]);
+    }
+    printf("Test CudaMatrix operator/(float aScalar) const \r\n");
+    //CudaMatrix operator+(float aScalar) const
+    auto D = C/0.5;
+    auto dD = dC/0.5;
+    auto dhD = dD.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(D[i],dhD[i]);
+    }
+    printf("Test CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix) \r\n");
+    // CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix)
+    D = 0.5/C;
+    dD = 0.5 / dC;
+    dhD = dD.toHostMatrix();
+    for (size_t i = 0; i < 1000; i++)
+    {
+        ASSERT_FLOAT_EQ(D[i],dhD[i]);
+    }
+    printf("Test CudaMatrix &operator/(float aScalar, CudaMatrix &&aMatrix) \r\n");
+    // CudaMatrix &operator+(float aScalar, CudaMatrix &&aMatrix)
+    {
+        auto dD2 = 0.5 / (dA/dB);
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+    printf("Test CudaMatrix &operator/(CudaMatrix &&aMatrix, float aScalar) \r\n");
+    // CudaMatrix &operator+(CudaMatrix &&aMatrix, float aScalar)
+    {
+        auto E = C/0.5;
+        auto dE2 =  (dA/dB)/0.5;
+        auto dhE = dE2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(E[i],dhE[i]);
+        }
+    }
+    //CudaMatrix operator+(CudaMatrix &&aMatrix) const
+    printf("Test CudaMatrix operator/(CudaMatrix &&aMatrix) const \r\n");
+    {
+        auto D = A/C;
+        auto dD2 = dA/(dA/dB);
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+    //CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther)
+    printf("Test CudaMatrix operator/(CudaMatrix &&aMatrix,CudaMatrix &aOther) \r\n");
+    {
+        auto D = C/A;
+        auto dD2 = (dA/dB)/dA;
+        dhD = dD2.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(D[i],dhD[i]);
+        }
+    }
+}
+
+TEST_F(CudaMatrix_Test, MatrixPow){
+    auto A = Aurora::zeros(1000,1,1);
+    auto B = Aurora::zeros(1000,1,1);
+    for (size_t i = 0; i < 1000; i++)
+    {
+        A[i] = -1+0.2*i;
+    }
+    auto dA= A.toDeviceMatrix();
+    {
+        auto R = A^0;
+        auto dR = dA^0;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+    {
+        auto R = A^1;
+        auto dR = dA^1;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+        {
+        auto R = A^2;
+        auto dR = dA^2;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+    {
+        auto R = A^3;
+        auto dR = dA^3;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+    {
+        auto R = A^5;
+        auto dR = dA^5;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+
+}
+
+TEST_F(CudaMatrix_Test, MatrixNeg){
+    auto A = Aurora::zeros(1000,1,1);
+    auto B = Aurora::zeros(1000,1,1);
+    for (size_t i = 0; i < 1000; i++)
+    {
+        A[i] = -1+0.2*i;
+    }
+    auto dA= A.toDeviceMatrix();
+    {
+        auto R = -A;
+        auto dR = -dA;
+        auto dhR = dR.toHostMatrix();
+        for (size_t i = 0; i < 1000; i++)
+        {
+            ASSERT_FLOAT_EQ(R[i],dhR[i]);
+        }
+    }
+}
+
+
 TEST_F(CudaMatrix_Test, matrixfunction)
 {
     printf("Test CudaMatrix block(int aDim,int aBeginIndx, int aEndIndex) const\r\n");