Cuda +-*/ with mm.mv and scalar(complex and real) support

2023-12-13 16:40:11 +08:00
parent f2b81eddc3
commit deaac03704
6 changed files with 3783 additions and 709 deletions
--- a/src/CudaMatrix.cpp
+++ b/src/CudaMatrix.cpp
--- a/src/CudaMatrix.h
+++ b/src/CudaMatrix.h
@@ -43,11 +43,23 @@ namespace Aurora
         */
        CudaMatrix deepCopy() const;

+        /**
+         * @brief 深拷贝操作,但不复制数据
+         * 
+         * @return CudaMatrix 
+         */
+        CudaMatrix deepCopyShape(bool complex = false) const;
+
+
        // add
        CudaMatrix operator+(float aScalar) const;
        friend CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix);
        friend CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix);
        friend CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator+(std::complex<float> aScalar) const;
+        friend CudaMatrix operator+(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator+(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator+(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
        CudaMatrix operator+(const CudaMatrix &aMatrix) const;
        CudaMatrix operator+(CudaMatrix &&aMatrix) const;
        friend CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -57,6 +69,10 @@ namespace Aurora
        friend CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix);
        friend CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix);
        friend CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator-(std::complex<float> aScalar) const;
+        friend CudaMatrix operator-(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator-(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator-(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
        CudaMatrix operator-(const CudaMatrix &aMatrix) const;
        CudaMatrix operator-(CudaMatrix &&aMatrix) const;
        friend CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -70,6 +86,10 @@ namespace Aurora
        friend CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix);
        friend CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix);
        friend CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator*(std::complex<float> aScalar) const;
+        friend CudaMatrix operator*(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator*(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator*(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
        CudaMatrix operator*(const CudaMatrix &aMatrix) const;
        CudaMatrix operator*(CudaMatrix &&aMatrix) const;
        friend CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther);
@@ -79,9 +99,13 @@ namespace Aurora
        friend CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix);
        friend CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix);
        friend CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar);
+        CudaMatrix operator/(std::complex<float> aScalar) const;
+        friend CudaMatrix operator/(std::complex<float> aScalar, const CudaMatrix &aMatrix);
+        friend CudaMatrix& operator/(std::complex<float> aScalar, CudaMatrix &&aMatrix);
+        friend CudaMatrix& operator/(CudaMatrix &&aMatrix,std::complex<float>  aScalar);
        CudaMatrix operator/(const CudaMatrix &aMatrix) const;
        CudaMatrix operator/(CudaMatrix &&aMatrix) const;
-        friend CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther);
+        friend CudaMatrix operator/(CudaMatrix &&aMatrix,CudaMatrix &aOther);

        // pow
        CudaMatrix operator^(int times) const;
--- a/src/CudaMatrixPrivate.cu
+++ b/src/CudaMatrixPrivate.cu
@@ -4,7 +4,9 @@
 #include <thrust/complex.h>
 #include <thrust/functional.h>
 #include <thrust/execution_policy.h>
+
 #include "AuroraDefs.h"
+#include "AuroraThrustIterator.cuh"
 using namespace thrust::placeholders;

 struct PowOp: public thrust::unary_function<float, float>{
@@ -127,36 +129,224 @@ struct CompareANEOp{
    }
 };

+typedef thrust::complex<float> complexf;

-void unaryAdd(float* in1, float* in2, float* out, unsigned long  length)
+
+
+void unaryAddmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+              unsigned long aLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2)
 {
-    thrust::plus<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
+    if (aValType1 == aValType2)
+    {
+        thrust::plus<float> op;
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()+y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()+y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn2,(complexf*)aMatrixIn2+aLength,aMatrixIn1,(complexf*)aMatrixOut,lambda);
+    }
+
 }

-void unaryAdd(float* in1, const float& in2, float* out, unsigned long  length)
+void unaryAddmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2,
+              Aurora::FunctionDirection aVectype)
 {
-    thrust::transform(thrust::device,in1,in1+length,out,in2 + _1);
+    if (aVectype == Aurora::Column){
+        if(aValType1 == aValType2){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            thrust::plus<float> op;
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,op);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()+y,x.imag());
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()+x,y.imag());
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            thrust::plus<float> op;
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,op);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()+y,x.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()+x,y.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
 }

-void unaryMul(float* in1, float* in2, float* out, unsigned long  length)
+void unaryAdd(float* aMatrixIn, const float& aVectorIn, float* aMatrixOut, unsigned long  aLength)
 {
-    thrust::multiplies<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
+    thrust::transform(thrust::device,aMatrixIn,aMatrixIn+aLength,aMatrixOut,aVectorIn + _1);
 }

-void unaryMulc(float* in1, float* in2, float* out, unsigned long  length)
+void unaryAddc(std::complex<float>* aMatrixIn, const std::complex<float>& aScalarIn, 
+                std::complex<float>* aMatrixOut, unsigned long  aLength)
 {
-    thrust::complex<float>* _in1 = (thrust::complex<float>*)in1;
-    thrust::complex<float>* _in2 = (thrust::complex<float>*)in2; 
-    thrust::complex<float>* _out = (thrust::complex<float>*)out;;
-    thrust::multiplies<thrust::complex<float>> op;
-    thrust::transform(thrust::device,_in1,_in1+length,_in2,_out,op);
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device,(complexf*)aMatrixIn,
+                    (complexf*)aMatrixIn + aLength,
+                    (complexf*)aMatrixOut, value + _1);
 }

-void unaryMul(float* in1, const float& in2, float* out, unsigned long  length)
+void unaryMul(float* aMatrixIn, const float& aScalarIn, float* aMatrixOut, unsigned long  aLength)
 {
-    thrust::transform(thrust::device,in1, in1+length, out, in2 * _1);
+    thrust::transform(thrust::device,aMatrixIn,aMatrixIn+aLength,aMatrixOut,aScalarIn * _1);
+}
+
+void unaryMulc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength) 
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value * _1);
+}
+
+void unaryMulmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+              unsigned long aLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2)
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::multiplies<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::multiplies<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()*y,x.imag()*y);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()*y,x.imag()*y);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn2,(complexf*)aMatrixIn2+aLength,aMatrixIn1,(complexf*)aMatrixOut,lambda);
+    }
+
+}
+
+void unaryMulmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2,
+              Aurora::FunctionDirection aVectype)
+{
+    if (aVectype == Aurora::Column){
+        if(aValType1 == aValType2){
+            if(aValType1 == Aurora::Normal){
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                thrust::multiplies<float> op;
+                thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,op);
+            }
+            else{
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                thrust::multiplies<complexf> op;
+                thrust::transform(thrust::device, (complexf*)aMatrixIn1,
+                            (complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,
+                            (complexf*)aMatrixOut,op);
+            }
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()*y,x.imag()*y);
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()*x,y.imag()*x);
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            if(aValType1 == Aurora::Normal){
+                Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                thrust::multiplies<float> op;
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,op);
+            }
+            else{
+                Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                thrust::multiplies<complexf> op;
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,op);
+            }
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return complexf(x.real()*y,x.imag()*y);
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return complexf(y.real()*x,y.imag()*x);
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
 }

 void unaryNeg(float* in1, float* out, unsigned long length){
@@ -164,42 +354,280 @@ void unaryNeg(float* in1, float* out, unsigned long length){
    thrust::transform(thrust::device,in1,in1+length,out,op);
 }

-void unarySub(float* in1, float* in2, float* out, unsigned long length){
-    thrust::minus<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
-}
-
-void unaryDiv(float* in1, float* in2, float* out, unsigned long length){
-    thrust::divides<float> op;
-    thrust::transform(thrust::device,in1,in1+length,in2,out,op);
-}
-
-void unaryDivc(float* in1, float* in2, float* out, unsigned long  length)
-{
-    thrust::complex<float>* _in1 = (thrust::complex<float>*)in1;
-    thrust::complex<float>* _in2 = (thrust::complex<float>*)in2; 
-    thrust::complex<float>* _out = (thrust::complex<float>*)out;;
-    thrust::divides<thrust::complex<float>> op;
-    thrust::transform(thrust::device,_in1,_in1+length,_in2,_out,op);
-}
-
-void unarySub(const float& in1, float* in2, float* out, unsigned long length){
-    thrust::transform(thrust::device,in2,in2+length,out,in1-_1);
-}
-
-void unaryDiv(const float& in1, float* in2, float* out, unsigned long length){
-    thrust::transform(thrust::device,in2,in2+length,out,in1/_1);
-
-}
-
 void unarySub(float* in1, const float& in2, float* out, unsigned long length){
    thrust::transform(thrust::device,in1,in1+length,out,_1-in2);
 }

+void unarySub(const float& in2, float* in1, float* out, unsigned long length){
+    thrust::transform(thrust::device,in1,in1+length,out,in2-_1);
+}
+
+void unarySubc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      _1 - value);
+}
+
+void unarySubc(const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value - _1);
+}
+
+void unarySubmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1,
+                Aurora::ValueType aValType2 )
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::minus<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::minus<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return complexf(x.real()-y,x.imag());
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const float& x, const complexf& y){
+            return complexf(x-y.real(),0-y.imag());
+        };
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength,(complexf*)aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+}
+
+void unarySubmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype,int direction)
+{
+    if (aVectype == Aurora::Column){
+        //类型一样时，逐元素处理即可
+        if(aValType1 == aValType2){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const float& y){
+                return direction==0?x-y:y-x;
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength*aValType1,rowVectorIter,aMatrixOut,lambda);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return  direction==0?complexf(x.real()-y,x.imag()):complexf(y-x.real(),-x.imag());
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return  direction==0?complexf(x-y.real(),-y.imag()):complexf(y.real()-x,y.imag());
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const float& y){
+                return direction==0?x-y:y-x;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,lambda);
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                return  direction==0?complexf(x.real()-y,x.imag()):complexf(y-x.real(),-x.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                return  direction==0?complexf(x-y.real(),-y.imag()):complexf(y.real()-x,y.imag());
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+}
+
 void unaryDiv(float* in1, const float& in2, float* out, unsigned long length){
    thrust::transform(thrust::device,in1,in1+length,out,_1/in2);
 }

+void unaryDiv(const float& in2, float* in1, float* out, unsigned long length){
+    thrust::transform(thrust::device,in1,in1+length,out,in2/_1);
+}
+
+void unaryDivc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      _1 / value);
+}
+
+void unaryDivc(const std::complex<float> &aScalarIn,
+               std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength)
+{
+    complexf value (aScalarIn.real(), aScalarIn.imag());
+    thrust::transform(thrust::device, (complexf *)aMatrixIn,
+                      (complexf *)aMatrixIn + aLength, (complexf *)aMatrixOut,
+                      value / _1);
+}
+
+void unaryDivmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1,
+                Aurora::ValueType aValType2 )
+{
+    if (aValType1 == aValType2 )
+    {
+        if(Aurora::Normal == aValType1){
+            thrust::divides<float> op;
+            thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength*aValType1,aMatrixIn2,aMatrixOut,op);
+        }
+        else{
+            thrust::divides<complexf> op;
+            thrust::transform(thrust::device, (complexf *)aMatrixIn1,
+                              (complexf *)aMatrixIn1 + aLength,
+                              (complexf *)aMatrixIn2, (complexf *)aMatrixOut,
+                              op);
+        }
+    }
+    else if(Aurora::Complex == aValType1){
+        auto lambda = [=] __device__(const complexf& x, const float& y){
+            return x/complexf(y,0);
+        };
+        thrust::transform(thrust::device,(complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aLength,aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+    else{
+        auto lambda = [=] __device__(const float& x, const complexf& y){
+            return complexf(x,0)/y;
+        };
+        thrust::transform(thrust::device,aMatrixIn1,aMatrixIn1+aLength,(complexf*)aMatrixIn2,(complexf*)aMatrixOut,lambda);
+    }
+}
+
+void unaryDivmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype,int direction)
+{
+    if (aVectype == Aurora::Column){
+        //类型一样时，逐元素处理即可
+        if(aValType1 == aValType2){
+            if (aValType1==Aurora::Normal){
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const float& x, const float& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,aMatrixOut,lambda);
+            }
+            else{
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const complexf& x, const complexf& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, (complexf*)aMatrixIn1,
+                    (complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+            }
+
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                complexf v (y,0);
+                return  direction==0?x/v:v/x;
+            };
+            thrust::transform(thrust::device, (complexf*)aMatrixIn1,(complexf*)aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                complexf v (x,0);
+                return  direction==0?v/y:y/v;
+            };
+            thrust::transform(thrust::device, aMatrixIn1,aMatrixIn1+aMatrixLength,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+    else{
+        int colElementCount = aMatrixLength/aVectorLength;
+        if(aValType1 == aValType2){
+            if (aValType1 == Aurora::Normal)
+            {
+                Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const float& x, const float& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,aMatrixOut,lambda);
+            }
+            else{
+                Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+                Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+                Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+                auto lambda = [=] __device__(const complexf& x, const complexf& y){
+                    return direction==0?x/y:y/x;
+                };
+                thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda); 
+            }
+
+        }
+        else if (aValType1 == Aurora::Complex){
+            Aurora::RowWiseIterator<complexf> rowIter_Begin((complexf*)aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<complexf> rowIter_End((complexf*)aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<float> rowVectorIter(aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const complexf& x, const float& y){
+                complexf v (y,0);
+                return  direction==0?x/v:v/x;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+        else{
+            Aurora::RowWiseIterator<float> rowIter_Begin(aMatrixIn1,aVectorLength,colElementCount,0);
+            Aurora::RowWiseIterator<float> rowIter_End(aMatrixIn1,aVectorLength,colElementCount,aMatrixLength);
+            Aurora::LoopVectorIterator<complexf> rowVectorIter((complexf*)aVectorIn2,aVectorLength);
+            auto lambda = [=] __device__(const float& x, const complexf& y){
+                complexf v (x,0);
+                return  direction==0?v/y:y/v;
+            };
+            thrust::transform(thrust::device, rowIter_Begin,rowIter_End,rowVectorIter,(complexf*)aMatrixOut,lambda);
+        }
+    }
+}
+
+
+
 void unaryPow(float* in1, float N,float* out, unsigned long length){
    if (N == 0.0f)
    {
@@ -305,3 +733,4 @@ void thrustFill(float* aBegin, float* aEnd, std::complex<float> aValue)
    thrust::fill(thrust::device, (std::complex<float>*)aBegin, (std::complex<float>*)aEnd, aValue);
 }

+
--- a/src/CudaMatrixPrivate.cuh
+++ b/src/CudaMatrixPrivate.cuh
@@ -3,32 +3,290 @@
 #define __CUDAMATRIX_CUH__

 #include <complex>
+#include "AuroraDefs.h"
 namespace{
    enum CompareType
    {
        G,GE,E,NE,LE,L
    };
+    enum VecType
+    {
+        COL,ROW
+    };
 }
-void unaryAdd(float* in1, float* in2, float* out, unsigned long length);
-void unaryAdd(float* in1, const float& in2, float* out, unsigned long length);
-void unaryMul(float* in1, float* in2, float* out, unsigned long length);
-void unaryMulc(float* in1, float* in2, float* out, unsigned long length);

-void unaryMul(float* in1, const float& in2, float* out, unsigned long length);
+/**
+ * @brief 矩阵逐元素加一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryAdd(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素加一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryAddc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相加
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryAddmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相加
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ */
+void unaryAddmv(float *aMatrixIn, float *aVectorIn, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aMatValType, Aurora::ValueType aVecValType, 
+              Aurora::FunctionDirection aFuncType = Aurora::FunctionDirection::Column);
+
+/**
+ * @brief 矩阵逐元素乘一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryMul(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素乘一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryMulc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相乘
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryMulmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相乘
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ */
+void unaryMulmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column);
+
+/**
+ * @brief 矩阵逐元素减一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySub(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个float值减
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySub(const float &aValue,float *aMatrixIn,  float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素减一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySubc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个complex<float>值减
+ * 
+ * @param aValue complex<float>值
+  * @param aMatrixIn 矩阵数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unarySubc(const std::complex<float> &aValue,
+                std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相减
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unarySubmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相减
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ * @param direction 减法执行方向, 0 mat - vec, 1 vec- mat
+ */
+void unarySubmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column,
+              int direction = 0);
+
+/**
+ * @brief 矩阵逐元素除以一个float值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDiv(float *aMatrixIn, const float &aValue, float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个float值除
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue float值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDiv(const float &aValue,float *aMatrixIn,  float *aMatrixOut,
+              unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素除以一个complex<float>值
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aValue complex<float>值
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDivc(std::complex<float> *aMatrixIn,
+               const std::complex<float> &aValue,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/**
+ * @brief 矩阵逐元素被一个complex<float>值除
+ * 
+ * @param aValue complex<float>值
+  * @param aMatrixIn 矩阵数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ */
+void unaryDivc(const std::complex<float> &aValue,
+                std::complex<float> *aMatrixIn,
+               std::complex<float> *aMatrixOut, unsigned long aLength);
+
+/** 
+ * @brief 两相同大小矩阵逐元素相除
+ *
+ * @param aMatrixIn1 矩阵1数据指针
+ * @param aMatrixIn2 矩阵2数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aLength 矩阵总元素长度
+ * @param aValType1 矩阵1数据类型
+ * @param aValType2 矩阵2数据类型
+ */
+void unaryDivmm(float *aMatrixIn1, float *aMatrixIn2, float *aMatrixOut,
+                unsigned long aLength,
+                Aurora::ValueType aValType1 = Aurora::Normal,
+                Aurora::ValueType aValType2 = Aurora::Normal);
+
+/**
+ * @brief 矩阵与向量点对点相除
+ * 
+ * @param aMatrixIn 矩阵数据指针
+ * @param aVectorIn 向量数据指针
+ * @param aMatrixOut 输出矩阵数据指针
+ * @param aMatrixLength 矩阵数据总长度
+ * @param aVectorLength 向量数据总长度
+ * @param aMatValType 矩阵数据类型，normal or complex
+ * @param aVecValType 向量数据类型
+ * @param aFuncType 函数执行方向，只支持 Row or Column
+ * @param direction 减法执行方向, 0 mat - vec, 1 vec- mat
+ */
+void unaryDivmv(float *aMatrixIn1, float *aVectorIn2, float *aMatrixOut,
+              unsigned long aMatrixLength,unsigned long aVectorLength,
+              Aurora::ValueType aValType1, Aurora::ValueType aValType2, 
+              Aurora::FunctionDirection aVectype = Aurora::FunctionDirection::Column,
+              int direction = 0);

 void unaryNeg(float* in1, float* out, unsigned long length);
 void unaryPow(float* in1, float N,float* out, unsigned long length);

-void unarySub(float* in1, float* in2, float* out, unsigned long length);
-void unaryDiv(float* in1, float* in2, float* out, unsigned long length);
-void unaryDivc(float* in1, float* in2, float* out, unsigned long length);
-
-void unarySub(const float& in1, float* in2, float* out, unsigned long length);
-void unaryDiv(const float& in1, float* in2, float* out, unsigned long length);
-void unarySub(float* in1, const float& in2, float* out, unsigned long length);
-void unaryDiv(float* in1, const float& in2, float* out, unsigned long length);
-
-
 void unaryCompare(float* in1, const float& in2, float* out, unsigned long length,int type);
 void unaryCompare(const float& in1, float* in2, float* out, unsigned long length, int type);
 void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type);
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -2,8 +2,10 @@
 #include "CudaMatrix.h"

 #include <cmath>
+#include <complex>
 #include <cstddef>
 #include <mkl_cblas.h>
+#include <mkl_vml_functions.h>
 #include <string>
 #include <cstring>
 #include <iostream>
@@ -16,6 +18,7 @@
 #include "Eigen/src/Core/Map.h"
 #include "Eigen/src/Core/Matrix.h"
 #include "Function.h"
+#include "Function1D.h"

 #ifdef USE_CUDA
 #include <cuda_runtime.h>
@@ -130,7 +133,7 @@ namespace Aurora{
        }
        else{
            float zero = 0.0;
-            aFuncD(size, yC+ 1, 2, &zero, 0, output + 1, 2);
+            aFuncD(size,  &zero, 0, yC+ 1, 2, output + 1, 2);
        }
    }

@@ -560,7 +563,14 @@ namespace Aurora {
    }
    //operation /
    Matrix Matrix::operator/(float aScalar) const { return operatorMxA(&vsDivI, aScalar, *this);}
-    Matrix operator/(float aScalar, const Matrix &matrix) {return operatorAxM(&vsDivI, aScalar, matrix);}
+    Matrix operator/(float aScalar, const Matrix &aMatrix) {
+        if(!aMatrix.isComplex())return operatorAxM(&vsDivI, aScalar, aMatrix);
+        float *output = malloc(aMatrix.getDataSize(), true);
+        std::complex<float> v(aScalar,0);
+        vcDivI(aMatrix.getDataSize(),  &v, 0,(std::complex<float>*)aMatrix.getData(), 1, (std::complex<float>*)output, 1);
+        return Matrix::New(output, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2),
+                                   aMatrix.getValueType());
+    }
    Matrix Matrix::operator/(const Matrix &aMatrix) const {
        if (isScalar()){
            return getScalar()/aMatrix;
@@ -568,6 +578,9 @@ namespace Aurora {
        if (aMatrix.isScalar()){
            return (*this)/aMatrix.getScalar();
        }
+        if(aMatrix.isComplex()&& !isComplex()){
+            return operatorMxM_RR(&vsDivI,&vcDivI,aMatrix,complex(*this),true);
+        }
        return operatorMxM(vsDivI, vcDivI, *this, aMatrix);
    }
    Matrix &operator/(float aScalar, Matrix &&matrix) {
@@ -583,7 +596,8 @@ namespace Aurora {
        if (aMatrix.isScalar()){
            return (*this)/aMatrix.getScalar();
        }
-        return operatorMxM_RR(&vsDivI,&vcDivI,*this,std::forward<Matrix&&>(aMatrix));
+        Matrix leftMat = (aMatrix.isComplex()&& !isComplex())? complex(*this):(*this);
+        return operatorMxM_RR(&vsDivI,&vcDivI,leftMat,std::forward<Matrix&&>(aMatrix));
    }

    Matrix operator/(Matrix &&aMatrix, Matrix &aOther) {
@@ -593,7 +607,12 @@ namespace Aurora {
        if (aMatrix.isScalar()){
            return aMatrix.getScalar()/aOther;
        }
+        if( aOther.isComplex()&& !aMatrix.isComplex())
+        {
+            return operatorMxM_RR(&vsDivI,&vcDivI,aOther,std::forward<Matrix&&>(complex(aMatrix)),true);
+        }
        return operatorMxM_RR(&vsDivI,&vcDivI,aOther,std::forward<Matrix&&>(aMatrix),true);
+     
    }
    //operator ^ (pow)
    Matrix Matrix::operator^(int times) const { return operatorMxA(&vsPowI, times, *this);}
--- a/test/CudaMatrix_Test.cpp
+++ b/test/CudaMatrix_Test.cpp