feat: Add remove cuda option and cmak config support

2024-11-04 10:58:34 +08:00
21 changed files with 1554 additions and 2068 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,10 +8,9 @@ set(Aurora_USE_CUDA ON)
 if (Aurora_USE_CUDA)
 set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
-set(CUDA_TOOLKIT_ROOT /usr/local/cuda)
+
 enable_language(CUDA)
 find_package(CUDA)
 find_package(CUDAToolkit REQUIRED)
 add_definitions(-DUSE_CUDA)
 endif(Aurora_USE_CUDA)
@@ -44,12 +43,12 @@ target_link_libraries(Aurora PUBLIC $<LINK_ONLY:MKL::MKL>)
 target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX)
 target_link_libraries(Aurora PUBLIC matio)
 if (Aurora_USE_CUDA)
-target_include_directories(Aurora PRIVATE ./src  ${CUDA_INCLUDE_DIRS}) 
+target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include) 
 set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
    -arch=sm_75 --expt-extended-lambda 
 >)
-target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} ${CUDA_cufft_LIBRARY})
+target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
 target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY})
 target_link_libraries(Aurora PRIVATE ${CUDA_cusolver_LIBRARY})
 endif(Aurora_USE_CUDA)
--- a/cmake/AuroraConfig.cmake
+++ b/cmake/AuroraConfig.cmake
@@ -1,6 +1,10 @@
 set(MKL_INTERFACE_FULL intel_lp64)
 find_package(OpenMP REQUIRED)
 find_package(MKL CONFIG REQUIRED)
 if(${USE_CUDA})
 enable_language(CUDA)
 find_package(CUDAToolkit REQUIRED)
 endif()
 set(Aurora_MAJOR_VERSION 1)
 set(Aurora_MINOR_VERSION 0)
@@ -9,11 +13,17 @@ set(Aurora_BUILD_VERSION 0)
 get_filename_component(Aurora_DIR "${CMAKE_CURRENT_LIST_DIR}/" PATH)
 message("Aurora_DIR: ${Aurora_DIR}")
-file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/[AFSCM]*.cpp" "${Aurora_DIR}/src/Matrix*.cpp" "${Aurora_DIR}/src/*.cu")
+if(${USE_CUDA})
 file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" "${Aurora_DIR}/src/*.cu")
 set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
 else()
 set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX )
 file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" )
 endif()
 message( ${Aurora_Source})
 set(Aurora_INCLUDE_DIRS "${Aurora_DIR}/src" "${Aurora_DIR}/thirdparty/include" $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
 set(Aurora_Complie_Options $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS> )
-set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
+
 set(Aurora_FOUND TRUE)
 message(Aurora Found)
--- a/src/CudaMatrix.cpp
+++ b/src/CudaMatrix.cpp
@@ -1,9 +1,8 @@
 #include "AuroraDefs.h"
 #include "Function1D.cuh"
 #include <complex>
 #include <utility>
 #ifdef USE_CUDA
 #include "Function1D.cuh"
 #include "CudaMatrix.h"
 #include "Function.h"
--- a/src/CudaMatrixPrivate.cu
+++ b/src/CudaMatrixPrivate.cu
@@ -5,11 +5,129 @@
 #include <thrust/functional.h>
 #include <thrust/execution_policy.h>
 #include "AuroraDefs.h"
 #include "AuroraThrustIterator.cuh"
 using namespace thrust::placeholders;
 struct PowOp: public thrust::unary_function<float, float>{
    float exponent;
    PowOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return powf(x, exponent);
    }
 };
 struct CompareGOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareGOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent<x?1.0:.0);
    }
 };
 struct CompareGEOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareGEOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent<=x?1.0:.0);
    }
 };
 struct CompareEOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareEOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent==x?1.0:.0);
    }
 };
 struct CompareNEOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareNEOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent!=x?1.0:.0);
    }
 };
 struct CompareLOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareLOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent>x?1.0:.0);
    }
 };
 struct CompareLEOp: public thrust::unary_function<float, float>{
    float exponent;
    CompareLEOp(float v):exponent(v) {}
    void setExponent(float v){
        exponent = v;
    }
    __host__ __device__
    float operator()(const float& x)  {
        return (exponent>=x?1.0:.0);
    }
 };
 struct CompareAGOp{
    __host__ __device__
    float operator()(const float& x,const float& y)  {
        return x>y?1:0;
    }
 };
 struct CompareAGEOp{
    __host__ __device__
    float operator()(const float& x,const float& y)  {
        return x>=y?1:0;
    }
 };
 struct CompareAEOp{
    __host__ __device__
    float operator()(const float& x,const float& y)  {
        return x==y?1:0;
    }
 };
 struct CompareANEOp{
    __host__ __device__
    float operator()(const float& x,const float& y)  {
        return x!=y?1:0;
    }
 };
 typedef thrust::complex<float> complexf;
@@ -533,51 +651,29 @@ void unaryPow(float* in1, float N,float* out, unsigned long length){
        thrust::transform(thrust::device,in1,in1+length,out,op);
        return;
    }
-    auto lambdaPow = [N] __host__ __device__(float x) {
+    thrust::transform(thrust::device,in1,in1+length,out,PowOp(N));
            return powf(x,N);
    };
    thrust::transform(thrust::device,in1,in1+length,out,lambdaPow);
 }
 void unaryCompare(float* in1, const float& in2, float* out, unsigned long length, int type){
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareGOp(in2));
                {
                    return in2 < x ? 1.0 : .0;
                });
            break;
        case GE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareGEOp(in2));
            {
                return in2 <= x ? 1.0 : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareEOp(in2));
            {
                return in2 == x ? 1.0 : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareNEOp(in2));
            {
                return in2 != x ? 1.0 : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareLEOp(in2));
            {
                return in2 >= x ? 1.0 : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2]__host__ __device__(const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareLOp(in2));
            {
                return in2 > x ? 1.0 : .0;
            });
            break;
        default:
            break;
@@ -587,89 +683,51 @@ void unaryCompare(const float& in1, float* in2, float* out, unsigned long length
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareLOp(in1));
            {
                return in1 > x ? 1.0 : .0;
            });
            break;
        case GE:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareLEOp(in1));
            {
                return in1 >= x ? 1.0 : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareEOp(in1));
            {
                return in1 == x ? 1.0 : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareNEOp(in1));
            {
                return in1 != x ? 1.0 : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in2,in2+length,out, [in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareGEOp(in1));
            {
                return in1 <= x ? 1.0 : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareGOp(in1));
            {
                return in1 < x ? 1.0 : .0;
            });
            break;
        default:
            break;
    }
 }
 void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type){
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGOp());
            {
                return x > y ? 1. : .0;
            });
            break;
        case GE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGEOp());
            {
                return x >= y ? 1. : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAEOp());
            {
                return x == y ? 1. : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareANEOp());
            {
                return x != y ? 1. : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__ (float x, float y)
+            thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGEOp());
            {
                return x <= y ? 1. : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, [] __host__ __device__ (float x, float y)
+            thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGOp());
            {
                return x < y ? 1. : .0;
            });
            break;
        default:
            break;
    }
 }
 void thrustFill(float* aBegin, float* aEnd, float aValue)
--- a/src/Function1D.cpp
+++ b/src/Function1D.cpp
@@ -35,7 +35,55 @@ namespace {
    uint CONVERT_ADD_VALUE = UINT32_MAX - 4095;
-    inline void convertValue(short* aValue ,float* des){
+    inline void convertValue(float aValue ,float* des){
      float value = aValue;
      ushort *exponentPtr = (ushort *)&value;
      exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[2] = (exponentPtr[2] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[3] = (exponentPtr[3] >> 11) & CONVERT_AND_VALUE;
      float signValue = aValue;
      short *signPtr = (short *)&signValue;
      uint sign_bit[4] = {
          (uint)(signPtr[0] < 0 ? 1 : 0), (uint)(signPtr[1] < 0 ? 1 : 0),
          (uint)(signPtr[2] < 0 ? 1 : 0), (uint)(signPtr[3] < 0 ? 1 : 0)};
      float fraction3Value = aValue;
      ushort *fraction3Ptr = (ushort *)&fraction3Value;
      fraction3Ptr[0] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[1] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[2] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[3] &= CONVERT_AND_VALUE_2;
      uint hidden_bit[4] = {
          sign_bit[0] * (!exponentPtr[0] ? 1 : 0) * CONVERT_MUL_VALUE +
              ((!sign_bit[0] && exponentPtr[0]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[1] * (!exponentPtr[1] ? 1 : 0) * 2048 +
              ((!sign_bit[1] && exponentPtr[1]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[2] * (!exponentPtr[2] ? 1 : 0) * CONVERT_MUL_VALUE +
              ((!sign_bit[2] && exponentPtr[2]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[3] * (!exponentPtr[3] ? 1 : 0) * 2048 +
              ((!sign_bit[3] && exponentPtr[3]) ? 1 : 0) * CONVERT_MUL_VALUE,
      };
      int outputPtr[4] = {0};
      uint temp = fraction3Ptr[0] + hidden_bit[0] + sign_bit[0] * CONVERT_ADD_VALUE;
      outputPtr[0] = exponentPtr[0] > 1 ? (temp << (exponentPtr[0] - 1))
                                  : (temp >> std::abs(exponentPtr[0] - 1));
      temp = fraction3Ptr[1] + hidden_bit[1] + sign_bit[1] * CONVERT_ADD_VALUE;
      outputPtr[1] = exponentPtr[1] > 1 ? (temp << (exponentPtr[1] - 1))
                                  : (temp >> std::abs(exponentPtr[1] - 1));
      temp = fraction3Ptr[2] + hidden_bit[2] + sign_bit[2] * CONVERT_ADD_VALUE;
      outputPtr[2] = exponentPtr[2] > 1 ? (temp << (exponentPtr[2] - 1))
                                  : (temp >> std::abs(exponentPtr[2] - 1));
      temp = fraction3Ptr[3] + hidden_bit[3] + sign_bit[3] * CONVERT_ADD_VALUE;
      outputPtr[3] = exponentPtr[3] > 1 ? (temp << (exponentPtr[3] - 1))
                                  : (temp >> std::abs(exponentPtr[3] - 1));
    des[0] = outputPtr[0];
    des[1] = outputPtr[1];
    des[2] = outputPtr[2];
    des[3] = outputPtr[3];
    }
    inline void convertValue2(short* aValue ,float* des){
      ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]};
      exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
@@ -568,9 +616,7 @@ Matrix Aurora::acosd(const Matrix& aMatrix)
    {
        resultData[i] = resultData[i] * 180 / PI;
    }
-    Matrix result = Matrix::New(resultData, aMatrix);
+    return Matrix::New(resultData, aMatrix);
    nantoval(result, 0);
    return result;
 }
 Matrix Aurora::conj(const Matrix& aMatrix)
@@ -1050,14 +1096,14 @@ Matrix Aurora::convertfp16tofloat(short* aData, int aRows, int aColumns)
    #pragma omp parallel for
    for (size_t i = 0; i < quaterSize; i+=8) {
        //循环展开以避免过度的线程调用
-        if (i  < quaterSize)::convertValue((short*)(input+i*4), output + (i) * 4);
+        if (i  < quaterSize)::convertValue2((short*)(input+i*4), output + (i) * 4);
-        if (i+1  < quaterSize)::convertValue((short*)(input+(i+1)*4), output + (i+1) * 4);
+        if (i+1  < quaterSize)::convertValue2((short*)(input+(i+1)*4), output + (i+1) * 4);
-        if (i+2  < quaterSize)::convertValue((short*)(input+(i+2)*4), output + (i+2) * 4);
+        if (i+2  < quaterSize)::convertValue2((short*)(input+(i+2)*4), output + (i+2) * 4);
-        if (i+3  < quaterSize)::convertValue((short*)(input+(i+3)*4), output + (i+3) * 4);
+        if (i+3  < quaterSize)::convertValue2((short*)(input+(i+3)*4), output + (i+3) * 4);
-        if (i+4  < quaterSize)::convertValue((short*)(input+(i+4)*4), output + (i+4) * 4);
+        if (i+4  < quaterSize)::convertValue2((short*)(input+(i+4)*4), output + (i+4) * 4);
-        if (i+5  < quaterSize)::convertValue((short*)(input+(i+5)*4), output + (i+5) * 4);
+        if (i+5  < quaterSize)::convertValue2((short*)(input+(i+5)*4), output + (i+5) * 4);
-        if (i+6  < quaterSize)::convertValue((short*)(input+(i+6)*4), output + (i+6) * 4);
+        if (i+6  < quaterSize)::convertValue2((short*)(input+(i+6)*4), output + (i+6) * 4);
-        if (i+7  < quaterSize)::convertValue((short*)(input+(i+7)*4), output + (i+7) * 4);
+        if (i+7  < quaterSize)::convertValue2((short*)(input+(i+7)*4), output + (i+7) * 4);
    }
    return Matrix::New(output,aRows,aColumns,1);
 }
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
@@ -8,12 +8,6 @@
 #include <cstddef>
 #include <cstdlib>
 #include <sys/types.h>
 //this 3 header, only need by cuda version >= 12
 #include <thrust/unique.h>
 #include <thrust/sort.h>
 #include <thrust/set_operations.h>
 #include <thrust/device_vector.h>
 #include <thrust/transform.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -55,7 +49,6 @@ __global__ void complexKernel(float *aInputRealData, float *aInputImagData, floa
    }
 }
 CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
 {
    if(aMatrix.isComplex())
@@ -360,8 +353,7 @@ CudaMatrix Aurora::sign(const CudaMatrix &&aMatrix)
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
-void Aurora::nantoval(CudaMatrix &aMatrix, float val)
+void Aurora::nantoval(CudaMatrix& aMatrix,float val){
 {
    auto lambda = [=] __host__ __device__ (const float& x){
        return ::isnan(x)?val:x;
    };
@@ -369,8 +361,7 @@ void Aurora::nantoval(CudaMatrix &aMatrix, float val)
        aMatrix.getData()+aMatrix.getDataSize(),aMatrix.getData(),lambda);
 }
-CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix)
+CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){
 {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
@@ -384,8 +375,7 @@ CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix)
        aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());    
 }
-CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix)
+CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){
 {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
@@ -398,15 +388,13 @@ CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix)
        aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());  
 }
-void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue)
+void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){
 {
    if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex())
    {
        std::cerr<<"padding only support real vector"<<std::endl;
        return;
    }
-    if (aMatrix.getDataSize() > aIndex)
+    if (aMatrix.getDataSize()>aIndex){
    {
        aMatrix.setValue(aIndex,  aValue);
        return;
    }
@@ -419,13 +407,12 @@ void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue)
    aMatrix=CudaMatrix::fromRawData(data,size,1,1,aMatrix.getValueType());
 }
-CudaMatrix Aurora::auroraNot(const CudaMatrix &aMatrix)
+
-{
+CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix){
    return auroraNot(std::forward<CudaMatrix&&>(aMatrix.deepCopy()));
 }
-CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix)
+CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){
 {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
@@ -438,6 +425,7 @@ CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix)
        aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());    
 }
 void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op)
 {
    switch (op)
@@ -450,40 +438,35 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newV
        thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
        break;
    }
-    case NG:
+    case NG:{
    {
        auto lambda = [=] __host__ __device__ (const float& x){
            return x<=compareValue?newValue:x;
        };
        thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
        break;
    }
-    case EQ:
+    case EQ:{
    {
        auto lambda = [=] __host__ __device__ (const float& x){
            return x==compareValue?newValue:x;
        };
        thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
        break;
    }
-    case NE:
+    case NE:{
    {
        auto lambda = [=] __host__ __device__ (const float& x){
            return x!=compareValue?newValue:x;
        };
        thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
        break;
    }
-    case NL:
+    case NL:{
    {
        auto lambda = [=] __host__ __device__ (const float& x){
            return x>=compareValue?newValue:x;
        };
        thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
        break;
    }
-    case LT:
+    case LT:{
    {
        auto lambda = [=] __host__ __device__ (const float& x){
            return x<compareValue?newValue:x;
        };
@@ -493,10 +476,10 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newV
    default:
        break;
    }
 }
-void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, float compareValue, float newValue, CompareOp op)
+void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,float compareValue, float newValue,CompareOp op){
 {
    switch (op)
    {
    case GT:
@@ -510,8 +493,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
            lambda);
        break;
    }
-    case NG:
+    case NG:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<=compareValue?newValue:y;
        };
@@ -521,8 +503,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
            lambda);
        break;
    }
-    case EQ:
+    case EQ:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x==compareValue?newValue:y;
        };
@@ -532,8 +513,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
            lambda);
        break;
    }
-    case NE:
+    case NE:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x!=compareValue?newValue:y;
        };
@@ -543,19 +523,17 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
            lambda);
        break;
    }
-    case NL:
+    case NL:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x>=compareValue?newValue:y;
        };
-        thrust::transform(aCompareMatrix.getData(),
+        thrust::transform(thrust::device,aCompareMatrix.getData(),
            aCompareMatrix.getData()+aValueMatrix.getDataSize(),
            aValueMatrix.getData(), aValueMatrix.getData(),
            lambda);
        break;
    }
-    case LT:
+    case LT:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<compareValue?newValue:y;
        };
@@ -570,8 +548,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
    }
 }
-void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherCompareMatrix, float newValue, CompareOp op)
+void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op){
 {
    switch (op)
    {
    case GT:
@@ -585,8 +562,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
            lambda);
        break;
    }
-    case NG:
+    case NG:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<=y?newValue:x;
        };
@@ -596,8 +572,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
            lambda);
        break;
    }
-    case EQ:
+    case EQ:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x==y?newValue:x;
        };
@@ -607,8 +582,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
            lambda);
        break;
    }
-    case NE:
+    case NE:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x!=y?newValue:x;
        };
@@ -618,8 +592,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
            lambda);
        break;
    }
-    case NL:
+    case NL:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x>=y?newValue:x;
        };
@@ -629,8 +602,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
            lambda);
        break;
    }
-    case LT:
+    case LT:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<y?newValue:x;
        };
@@ -660,8 +632,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
            lambda);
        break;
    }
-    case NG:
+    case NG:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<=compareValue?y:x;
        };
@@ -671,8 +642,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
            lambda);
        break;
    }
-    case EQ:
+    case EQ:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x==compareValue?y:x;
        };
@@ -682,8 +652,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
            lambda);
        break;
    }
-    case NE:
+    case NE:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x!=compareValue?y:x;
        };
@@ -693,8 +662,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
            lambda);
        break;
    }
-    case NL:
+    case NL:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x>=compareValue?y:x;
        };
@@ -704,8 +672,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
            lambda);
        break;
    }
-    case LT:
+    case LT:{
    {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y){
            return x<compareValue?y:x;
        };
@@ -811,6 +778,7 @@ __global__ void repMat3DKernel(float *aInputData, unsigned int aInputRows, unsig
    {
        aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
    }
 }
 CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
@@ -988,17 +956,6 @@ __global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInpu
    }
 }
 __global__ void conjInplaceKernel(float *aInputData, unsigned int aInputSize)
 {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize)
    {
        unsigned int index = idx * 2;
        aInputData[index + 1] = -aInputData[index + 1];
    }
 }
 CudaMatrix Aurora::conj(const CudaMatrix& aMatrix)
 {
    if(!aMatrix.isComplex())
@@ -1014,18 +971,6 @@ CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 CudaMatrix Aurora::conj(CudaMatrix &&aMatrix)
 {
    if (!aMatrix.isComplex())
    {
        return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
    }
    size_t size = aMatrix.getDataSize();
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    conjInplaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), size);
    cudaDeviceSynchronize();
    return aMatrix;
 }
 float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
 {
@@ -1129,20 +1074,13 @@ float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
            }
        }
-        if (d_S)
+        if (d_S ) cudaFree(d_S);
-            cudaFree(d_S);
+        if (d_U ) cudaFree(d_U);
-        if (d_U)
+        if (d_VT) cudaFree(d_VT);
-            cudaFree(d_U);
+        if (devInfo) cudaFree(devInfo);
-        if (d_VT)
+        if (d_work) cudaFree(d_work);
-            cudaFree(d_VT);
+        if (cusolverH) cusolverDnDestroy(cusolverH);
-        if (devInfo)
+        if (stream) cudaStreamDestroy(stream);
            cudaFree(devInfo);
        if (d_work)
            cudaFree(d_work);
        if (cusolverH)
            cusolverDnDestroy(cusolverH);
        if (stream)
            cudaStreamDestroy(stream);
        return resultValue;
    }
 }
@@ -1167,6 +1105,7 @@ __global__ void transposeKernel(float *aInputData, float *aOutput, unsigned int
    }
 }
 CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
 {
    //not surpport for 3 dims.
@@ -1252,6 +1191,7 @@ __global__ void vertcatKernel(float *aInputData1, unsigned int aInputData1RowSiz
            aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
        }
    }
 }
 CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
@@ -1571,10 +1511,12 @@ CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, f
 struct compareMatrixByRows
 {
    compareMatrixByRows(unsigned int aSize)
-        : mSize(aSize) {
+        : mSize(aSize)
    {
    };
    unsigned int mSize;
-    __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const
+    __host__ __device__
    bool operator()(const float* aVector1, const float* aVector2) const
    {
        for(unsigned int i=0; i<mSize; ++i)
        {
@@ -1594,10 +1536,12 @@ struct compareMatrixByRows
 struct uniqueMatrixByRows
 {
    uniqueMatrixByRows(unsigned int aSize)
-        : mSize(aSize) {
+        : mSize(aSize)
    {
    };
    unsigned int mSize;
-    __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const
+    __host__ __device__
    bool operator()(const float* aVector1, const float* aVector2) const
    {
        for(unsigned int i=0; i<mSize; ++i)
        {
@@ -1613,11 +1557,14 @@ struct uniqueMatrixByRows
 struct equalMatrixByRows
 {
    equalMatrixByRows(unsigned int aSize, float* aValue)
-        : mSize(aSize), mValue(aValue) {
+        : mSize(aSize)
        , mValue(aValue)
    {
    };
    unsigned int mSize;
    float* mValue;
-    __host__ __device__ bool operator()(const float *aVector) const
+    __host__ __device__
    bool operator()(const float* aVector) const
    {
        for(unsigned int i=0; i<mSize; ++i)
        {
@@ -1630,6 +1577,8 @@ struct equalMatrixByRows
    }
 };
 CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexResult)
 {
    CudaMatrix transposeMatrix = transpose(aMatrix);
@@ -1668,14 +1617,12 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix &aMatrix, CudaMatrix &aIndexRes
    return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
 }
-__global__ void convertValueKernel(float *aSrc, float *aDes, unsigned int size)
+__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
 {
    __shared__ ushort CONVERT_AND_VALUE;
    __shared__ ushort CONVERT_AND_VALUE_2;
    __shared__ ushort CONVERT_MUL_VALUE;
    __shared__ unsigned int CONVERT_ADD_VALUE;
-    if (threadIdx.x == 0)
+    if (threadIdx.x == 0){
    {
        CONVERT_AND_VALUE = 15u;
        CONVERT_AND_VALUE_2 = 2047u;
        CONVERT_MUL_VALUE = 2048u;
@@ -1683,8 +1630,7 @@ __global__ void convertValueKernel(float *aSrc, float *aDes, unsigned int size)
    }
    __syncthreads();
    unsigned int idx = blockIdx.x*blockDim.x +threadIdx.x;
-    if (idx >= size)
+    if (idx >= size) return; 
        return;
    short value = aSrc[idx];
    ushort exponent=(ushort)value;
    exponent = (exponent >> 11) & CONVERT_AND_VALUE;
--- a/src/Function1D.cuh
+++ b/src/Function1D.cuh
@@ -63,8 +63,6 @@ namespace Aurora
    CudaMatrix conj(const CudaMatrix& aMatrix);
    CudaMatrix conj(CudaMatrix&& aMatrix);
    float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
    CudaMatrix transpose(const CudaMatrix& aMatrix);
@@ -126,4 +124,5 @@ namespace Aurora
    void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op);
    void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op);
 }
 #endif //AURORA_CUDA_FUNCTION1D_H
--- a/src/Function2D.cpp
+++ b/src/Function2D.cpp
@@ -1034,87 +1034,3 @@ Matrix Aurora::sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix>
    delete [] strides;
    return Matrix::New(output,returnVectorSize,1,1);
 }
 void Aurora::findPeaksHost(const Matrix & aData,  int aNPeaks, float aMinPeakHeight, float MinPeakProminece,
        int* outIndex)
 {
    int  signalSize = aData.getDimSize(0);
    int  signalCount = aData.getDimSize(1);
    #pragma omp parallel for
    for (size_t threadIndex = 0; threadIndex < signalCount; threadIndex++)
    {
        float* dataPointer  = aData.getData() + threadIndex*signalSize ;
        float newPeak = dataPointer[0];
        float newValley = dataPointer[0];
        int peakIndex = 0;
        float higherValley;
        int indexs[32];
        float values[32];
        for (size_t i = 0; i < aNPeaks; i++)
        {
            indexs[i] = signalSize;
            values[i] = 0;
        }
        int save_index=0;
        for (int i = 1; i < signalSize - 1; ++i)
        {
            // find peaks
            if (dataPointer[i] > dataPointer[i - 1] && dataPointer[i] > dataPointer[i + 1])
            {
                newPeak = dataPointer[i];
                peakIndex  = i;
            };
            // find valley
            if (dataPointer[i] < dataPointer[i - 1] && dataPointer[i] < dataPointer[i + 1])
            {
                higherValley = std::max(newValley, dataPointer[i]);
                newValley = dataPointer[i];
                if (newPeak >= aMinPeakHeight)
                {
                    float prominece = newPeak - higherValley;
                    if (prominece >= MinPeakProminece)
                    {
                        if (save_index < aNPeaks)
                        {
                            values[save_index] = newPeak;
                            indexs[save_index] = peakIndex;
                            save_index++;
                        }
                        else
                        {
                            for (size_t j = 0; j < aNPeaks; j++)
                            {
                                if (values[j] < newPeak)
                                {
                                    std::swap(values[j], newPeak);
                                    std::swap(indexs[j], peakIndex);
                                }
                            }
                        }
                    }
                }
            }
        }
        if (save_index>=aNPeaks)
        {
            for (size_t i = 0; i < aNPeaks; i++)
            {
                for (size_t j = i+1; j < aNPeaks; j++)
                {
                    if (indexs[i]>indexs[j])
                    {
                        std::swap(values[j], values[i]);
                        std::swap(indexs[j], indexs[i]);
                    }
                }
            }
        }
        for (size_t i = 0; i < aNPeaks; i++)
        {
            // aOutPeaks[threadIndex*aNPeaks+i] = values[i];
            outIndex[threadIndex*aNPeaks+i] = indexs[i];
        }
    }
 }
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
--- a/src/Function2D.cuh
+++ b/src/Function2D.cuh
@@ -64,7 +64,6 @@ namespace Aurora
    CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(CudaMatrix && aMatrix);
    CudaMatrix hilbert(const CudaMatrix &aMatrix);
@@ -87,19 +86,6 @@ namespace Aurora
     */
    CudaMatrix ifft_symmetric(const CudaMatrix &aMatrix,long aLength);
    CudaMatrix valid(const CudaMatrix& aData, const CudaMatrix aValid);
    /**
     * findPeaks 按列进行峰查找和匹配
     * @attention 不要给aOutIndexs提前分配内存
     * @param aData 输入
     * @param aNpeaks 峰数量
     * @param aMinPeakHeight 最小高度
     * @param aMinPeakProminence 最小相对高度
     * @param aOutIndexs 空指针，会在函数内分配device内存
     * @return 筛选出的峰高度
    */
    CudaMatrix findPeaks(const CudaMatrix& aData, int aNpeaks, float aMinPeakHeight, float aMinPeakProminence, int** aOutIndexs);
 }
 #endif // __FUNCTION2D_CUDA_H__
--- a/src/Function2D.h
+++ b/src/Function2D.h
@@ -107,10 +107,10 @@ namespace Aurora
     * 当第一列包含重复的元素时，sortrows 会根据下一列中的值进行排序，并对后续的相等值重复此行为。
     * @attention 目前不支持三维，不支持复数
     * @param aMatrix 目标矩阵
-     * @param indexMatrix 排序后各行的原索引矩阵指针，必须要有
+     * @param indexMatrix 排序后各行的原索引矩阵指针，非必须
     * @return 排序后矩阵
     */
-    Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix);
+    Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix=nullptr);
    /**
     * 对矩阵求中间值 按列, 目前不支持三维，不支持复数
@@ -178,7 +178,7 @@ namespace Aurora
     * @return
     */
    Matrix sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix> aSliceIdxs);
-    void findPeaksHost(const Matrix & aData,  int aNPeaks, float aMinPeakHeight, float MinPeakProminece,int* outIndex);
+    
 };
 #endif // AURORA_FUNCTION2D_H
--- a/src/Function3D.cpp
+++ b/src/Function3D.cpp
@@ -5,10 +5,12 @@
 #include "Function.h"
 #ifdef USE_CUDA
 #include "CudaMatrix.h"
 #include "CudaMatrixPrivate.cuh"
 #include <cuda_runtime.h>
-#endif // USE_CUDA
+#include "CudaMatrixPrivate.cuh"
 #include "CudaMatrix.h"
 #endif
 //必须在Eigen之前
 #include "AuroraDefs.h"
@@ -72,6 +74,7 @@ Matrix Aurora::ones(int aRow, int aColumn, int aSlice) {
    return Matrix::New(data,rowSize,colSize,aSlice);
 }
 Matrix Aurora::ones(int aSquareRow) {
    return Aurora::ones(aSquareRow, aSquareRow);
 }
@@ -92,6 +95,7 @@ Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) {
    return Matrix::New(data,rowSize,colSize,sliceSize);
 }
 Matrix Aurora::zeros(int aSquareRow) {
    return Aurora::zeros(aSquareRow, aSquareRow);
 }
@@ -214,7 +218,7 @@ Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix&
    return result;
 }
-#if USE_CUDA
+#ifdef USE_CUDA
 CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){
    if (aRow == 0 || aColumn == 0)
    {
@@ -251,7 +255,6 @@ CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) {
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
 }
 CudaMatrix Aurora::zerosCuda(int aSquareRow) {
    return Aurora::zerosCuda(aSquareRow, aSquareRow);
 }
--- a/src/Function3D.cu
+++ b/src/Function3D.cu
@@ -1,82 +0,0 @@
 #include "Function3D.cuh"
 using namespace Aurora;
 __global__ void interp3Kernel(cudaTextureObject_t aTexObj, float* aOutputData, float aStartX, float aDx, float aEndX, float aStartY, float aDy
                              , float aEndY, float aStartZ, float aDz, float aEndZ, float* aNewX, float* aNewY, float* aNewZ 
                              , int aOutputRowSize, int aOutputColumnSize, int aOutputSliceSize, float aOutValue)
 {
    int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    int zIndex = blockIdx.z * blockDim.z + threadIdx.z;
    if(xIndex > aOutputRowSize - 1 ||  yIndex > aOutputColumnSize - 1 || zIndex > aOutputSliceSize - 1)
    {
        return;
    }
    size_t index = zIndex * aOutputRowSize * aOutputColumnSize + yIndex * aOutputRowSize + xIndex;
    float x = aNewX[index];
    float y = aNewY[index];
    float z = aNewZ[index];
    if(x > aEndX || x < aStartX || y > aEndY || y < aStartY || z > aEndZ || z < aStartZ)
    {
        aOutputData[index] = aOutValue;
    }
    else
    {
        aOutputData[index] = tex3D<float>(aTexObj, (x - aStartX) / aDx + 0.5, (y - aStartY) / aDy + 0.5, (z - aStartZ) / aDz + 0.5);
    }
 }
 CudaMatrix Aurora::interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
                           float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
                           const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue)
 {
    cudaTextureObject_t texObj;
    size_t dimX = aValue.getDimSize(1);
    size_t dimY = aValue.getDimSize(0);
    size_t dimZ = aValue.getDimSize(2);
    cudaExtent extent = make_cudaExtent(dimX, dimY, dimZ);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cuArray;
    cudaMalloc3DArray(&cuArray, &channelDesc, extent);
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr(aValue.getData(), dimX * sizeof(float), dimX, dimY);
    copyParams.dstArray = cuArray;
    copyParams.extent = extent;
    copyParams.kind = cudaMemcpyDeviceToDevice;
    cudaMemcpy3D(&copyParams);
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.addressMode[2] = cudaAddressModeClamp;
    texDesc.readMode = cudaReadModeElementType;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
    int row = aNewX.getDimSize(0);
    int column = aNewX.getDimSize(1);
    int slice = aNewX.getDimSize(2);
    dim3 blockDim(4,4,4);
    dim3 gridDim(row / 4 + 1, column / 4 + 1, slice / 4 + 1);
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * row * column * slice);
    CudaMatrix result = Aurora::CudaMatrix::fromRawData(data, row, column, slice);
    interp3Kernel<<<gridDim, blockDim>>>(texObj, data, aStartX, aDx, aEndX, aStartY, aDy, aEndY, aStartZ, aDz, aEndZ, 
                                         aNewX.getData(), aNewY.getData(), aNewZ.getData(), row, column, slice, aOutValue);
    cudaDeviceSynchronize();
    return result;
 }
--- a/src/Function3D.cuh
+++ b/src/Function3D.cuh
@@ -1,13 +0,0 @@
 #ifndef __FUNCTION3D_CUDA__
 #define __FUNCTION3D_CUDA__
 #include "CudaMatrix.h"
 #include "AuroraDefs.h"
 namespace Aurora
 {
    CudaMatrix interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
                       float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
                       const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue);
 }
 #endif // __FUNCTION3D_CUDA_H__
--- a/src/Function3D.h
+++ b/src/Function3D.h
@@ -4,10 +4,8 @@
 #include "Matrix.h"
 #include "Function1D.h"
 #if USE_CUDA
 #include "CudaMatrix.h"
-#endif
+
 namespace Aurora {
    /**
@@ -19,6 +17,8 @@ namespace Aurora {
     */
    Matrix ones(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
    /**
     * 创建全部为1的方阵
     * @param aSquareRow
@@ -26,6 +26,8 @@ namespace Aurora {
     */
    Matrix ones(int aSquareRow);
    CudaMatrix onesCuda(int aSquareRow);
    /**
     * 创建全部为0的数组，矩阵
     * @param aRow 行数，必须大于0
@@ -35,30 +37,25 @@ namespace Aurora {
     */
    Matrix zeros(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
    /**
     * 创建全部为0的方阵
     * @param aSquareRow
     * @return 全部为0的方阵
     */
    Matrix zeros(int aSquareRow);
    CudaMatrix zerosCuda(int aSquareRow);
    Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
    Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval);
    Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
    Matrix size(const Matrix &aMatrix);
    CudaMatrix size(const CudaMatrix &aMatrix);
    int size(const Matrix &aMatrix,int dims);
    #if USE_CUDA
    CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix onesCuda(int aSquareRow);
    CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix zerosCuda(int aSquareRow);
    CudaMatrix size(const CudaMatrix &aMatrix);
    int size(const CudaMatrix &aMatrix,int dims);
-    #endif
+
 };
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -1,4 +1,6 @@
 #include "Matrix.h"
 #include <cmath>
 #include <complex>
 #include <cstddef>
--- a/src/Matrix.h
+++ b/src/Matrix.h
@@ -10,10 +10,7 @@
 namespace Aurora {
    const int $ = -1;
    #if USE_CUDA
    class CudaMatrix;
    #endif
    class Matrix {
    public:
@@ -289,9 +286,8 @@ namespace Aurora {
        void forceReshape(int rows, int columns, int slices);
        #if USE_CUDA
        CudaMatrix toDeviceMatrix() const;
-        #endif
+
    private:
        ValueType mValueType = Normal;
--- a/src/main.cxx
+++ b/src/main.cxx
@@ -7,19 +7,15 @@
 #include <complex>
 #include "Matrix.h"
 #include "CudaMatrix.h"
 #include "Function.h"
 #include "Function1D.h"
 #include "Function2D.h"
 #include "Function3D.h"
 #include "MatlabReader.h"
 #if USE_CUDA
 #include "CudaMatrix.h"
 #endif //USE_CUDA
 int main()
 {
    #if USE_CUDA
    auto A = Aurora::zeros(1000,1,1);
    auto B = Aurora::zeros(1000,1,1);
    for (size_t i = 0; i < 1000; i++)
@@ -119,6 +115,5 @@ int main()
            }
        }
    }
    #endif //USE_CUDA
    return 0;
 }
--- a/test/CudaMatrix_Test.cpp
+++ b/test/CudaMatrix_Test.cpp
@@ -2558,55 +2558,7 @@ TEST_F(CudaMatrix_Test, MatrixCompare){
    }
    {
        auto R= (9!=B);
-        auto dhR = (dB!=9).toHostMatrix();
+        auto dhR = (9!=dB).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
        {
        auto R= (9<B);
        auto dhR = (dB>9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9>B);
        auto dhR = (dB<9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9<=B);
        auto dhR = (dB>=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9>=B);
        auto dhR = (dB<=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9==B);
        auto dhR = (dB == 9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9!=B);
        auto dhR = (dB!=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
--- a/test/Function2D_Cuda_Test.cpp
+++ b/test/Function2D_Cuda_Test.cpp
@@ -5,7 +5,6 @@
 #include "Function.h"
 #include "Matrix.h"
 #include "TestUtility.h"
 #include "MatlabReader.h"
 #include "Function2D.h"
 #include "Function2D.cuh"
@@ -19,15 +18,11 @@ protected:
    static void TearDownTestCase(){
    }
    public:
    Aurora::Matrix mSignal;
    Aurora::CudaMatrix dmSignal;
    Aurora::Matrix B;
    Aurora::CudaMatrix dB;
    void SetUp(){
-        MatlabReader m("/home/krad/TestData/peaks.mat");
+
        mSignal = m.read("AScan_env_norm");
        dmSignal = mSignal.toDeviceMatrix();
    }
    void TearDown(){
    }
@@ -1002,17 +997,3 @@ TEST_F(Function2D_Cuda_Test, hilbert) {
        EXPECT_NEAR(ret1[i], ret2.getValue(i), 0.01);
    }
 }
 TEST_F(Function2D_Cuda_Test, findPeaks) {
    int* indexs = new int[mSignal.getDimSize(1)*10];
    auto ret1 = Aurora::findPeaks(dmSignal,10, 0.2, 0.05,indexs);
    auto reH = ret1.toHostMatrix();
    for(unsigned int i=0; i<10; ++i)
    {
        printf("%d,",indexs[i]);
    }
    delete [] indexs;
 }
--- a/test/Function2D_Test.cpp
+++ b/test/Function2D_Test.cpp
@@ -2,8 +2,6 @@
 #include <vector>
 #include "TestUtility.h"
 #include "MatlabReader.h"
 #include "Matrix.h"
 #include "Function.h"
 #include "Function1D.h"
@@ -18,11 +16,7 @@ protected:
    }
    static void TearDownTestCase(){
    }
    public:
    Aurora::Matrix mSignal;
    void SetUp(){
        MatlabReader m("/home/krad/TestData/peaks.mat");
        mSignal = m.read("AScan_env_norm");
    }
    void TearDown(){
    }
@@ -579,15 +573,3 @@ TEST_F(Function2D_Test, sub2ind) {
 }
 TEST_F(Function2D_Test, findPeaks) {
    int* indexs = new int[mSignal.getDimSize(1)*10];
    Aurora::findPeaksHost(mSignal,10, 0.2, 0.05,indexs);
    for(unsigned int i=0; i<10; ++i)
    {
        printf("%d,",indexs[i]);
    }
    delete [] indexs;
 }