feat: Add remove-CUDA option and CMake config support

2024-11-04 10:58:34 +08:00
21 changed files with 1554 additions and 2068 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,10 +8,9 @@ set(Aurora_USE_CUDA ON)
 if (Aurora_USE_CUDA)
 set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
-set(CUDA_TOOLKIT_ROOT /usr/local/cuda)
+
 enable_language(CUDA)
 find_package(CUDA)
 find_package(CUDAToolkit REQUIRED)
 add_definitions(-DUSE_CUDA)
 endif(Aurora_USE_CUDA)
@@ -44,12 +43,12 @@ target_link_libraries(Aurora PUBLIC $<LINK_ONLY:MKL::MKL>)
 target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX)
 target_link_libraries(Aurora PUBLIC matio)
 if (Aurora_USE_CUDA)
-target_include_directories(Aurora PRIVATE ./src  ${CUDA_INCLUDE_DIRS}) 
+target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include) 
 set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
    -arch=sm_75 --expt-extended-lambda 
 >)
-target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} ${CUDA_cufft_LIBRARY})
+target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
 target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY})
 target_link_libraries(Aurora PRIVATE ${CUDA_cusolver_LIBRARY})
 endif(Aurora_USE_CUDA)
--- a/cmake/AuroraConfig.cmake
+++ b/cmake/AuroraConfig.cmake
@@ -1,6 +1,10 @@
 set(MKL_INTERFACE_FULL intel_lp64)
 find_package(OpenMP REQUIRED)
 find_package(MKL CONFIG REQUIRED)
 # CUDA is optional: the CUDA language and toolkit are only pulled in when the
 # consumer sets USE_CUDA to a true value. The variable is named directly
 # (not expanded with ${}) so if() evaluates it exactly once.
 if(USE_CUDA)
 enable_language(CUDA)
 find_package(CUDAToolkit REQUIRED)
 endif()
 set(Aurora_MAJOR_VERSION 1)
 set(Aurora_MINOR_VERSION 0)
@@ -9,11 +13,17 @@ set(Aurora_BUILD_VERSION 0)
 get_filename_component(Aurora_DIR "${CMAKE_CURRENT_LIST_DIR}/" PATH)
 message("Aurora_DIR: ${Aurora_DIR}")
-file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/[AFSCM]*.cpp" "${Aurora_DIR}/src/Matrix*.cpp" "${Aurora_DIR}/src/*.cu")
+if(${USE_CUDA})
 file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" "${Aurora_DIR}/src/*.cu")
 set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
 else()
 set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX )
 file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" )
 endif()
 message( ${Aurora_Source})
 set(Aurora_INCLUDE_DIRS "${Aurora_DIR}/src" "${Aurora_DIR}/thirdparty/include" $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
 set(Aurora_Complie_Options $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS> )
-set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
+
 set(Aurora_FOUND TRUE)
 message(Aurora Found)
--- a/src/CudaMatrix.cpp
+++ b/src/CudaMatrix.cpp
@@ -1,9 +1,8 @@
 #include "AuroraDefs.h"
 #include "Function1D.cuh"
 #include <complex>
 #include <utility>
 #ifdef USE_CUDA
 #include "Function1D.cuh"
 #include "CudaMatrix.h"
 #include "Function.h"
--- a/src/CudaMatrixPrivate.cu
+++ b/src/CudaMatrixPrivate.cu
@@ -5,11 +5,129 @@
 #include <thrust/functional.h>
 #include <thrust/execution_policy.h>
 #include "AuroraDefs.h"
 #include "AuroraThrustIterator.cuh"
 using namespace thrust::placeholders;
 // Unary functor for thrust::transform: raises each element to a fixed power via powf.
 struct PowOp: public thrust::unary_function<float, float>{
    // Power applied to every element.
    float exponent;
    PowOp(float v):exponent(v) {}
    // Replaces the stored power.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified: the call does not modify the functor, so it also works
    // when the algorithm invokes it through a const copy or reference.
    __host__ __device__
    float operator()(const float& x) const {
        return powf(x, exponent);
    }
 };
 // Unary functor: yields 1 when the element is greater than the stored scalar, else 0.
 struct CompareGOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareGOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent < x ? 1.0f : 0.0f;
    }
 };
 // Unary functor: yields 1 when the element is greater than or equal to the stored scalar, else 0.
 struct CompareGEOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareGEOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent <= x ? 1.0f : 0.0f;
    }
 };
 // Unary functor: yields 1 when the element compares equal (exact float ==) to the stored scalar, else 0.
 struct CompareEOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareEOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent == x ? 1.0f : 0.0f;
    }
 };
 // Unary functor: yields 1 when the element differs (exact float !=) from the stored scalar, else 0.
 struct CompareNEOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareNEOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent != x ? 1.0f : 0.0f;
    }
 };
 // Unary functor: yields 1 when the element is less than the stored scalar, else 0.
 struct CompareLOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareLOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent > x ? 1.0f : 0.0f;
    }
 };
 // Unary functor: yields 1 when the element is less than or equal to the stored scalar, else 0.
 struct CompareLEOp: public thrust::unary_function<float, float>{
    // Scalar every element is compared against (the member keeps the name `exponent`).
    float exponent;
    CompareLEOp(float v):exponent(v) {}
    // Replaces the stored scalar.
    void setExponent(float v){
        exponent = v;
    }
    // const-qualified and returning float literals, so no double round-trip on the device.
    __host__ __device__
    float operator()(const float& x) const {
        return exponent >= x ? 1.0f : 0.0f;
    }
 };
 // Binary functor for element-wise array comparison: 1 when x > y, else 0.
 struct CompareAGOp{
    // const-qualified so the stateless functor is callable through const copies;
    // float literals match the declared return type.
    __host__ __device__
    float operator()(const float& x,const float& y) const {
        return x > y ? 1.0f : 0.0f;
    }
 };
 // Binary functor for element-wise array comparison: 1 when x >= y, else 0.
 struct CompareAGEOp{
    // const-qualified so the stateless functor is callable through const copies;
    // float literals match the declared return type.
    __host__ __device__
    float operator()(const float& x,const float& y) const {
        return x >= y ? 1.0f : 0.0f;
    }
 };
 // Binary functor for element-wise array comparison: 1 when x == y (exact float equality), else 0.
 struct CompareAEOp{
    // const-qualified so the stateless functor is callable through const copies;
    // float literals match the declared return type.
    __host__ __device__
    float operator()(const float& x,const float& y) const {
        return x == y ? 1.0f : 0.0f;
    }
 };
 // Binary functor for element-wise array comparison: 1 when x != y (exact float inequality), else 0.
 struct CompareANEOp{
    // const-qualified so the stateless functor is callable through const copies;
    // float literals match the declared return type.
    __host__ __device__
    float operator()(const float& x,const float& y) const {
        return x != y ? 1.0f : 0.0f;
    }
 };
 typedef thrust::complex<float> complexf;
@@ -533,51 +651,29 @@ void unaryPow(float* in1, float N,float* out, unsigned long length){
        thrust::transform(thrust::device,in1,in1+length,out,op);
        return;
    }
-    auto lambdaPow = [N] __host__ __device__(float x) {
+    thrust::transform(thrust::device,in1,in1+length,out,PowOp(N));
            return powf(x,N);
    };
    thrust::transform(thrust::device,in1,in1+length,out,lambdaPow);
 }
 void unaryCompare(float* in1, const float& in2, float* out, unsigned long length, int type){
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareGOp(in2));
                {
                    return in2 < x ? 1.0 : .0;
                });
            break;
        case GE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareGEOp(in2));
            {
                return in2 <= x ? 1.0 : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareEOp(in2));
            {
                return in2 == x ? 1.0 : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareNEOp(in2));
            {
                return in2 != x ? 1.0 : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareLEOp(in2));
            {
                return in2 >= x ? 1.0 : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in1,in1+length,out,[in2]__host__ __device__(const float &x)
+            thrust::transform(thrust::device,in1,in1+length,out,CompareLOp(in2));
            {
                return in2 > x ? 1.0 : .0;
            });
            break;
        default:
            break;
@@ -587,89 +683,51 @@ void unaryCompare(const float& in1, float* in2, float* out, unsigned long length
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareLOp(in1));
            {
                return in1 > x ? 1.0 : .0;
            });
            break;
        case GE:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareLEOp(in1));
            {
                return in1 >= x ? 1.0 : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareEOp(in1));
            {
                return in1 == x ? 1.0 : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareNEOp(in1));
            {
                return in1 != x ? 1.0 : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in2,in2+length,out, [in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareGEOp(in1));
            {
                return in1 <= x ? 1.0 : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x)
+            thrust::transform(thrust::device,in2,in2+length,out,CompareGOp(in1));
            {
                return in1 < x ? 1.0 : .0;
            });
            break;
        default:
            break;
    }
 }
 void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type){
    switch (type)
    {
        case G:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGOp());
            {
                return x > y ? 1. : .0;
            });
            break;
        case GE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGEOp());
            {
                return x >= y ? 1. : .0;
            });
            break;
        case E:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAEOp());
            {
                return x == y ? 1. : .0;
            });
            break;
        case NE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y)
+            thrust::transform(thrust::device,in1,in1+length,in2,out,CompareANEOp());
            {
                return x != y ? 1. : .0;
            });
            break;
        case LE:
-            thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__ (float x, float y)
+            thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGEOp());
            {
                return x <= y ? 1. : .0;
            });
            break;
        case L:
-            thrust::transform(thrust::device,in1,in1+length,in2,out, [] __host__ __device__ (float x, float y)
+            thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGOp());
            {
                return x < y ? 1. : .0;
            });
            break;
        default:
            break;
    }
 }
 void thrustFill(float* aBegin, float* aEnd, float aValue)
--- a/src/Function1D.cpp
+++ b/src/Function1D.cpp
@@ -35,7 +35,55 @@ namespace {
    uint CONVERT_ADD_VALUE = UINT32_MAX - 4095;
-    inline void convertValue(short* aValue ,float* des){
+    inline void convertValue(float aValue ,float* des){
      float value = aValue;
      ushort *exponentPtr = (ushort *)&value;
      exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[2] = (exponentPtr[2] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[3] = (exponentPtr[3] >> 11) & CONVERT_AND_VALUE;
      float signValue = aValue;
      short *signPtr = (short *)&signValue;
      uint sign_bit[4] = {
          (uint)(signPtr[0] < 0 ? 1 : 0), (uint)(signPtr[1] < 0 ? 1 : 0),
          (uint)(signPtr[2] < 0 ? 1 : 0), (uint)(signPtr[3] < 0 ? 1 : 0)};
      float fraction3Value = aValue;
      ushort *fraction3Ptr = (ushort *)&fraction3Value;
      fraction3Ptr[0] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[1] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[2] &= CONVERT_AND_VALUE_2;
      fraction3Ptr[3] &= CONVERT_AND_VALUE_2;
      uint hidden_bit[4] = {
          sign_bit[0] * (!exponentPtr[0] ? 1 : 0) * CONVERT_MUL_VALUE +
              ((!sign_bit[0] && exponentPtr[0]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[1] * (!exponentPtr[1] ? 1 : 0) * 2048 +
              ((!sign_bit[1] && exponentPtr[1]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[2] * (!exponentPtr[2] ? 1 : 0) * CONVERT_MUL_VALUE +
              ((!sign_bit[2] && exponentPtr[2]) ? 1 : 0) * CONVERT_MUL_VALUE,
          sign_bit[3] * (!exponentPtr[3] ? 1 : 0) * 2048 +
              ((!sign_bit[3] && exponentPtr[3]) ? 1 : 0) * CONVERT_MUL_VALUE,
      };
      int outputPtr[4] = {0};
      uint temp = fraction3Ptr[0] + hidden_bit[0] + sign_bit[0] * CONVERT_ADD_VALUE;
      outputPtr[0] = exponentPtr[0] > 1 ? (temp << (exponentPtr[0] - 1))
                                  : (temp >> std::abs(exponentPtr[0] - 1));
      temp = fraction3Ptr[1] + hidden_bit[1] + sign_bit[1] * CONVERT_ADD_VALUE;
      outputPtr[1] = exponentPtr[1] > 1 ? (temp << (exponentPtr[1] - 1))
                                  : (temp >> std::abs(exponentPtr[1] - 1));
      temp = fraction3Ptr[2] + hidden_bit[2] + sign_bit[2] * CONVERT_ADD_VALUE;
      outputPtr[2] = exponentPtr[2] > 1 ? (temp << (exponentPtr[2] - 1))
                                  : (temp >> std::abs(exponentPtr[2] - 1));
      temp = fraction3Ptr[3] + hidden_bit[3] + sign_bit[3] * CONVERT_ADD_VALUE;
      outputPtr[3] = exponentPtr[3] > 1 ? (temp << (exponentPtr[3] - 1))
                                  : (temp >> std::abs(exponentPtr[3] - 1));
    des[0] = outputPtr[0];
    des[1] = outputPtr[1];
    des[2] = outputPtr[2];
    des[3] = outputPtr[3];
    }
    inline void convertValue2(short* aValue ,float* des){
      ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]};
      exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
      exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
@@ -568,9 +616,7 @@ Matrix Aurora::acosd(const Matrix& aMatrix)
    {
        resultData[i] = resultData[i] * 180 / PI;
    }
-    Matrix result = Matrix::New(resultData, aMatrix);
+    return Matrix::New(resultData, aMatrix);
    nantoval(result, 0);
    return result;
 }
 Matrix Aurora::conj(const Matrix& aMatrix)
@@ -1050,14 +1096,14 @@ Matrix Aurora::convertfp16tofloat(short* aData, int aRows, int aColumns)
    #pragma omp parallel for
    for (size_t i = 0; i < quaterSize; i+=8) {
        //循环展开以避免过度的线程调用
-        if (i  < quaterSize)::convertValue((short*)(input+i*4), output + (i) * 4);
+        if (i  < quaterSize)::convertValue2((short*)(input+i*4), output + (i) * 4);
-        if (i+1  < quaterSize)::convertValue((short*)(input+(i+1)*4), output + (i+1) * 4);
+        if (i+1  < quaterSize)::convertValue2((short*)(input+(i+1)*4), output + (i+1) * 4);
-        if (i+2  < quaterSize)::convertValue((short*)(input+(i+2)*4), output + (i+2) * 4);
+        if (i+2  < quaterSize)::convertValue2((short*)(input+(i+2)*4), output + (i+2) * 4);
-        if (i+3  < quaterSize)::convertValue((short*)(input+(i+3)*4), output + (i+3) * 4);
+        if (i+3  < quaterSize)::convertValue2((short*)(input+(i+3)*4), output + (i+3) * 4);
-        if (i+4  < quaterSize)::convertValue((short*)(input+(i+4)*4), output + (i+4) * 4);
+        if (i+4  < quaterSize)::convertValue2((short*)(input+(i+4)*4), output + (i+4) * 4);
-        if (i+5  < quaterSize)::convertValue((short*)(input+(i+5)*4), output + (i+5) * 4);
+        if (i+5  < quaterSize)::convertValue2((short*)(input+(i+5)*4), output + (i+5) * 4);
-        if (i+6  < quaterSize)::convertValue((short*)(input+(i+6)*4), output + (i+6) * 4);
+        if (i+6  < quaterSize)::convertValue2((short*)(input+(i+6)*4), output + (i+6) * 4);
-        if (i+7  < quaterSize)::convertValue((short*)(input+(i+7)*4), output + (i+7) * 4);
+        if (i+7  < quaterSize)::convertValue2((short*)(input+(i+7)*4), output + (i+7) * 4);
    }
    return Matrix::New(output,aRows,aColumns,1);
 }
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
--- a/src/Function1D.cuh
+++ b/src/Function1D.cuh
@@ -63,8 +63,6 @@ namespace Aurora
    CudaMatrix conj(const CudaMatrix& aMatrix);
    CudaMatrix conj(CudaMatrix&& aMatrix);
    float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
    CudaMatrix transpose(const CudaMatrix& aMatrix);
@@ -126,4 +124,5 @@ namespace Aurora
    void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op);
    void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op);
 }
 #endif //AURORA_CUDA_FUNCTION1D_H
--- a/src/Function2D.cpp
+++ b/src/Function2D.cpp
@@ -1034,87 +1034,3 @@ Matrix Aurora::sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix>
    delete [] strides;
    return Matrix::New(output,returnVectorSize,1,1);
 }
 /**
  * Host-side peak search, performed independently on every column of aData.
  *
  * A column is scanned once from its second to its second-to-last sample. A sample larger than
  * both neighbours becomes the current peak candidate; the candidate is evaluated when the next
  * sample smaller than both neighbours (a valley) is reached. It is accepted when its height is
  * at least aMinPeakHeight and its height above the higher of the previous valley (initially the
  * first sample) and the current valley is at least MinPeakProminece.
  *
  * The first aNPeaks accepted peaks fill the slots in order. Once all slots are used, a further
  * accepted peak is swapped into every slot holding a smaller value, carrying the displaced
  * value on to the following slots. When all slots were filled they are finally ordered by
  * ascending sample index.
  *
  * @param aData             input matrix; dimension 0 is the sample count, dimension 1 the column count
  * @param aNPeaks           number of peak slots per column; values <= 0 make the call a no-op
  * @param aMinPeakHeight    minimum peak height
  * @param MinPeakProminece  minimum height above the surrounding valley
  * @param outIndex          caller-allocated buffer of getDimSize(1) * aNPeaks ints; receives the
  *                          sample index of each peak, unused slots hold getDimSize(0)
  */
 void Aurora::findPeaksHost(const Matrix & aData,  int aNPeaks, float aMinPeakHeight, float MinPeakProminece,
        int* outIndex)
 {
    int  signalSize = aData.getDimSize(0);
    int  signalCount = aData.getDimSize(1);
    // Nothing to write; this also keeps a negative count from wrapping to a huge
    // unsigned bound in the loops and the allocation below.
    if (aNPeaks <= 0 || signalCount <= 0)
    {
        return;
    }
    // Scratch peak heights, one slot per requested peak and column. Sized from aNPeaks so
    // any peak count is supported (a fixed-capacity stack array would overflow for large counts).
    float* peakValues = new float[static_cast<size_t>(signalCount) * aNPeaks];
    #pragma omp parallel for
    for (size_t threadIndex = 0; threadIndex < signalCount; threadIndex++)
    {
        float* dataPointer  = aData.getData() + threadIndex*signalSize ;
        // Per-column working views: indices are collected directly in the caller's buffer,
        // heights in this column's slice of the scratch buffer.
        int* indexs = outIndex + threadIndex*aNPeaks;
        float* values = peakValues + threadIndex*aNPeaks;
        float newPeak = dataPointer[0];
        float newValley = dataPointer[0];
        int peakIndex = 0;
        for (int i = 0; i < aNPeaks; i++)
        {
            // signalSize marks a slot for which no peak was found
            indexs[i] = signalSize;
            values[i] = 0;
        }
        int save_index=0;
        for (int i = 1; i < signalSize - 1; ++i)
        {
            // find peaks
            if (dataPointer[i] > dataPointer[i - 1] && dataPointer[i] > dataPointer[i + 1])
            {
                newPeak = dataPointer[i];
                peakIndex  = i;
            }
            // find valley
            if (dataPointer[i] < dataPointer[i - 1] && dataPointer[i] < dataPointer[i + 1])
            {
                // prominence is measured against the higher of the two valleys around the peak
                float higherValley = std::max(newValley, dataPointer[i]);
                newValley = dataPointer[i];
                if (newPeak >= aMinPeakHeight)
                {
                    float prominece = newPeak - higherValley;
                    if (prominece >= MinPeakProminece)
                    {
                        if (save_index < aNPeaks)
                        {
                            values[save_index] = newPeak;
                            indexs[save_index] = peakIndex;
                            save_index++;
                        }
                        else
                        {
                            // all slots used: push the candidate into every slot holding a
                            // smaller value, carrying the displaced entry forward
                            for (int j = 0; j < aNPeaks; j++)
                            {
                                if (values[j] < newPeak)
                                {
                                    std::swap(values[j], newPeak);
                                    std::swap(indexs[j], peakIndex);
                                }
                            }
                        }
                    }
                }
            }
        }
        if (save_index>=aNPeaks)
        {
            // replacements may have broken positional order: re-sort by ascending sample index
            for (int i = 0; i < aNPeaks; i++)
            {
                for (int j = i+1; j < aNPeaks; j++)
                {
                    if (indexs[i]>indexs[j])
                    {
                        std::swap(values[j], values[i]);
                        std::swap(indexs[j], indexs[i]);
                    }
                }
            }
        }
    }
    delete [] peakValues;
 }
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
--- a/src/Function2D.cuh
+++ b/src/Function2D.cuh
@@ -64,7 +64,6 @@ namespace Aurora
    CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(CudaMatrix && aMatrix);
    CudaMatrix hilbert(const CudaMatrix &aMatrix);
@@ -87,19 +86,6 @@ namespace Aurora
     */
    CudaMatrix ifft_symmetric(const CudaMatrix &aMatrix,long aLength);
    CudaMatrix valid(const CudaMatrix& aData, const CudaMatrix aValid);
    /**
     * findPeaks: column-wise peak search and matching
     * @attention do not allocate memory for aOutIndexs in advance
     * @param aData input
     * @param aNpeaks number of peaks
     * @param aMinPeakHeight minimum height
     * @param aMinPeakProminence minimum relative height (prominence)
     * @param aOutIndexs null pointer; device memory is allocated for it inside the function
     * @return heights of the selected peaks
    */
    CudaMatrix findPeaks(const CudaMatrix& aData, int aNpeaks, float aMinPeakHeight, float aMinPeakProminence, int** aOutIndexs);
 }
 #endif // __FUNCTION2D_CUDA_H__
--- a/src/Function2D.h
+++ b/src/Function2D.h
@@ -107,10 +107,10 @@ namespace Aurora
     * 当第一列包含重复的元素时，sortrows 会根据下一列中的值进行排序，并对后续的相等值重复此行为。
     * @attention 目前不支持三维，不支持复数
     * @param aMatrix 目标矩阵
-     * @param indexMatrix 排序后各行的原索引矩阵指针，必须要有
+     * @param indexMatrix 排序后各行的原索引矩阵指针，非必须
     * @return 排序后矩阵
     */
-    Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix);
+    Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix=nullptr);
    /**
     * 对矩阵求中间值 按列, 目前不支持三维，不支持复数
@@ -178,7 +178,7 @@ namespace Aurora
     * @return
     */
    Matrix sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix> aSliceIdxs);
-    void findPeaksHost(const Matrix & aData,  int aNPeaks, float aMinPeakHeight, float MinPeakProminece,int* outIndex);
+    
 };
 #endif // AURORA_FUNCTION2D_H
--- a/src/Function3D.cpp
+++ b/src/Function3D.cpp
@@ -5,10 +5,12 @@
 #include "Function.h"
 #ifdef USE_CUDA
 #include "CudaMatrix.h"
 #include "CudaMatrixPrivate.cuh"
 #include <cuda_runtime.h>
-#endif // USE_CUDA
+#include "CudaMatrixPrivate.cuh"
 #include "CudaMatrix.h"
 #endif
 //必须在Eigen之前
 #include "AuroraDefs.h"
@@ -72,6 +74,7 @@ Matrix Aurora::ones(int aRow, int aColumn, int aSlice) {
    return Matrix::New(data,rowSize,colSize,aSlice);
 }
 // Square all-ones matrix: forwards to the (row, column) overload with both extents equal.
 Matrix Aurora::ones(int aSquareRow)
 {
    const int edge = aSquareRow;
    return Aurora::ones(edge, edge);
 }
@@ -92,6 +95,7 @@ Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) {
    return Matrix::New(data,rowSize,colSize,sliceSize);
 }
 // Square all-zeros matrix: forwards to the (row, column) overload with both extents equal.
 Matrix Aurora::zeros(int aSquareRow)
 {
    const int edge = aSquareRow;
    return Aurora::zeros(edge, edge);
 }
@@ -214,7 +218,7 @@ Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix&
    return result;
 }
-#if USE_CUDA
+#ifdef USE_CUDA
 CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){
    if (aRow == 0 || aColumn == 0)
    {
@@ -251,7 +255,6 @@ CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) {
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
 }
 // Square all-zeros device matrix: forwards to the (row, column) overload with both extents equal.
 CudaMatrix Aurora::zerosCuda(int aSquareRow)
 {
    const int edge = aSquareRow;
    return Aurora::zerosCuda(edge, edge);
 }
--- a/src/Function3D.cu
+++ b/src/Function3D.cu
@@ -1,82 +0,0 @@
 #include "Function3D.cuh"
 using namespace Aurora;
 // Kernel with one thread per output element. Each thread reads its query point from
 // aNewX/aNewY/aNewZ; a point outside [aStart*, aEnd*] on any axis produces aOutValue,
 // otherwise the value is fetched from the 3D texture at the point's grid coordinate
 // (offset by 0.5 on every axis).
 __global__ void interp3Kernel(cudaTextureObject_t aTexObj, float* aOutputData,
                               float aStartX, float aDx, float aEndX,
                               float aStartY, float aDy, float aEndY,
                               float aStartZ, float aDz, float aEndZ,
                               float* aNewX, float* aNewY, float* aNewZ,
                               int aOutputRowSize, int aOutputColumnSize, int aOutputSliceSize, float aOutValue)
 {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;
    const int slice = blockIdx.z * blockDim.z + threadIdx.z;
    // Threads of the padded launch grid that fall outside the output volume do nothing.
    const bool insideOutput = row < aOutputRowSize && col < aOutputColumnSize && slice < aOutputSliceSize;
    if (!insideOutput)
    {
        return;
    }
    const size_t offset = slice * aOutputRowSize * aOutputColumnSize + col * aOutputRowSize + row;
    const float px = aNewX[offset];
    const float py = aNewY[offset];
    const float pz = aNewZ[offset];
    // Written as "outside" tests on purpose: a NaN coordinate fails every comparison
    // and therefore still takes the texture path.
    const bool outside = px > aEndX || px < aStartX || py > aEndY || py < aStartY || pz > aEndZ || pz < aStartZ;
    aOutputData[offset] = outside
        ? aOutValue
        : tex3D<float>(aTexObj, (px - aStartX) / aDx + 0.5, (py - aStartY) / aDy + 0.5, (pz - aStartZ) / aDz + 0.5);
 }
 /**
  * 3D interpolation on the GPU using a linearly filtered 3D texture.
  *
  * aValue is copied into a CUDA array, bound to a texture object (linear filter, clamp
  * addressing, element read mode) and sampled by interp3Kernel at every point given by
  * aNewX/aNewY/aNewZ. The result has the dimensions of aNewX.
  *
  * @param aStartX,aDx,aEndX  first coordinate, spacing and last coordinate of the source grid along X
  * @param aStartY,aDy,aEndY  same along Y
  * @param aStartZ,aDz,aEndZ  same along Z
  * @param aValue             source volume (device memory)
  * @param aNewX,aNewY,aNewZ  query coordinates (device memory), one entry per output element
  * @param aOutValue          value written for query points outside the [start, end] range on any axis
  * @return device matrix holding the interpolated values
  */
 CudaMatrix Aurora::interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
                           float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
                           const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue)
 {
    cudaTextureObject_t texObj;
    // Texture width comes from dimension 1 of aValue, height from dimension 0, depth from dimension 2.
    // NOTE(review): confirm this matches the memory layout of CudaMatrix.
    size_t dimX = aValue.getDimSize(1);
    size_t dimY = aValue.getDimSize(0);
    size_t dimZ = aValue.getDimSize(2);
    cudaExtent extent = make_cudaExtent(dimX, dimY, dimZ);
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cuArray;
    cudaMalloc3DArray(&cuArray, &channelDesc, extent);
    // Device-to-device copy of the source volume into the texture's backing array.
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr(aValue.getData(), dimX * sizeof(float), dimX, dimY);
    copyParams.dstArray = cuArray;
    copyParams.extent = extent;
    copyParams.kind = cudaMemcpyDeviceToDevice;
    cudaMemcpy3D(&copyParams);
    cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    resDesc.res.array.array = cuArray;
    cudaTextureDesc texDesc;
    memset(&texDesc, 0, sizeof(texDesc));
    texDesc.filterMode = cudaFilterModeLinear;
    texDesc.addressMode[0] = cudaAddressModeClamp;
    texDesc.addressMode[1] = cudaAddressModeClamp;
    texDesc.addressMode[2] = cudaAddressModeClamp;
    texDesc.readMode = cudaReadModeElementType;
    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
    int row = aNewX.getDimSize(0);
    int column = aNewX.getDimSize(1);
    int slice = aNewX.getDimSize(2);
    // 4x4x4 threads per block; one extra block per axis covers sizes that are not multiples of 4
    // (the kernel discards the surplus threads).
    dim3 blockDim(4,4,4);
    dim3 gridDim(row / 4 + 1, column / 4 + 1, slice / 4 + 1);
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * row * column * slice);
    CudaMatrix result = Aurora::CudaMatrix::fromRawData(data, row, column, slice);
    interp3Kernel<<<gridDim, blockDim>>>(texObj, data, aStartX, aDx, aEndX, aStartY, aDy, aEndY, aStartZ, aDz, aEndZ,
                                         aNewX.getData(), aNewY.getData(), aNewZ.getData(), row, column, slice, aOutValue);
    cudaDeviceSynchronize();
    // The texture and its backing array are only needed for this launch. Release them once
    // the kernel has finished so repeated calls do not leak device memory.
    cudaDestroyTextureObject(texObj);
    cudaFreeArray(cuArray);
    return result;
 }
--- a/src/Function3D.cuh
+++ b/src/Function3D.cuh
@@ -1,13 +0,0 @@
 // CUDA declarations of the 3D functions operating on CudaMatrix.
 #ifndef __FUNCTION3D_CUDA__
 #define __FUNCTION3D_CUDA__
 #include "CudaMatrix.h"
 #include "AuroraDefs.h"
 namespace Aurora
 {
    /**
     * 3D interpolation of aValue at the points (aNewX, aNewY, aNewZ).
     * @param aStartX,aDx,aEndX  first coordinate, spacing and last coordinate of the source grid along X
     * @param aStartY,aDy,aEndY  same along Y
     * @param aStartZ,aDz,aEndZ  same along Z
     * @param aValue             source volume
     * @param aNewX,aNewY,aNewZ  query coordinates
     * @param aOutValue          presumably the value used for points outside the source grid — confirm against the implementation
     * @return interpolated values
     */
    CudaMatrix interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
                       float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
                       const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue);
 }
 #endif // __FUNCTION3D_CUDA__
--- a/src/Function3D.h
+++ b/src/Function3D.h
@@ -4,10 +4,8 @@
 #include "Matrix.h"
 #include "Function1D.h"
 #if USE_CUDA
 #include "CudaMatrix.h"
-#endif
+
 namespace Aurora {
    /**
@@ -19,6 +17,8 @@ namespace Aurora {
     */
    Matrix ones(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
    /**
     * 创建全部为1的方阵
     * @param aSquareRow
@@ -26,6 +26,8 @@ namespace Aurora {
     */
    Matrix ones(int aSquareRow);
    CudaMatrix onesCuda(int aSquareRow);
    /**
     * 创建全部为0的数组，矩阵
     * @param aRow 行数，必须大于0
@@ -35,30 +37,25 @@ namespace Aurora {
     */
    Matrix zeros(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
    /**
     * 创建全部为0的方阵
     * @param aSquareRow
     * @return 全部为0的方阵
     */
    Matrix zeros(int aSquareRow);
    CudaMatrix zerosCuda(int aSquareRow);
    Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
    Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval);
    Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
    Matrix size(const Matrix &aMatrix);
    CudaMatrix size(const CudaMatrix &aMatrix);
    int size(const Matrix &aMatrix,int dims);
    #if USE_CUDA
    CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix onesCuda(int aSquareRow);
    CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
    CudaMatrix zerosCuda(int aSquareRow);
    CudaMatrix size(const CudaMatrix &aMatrix);
    int size(const CudaMatrix &aMatrix,int dims);
-    #endif
+
 };
--- a/src/Matrix.cpp
+++ b/src/Matrix.cpp
@@ -1,4 +1,6 @@
 #include "Matrix.h"
 #include <cmath>
 #include <complex>
 #include <cstddef>
--- a/src/Matrix.h
+++ b/src/Matrix.h
@@ -10,10 +10,7 @@
 namespace Aurora {
    const int $ = -1;
    #if USE_CUDA
    class CudaMatrix;
    #endif
    class Matrix {
    public:
@@ -289,9 +286,8 @@ namespace Aurora {
        void forceReshape(int rows, int columns, int slices);
        #if USE_CUDA
        CudaMatrix toDeviceMatrix() const;
-        #endif
+
    private:
        ValueType mValueType = Normal;
--- a/src/main.cxx
+++ b/src/main.cxx
@@ -7,19 +7,15 @@
 #include <complex>
 #include "Matrix.h"
 #include "CudaMatrix.h"
 #include "Function.h"
 #include "Function1D.h"
 #include "Function2D.h"
 #include "Function3D.h"
 #include "MatlabReader.h"
 #if USE_CUDA
 #include "CudaMatrix.h"
 #endif //USE_CUDA
 int main()
 {
    #if USE_CUDA
    auto A = Aurora::zeros(1000,1,1);
    auto B = Aurora::zeros(1000,1,1);
    for (size_t i = 0; i < 1000; i++)
@@ -119,6 +115,5 @@ int main()
            }
        }
    }
    #endif //USE_CUDA
    return 0;
 }
--- a/test/CudaMatrix_Test.cpp
+++ b/test/CudaMatrix_Test.cpp
@@ -2558,55 +2558,7 @@ TEST_F(CudaMatrix_Test, MatrixCompare){
    }
    {
        auto R= (9!=B);
-        auto dhR = (dB!=9).toHostMatrix();
+        auto dhR = (9!=dB).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
        {
        auto R= (9<B);
        auto dhR = (dB>9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9>B);
        auto dhR = (dB<9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9<=B);
        auto dhR = (dB>=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9>=B);
        auto dhR = (dB<=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9==B);
        auto dhR = (dB == 9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
        }
    }
    {
        auto R= (9!=B);
        auto dhR = (dB!=9).toHostMatrix();
        for (size_t i = 0; i < 1000; i++)
        {
            EXPECT_FLOAT_EQ(R[i],dhR[i]);
--- a/test/Function2D_Cuda_Test.cpp
+++ b/test/Function2D_Cuda_Test.cpp
@@ -5,7 +5,6 @@
 #include "Function.h"
 #include "Matrix.h"
 #include "TestUtility.h"
 #include "MatlabReader.h"
 #include "Function2D.h"
 #include "Function2D.cuh"
@@ -19,15 +18,11 @@ protected:
    static void TearDownTestCase(){
    }
    public:
    Aurora::Matrix mSignal;
    Aurora::CudaMatrix dmSignal;
    Aurora::Matrix B;
    Aurora::CudaMatrix dB;
    void SetUp(){
-        MatlabReader m("/home/krad/TestData/peaks.mat");
+
        mSignal = m.read("AScan_env_norm");
        dmSignal = mSignal.toDeviceMatrix();
    }
    void TearDown(){
    }
@@ -1002,17 +997,3 @@ TEST_F(Function2D_Cuda_Test, hilbert) {
        EXPECT_NEAR(ret1[i], ret2.getValue(i), 0.01);
    }
 }
 // Smoke test for the CUDA findPeaks: runs it on the fixture signal with 10 peaks per column
 // and prints the first 10 indices. It contains no assertions, so it only fails on a crash.
 // NOTE(review): a declaration of findPeaks visible in Function2D.cuh takes `int**` for the
 // index output, while a host-allocated `int*` is passed here — confirm which signature applies.
 TEST_F(Function2D_Cuda_Test, findPeaks) {
    // host buffer: 10 index slots for every column of the signal
    int* indexs = new int[mSignal.getDimSize(1)*10];
    auto ret1 = Aurora::findPeaks(dmSignal,10, 0.2, 0.05,indexs);
    // result is copied back to the host but not inspected
    auto reH = ret1.toHostMatrix();
    for(unsigned int i=0; i<10; ++i)
    {
        printf("%d,",indexs[i]);
    }
    delete [] indexs;
 }
--- a/test/Function2D_Test.cpp
+++ b/test/Function2D_Test.cpp
@@ -2,8 +2,6 @@
 #include <vector>
 #include "TestUtility.h"
 #include "MatlabReader.h"
 #include "Matrix.h"
 #include "Function.h"
 #include "Function1D.h"
@@ -18,11 +16,7 @@ protected:
    }
    static void TearDownTestCase(){
    }
    public:
    Aurora::Matrix mSignal;
    void SetUp(){
        MatlabReader m("/home/krad/TestData/peaks.mat");
        mSignal = m.read("AScan_env_norm");
    }
    void TearDown(){
    }
@@ -579,15 +573,3 @@ TEST_F(Function2D_Test, sub2ind) {
 }
 // Smoke test for findPeaksHost: searches 10 peaks per column of the fixture signal
 // (minimum height 0.2, minimum prominence 0.05) and prints the first 10 indices.
 // There are no assertions; the test only fails on a crash.
 TEST_F(Function2D_Test, findPeaks) {
    const int peakSlots = 10;
    int* peakIndices = new int[mSignal.getDimSize(1)*peakSlots];
    Aurora::findPeaksHost(mSignal, peakSlots, 0.2, 0.05, peakIndices);
    unsigned int printed = 0;
    while (printed < 10)
    {
        printf("%d,",peakIndices[printed]);
        ++printed;
    }
    delete [] peakIndices;
 }