From 7db741502e939424df96ea41bd0769cc7ebfae33 Mon Sep 17 00:00:00 2001
From: sunwen
Date: Thu, 30 Nov 2023 17:55:02 +0800
Subject: [PATCH] Add vecnorm and unit test

---
 src/Function1D.cu             | 114 ++++++++++++++++++++++++++++++++++
 src/Function1D.cuh            |   2 +
 test/Function1D_Cuda_Test.cpp |  38 ++++++++++++
 3 files changed, 154 insertions(+)

diff --git a/src/Function1D.cu b/src/Function1D.cu
index 4e3271e..13a1af0 100644
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
@@ -1118,3 +1118,117 @@ CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix
     return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
 }
 
+// Computes the 1-norm (sum of magnitudes) of each column of a column-major matrix.
+// Launch: one block per column; blockDim.x must equal THREADS_PER_BLOCK and be a
+// power of two because of the shared-memory tree reduction below.
+__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK];
+    sharedValue[threadIdx.x] = 0;
+    if(aIsComplex)
+    {
+        // Interleaved (re, im) layout: |z| = sqrt(re*re + im*im).
+        for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
+        {
+            unsigned int indexByRows = i*blockDim.x + threadIdx.x;
+            if(indexByRows < aInputRowSize)
+            {
+                unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
+                sharedValue[threadIdx.x] += sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
+            }
+        }
+    }
+    else
+    {
+        for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
+        {
+            unsigned int indexByRows = i*blockDim.x + threadIdx.x;
+            if(indexByRows < aInputRowSize)
+            {
+                sharedValue[threadIdx.x] += fabsf(aInputData[blockIdx.x*aInputRowSize + indexByRows]);
+            }
+        }
+    }
+    __syncthreads();
+    // Tree reduction in shared memory; assumes blockDim.x is a power of two.
+    for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
+    {
+        if(threadIdx.x < i)
+        {
+            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
+        }
+        __syncthreads();
+    }
+    if(threadIdx.x == 0)
+    {
+        aOutput[blockIdx.x] = sharedValue[0];
+    }
+}
+
+// Computes the 2-norm (Euclidean norm) of each column of a column-major matrix.
+// Launch: one block per column; blockDim.x must equal THREADS_PER_BLOCK and be a
+// power of two because of the shared-memory tree reduction below.
+__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK];
+    sharedValue[threadIdx.x] = 0;
+
+    for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
+    {
+        unsigned int indexByRows = i*blockDim.x + threadIdx.x;
+        if(indexByRows < aInputRowSize)
+        {
+            if(aIsComplex)
+            {
+                // Interleaved (re, im) layout: |z|^2 = re*re + im*im.
+                unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
+                sharedValue[threadIdx.x] += aInputData[2 * idx] * aInputData[2 * idx];
+                sharedValue[threadIdx.x] += aInputData[2 * idx + 1] * aInputData[2 * idx + 1];
+            }
+            else
+            {
+                sharedValue[threadIdx.x] += aInputData[blockIdx.x*aInputRowSize + indexByRows] * aInputData[blockIdx.x*aInputRowSize + indexByRows];
+            }
+        }
+    }
+    __syncthreads();
+    // Tree reduction in shared memory; assumes blockDim.x is a power of two.
+    for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
+    {
+        if(threadIdx.x < i)
+        {
+            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
+        }
+        __syncthreads();
+    }
+    if(threadIdx.x == 0)
+    {
+        aOutput[blockIdx.x] = sqrtf(sharedValue[0]);
+    }
+}
+
+// Returns a matrix of `column` entries holding the aNormMethod norm of every
+// column of aMatrix. Only aDim == 1 (column-wise) with Norm1/Norm2 is
+// supported; any other request yields a null matrix.
+CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim)
+{
+    // Only supports aDim == 1 (per-column norms) for now.
+    if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull())
+    {
+        return CudaMatrix();
+    }
+    unsigned int column = aMatrix.getDimSize(1);
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * column);
+    // One block per column; each block reduces one column of length getDimSize(0).
+    if(aNormMethod == Aurora::Norm1)
+    {
+        vecnorm1Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
+    }
+    else if(aNormMethod == Aurora::Norm2)
+    {
+        vecnorm2Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
+    }
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, column);
+}
diff --git a/src/Function1D.cuh b/src/Function1D.cuh
index 62d7b16..1ee223c 100644
--- a/src/Function1D.cuh
+++ b/src/Function1D.cuh
@@ -69,6 +69,8 @@ namespace Aurora
 
     CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
 
+    CudaMatrix vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim);
+
 
 
     // ------compareSet----------------------------------------------------
diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp
index 6fd397a..10bb610 100644
--- a/test/Function1D_Cuda_Test.cpp
+++ b/test/Function1D_Cuda_Test.cpp
@@ -902,3 +902,41 @@ TEST_F(Function1D_Cuda_Test, vertcat) {
     EXPECT_FLOAT_EQ(result.getDimSize(0),6);
     EXPECT_FLOAT_EQ(result.getDimSize(1),1);
 }
+
+TEST_F(Function1D_Cuda_Test, vecnorm) {
+    //1Dim
+    float *data = new float[3]{1,2,-3};
+    auto matrix = Aurora::Matrix::fromRawData(data, 3).toDeviceMatrix();
+    auto result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm1,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],6);
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm2,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],3.74166);
+
+    //2Dims
+    data = new float[8]{1,2,-3,6,7,9,22.3,-8.6};
+    matrix = Aurora::Matrix::fromRawData(data, 4,2).toDeviceMatrix();
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm1,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],12);
+    EXPECT_FLOAT_AE(result.getData()[1],46.9);
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm2,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],7.0711);
+    EXPECT_FLOAT_AE(result.getData()[1],26.4811);
+
+    //1Dim Complex
+    data = new float[6]{1,2,-3,4,5,-6};
+    matrix = Aurora::Matrix::fromRawData(data, 3,1,1,Aurora::Complex).toDeviceMatrix();
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm1,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],15.0463);
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm2,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],9.5394);
+
+    //2Dims Complex
+    data = new float[12]{1,2,-3,4,5,-6,7,8,9,22,24,25};
+    matrix = Aurora::Matrix::fromRawData(data, 3,2,1,Aurora::Complex).toDeviceMatrix();
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm1,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],15.0463);
+    EXPECT_FLOAT_AE(result.getData()[1],69.0553);
+    result = Aurora::vecnorm(matrix,Aurora::NormMethod::Norm2,1).toHostMatrix();
+    EXPECT_FLOAT_AE(result.getData()[0],9.5394);
+    EXPECT_FLOAT_AE(result.getData()[1],43.3474);
+}