From 42ecc6610645660839b5a4f0ef9b15bbaa4d0325 Mon Sep 17 00:00:00 2001 From: sunwen Date: Thu, 7 Dec 2023 15:03:37 +0800 Subject: [PATCH] Add cuda immse, sortrows and unitest. --- src/Function2D.cu | 107 +++++++++++++++++++++++++++++++++- src/Function2D.cuh | 12 ++++ test/Function2D_Cuda_Test.cpp | 32 ++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) diff --git a/src/Function2D.cu b/src/Function2D.cu index 1ce8904..c97c6f9 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -19,8 +19,16 @@ #include #include #include +#include "Function1D.cuh" +#include "Matrix.h" + using namespace Aurora; +namespace +{ + const int THREADS_PER_BLOCK = 256; +} + __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize) { @@ -878,4 +886,101 @@ CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction) } -} \ No newline at end of file +} + +__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + aOutputData[idx] = powf(aInputData1[idx] - aInputData2[idx], 2); + } +} + +float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) +{ + if (aImageA.getDims()!=2|| aImageB.getDims()!=2) + { + std::cerr<<"Fail! cuda immse args must all 2d matrix!"; + return 0.0; + } + + if (!aImageB.compareShape(aImageA)) + { + std::cerr<<"Fail! cuda immse args must be same shape!"; + return 0.0; + } + + if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal) + { + std::cerr << "Fail! cuda immse args must be normal value type!"; + return 0.0; + } + + unsigned int size = aImageA.getDataSize(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + immseKernel<<>>(aImageA.getData(), aImageB.getData(), data, size); + cudaDeviceSynchronize(); + float result = thrust::reduce(thrust::device, data, data+size, 0.0, thrust::plus()) / size; + cudaFree(data); + return result; +} + +struct compareMatrixByRows +{ + compareMatrixByRows(unsigned int aSize) + : mSize(aSize) + { + }; + unsigned int mSize; + __host__ __device__ + bool operator()(const float* aVector1, const float* aVector2) const + { + for(unsigned int i=0; i aVector2[i]) + { + return false; + } + } + return false; + } +}; + +CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix) +{ + CudaMatrix transposeMatrix = transpose(aMatrix); + size_t rows = transposeMatrix.getDimSize(0); + size_t columns = transposeMatrix.getDimSize(1); + thrust::device_vector vector(columns); + for(unsigned int i=0; i vectorBack = vector; + thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows)); + + float* data = nullptr; + float* indexResult = new float[columns]; + cudaMalloc((void**)&data, sizeof(float) * rows * columns); + for(unsigned int i=0; i