From 42ecc6610645660839b5a4f0ef9b15bbaa4d0325 Mon Sep 17 00:00:00 2001
From: sunwen <sunwen@realcan.com>
Date: Thu, 7 Dec 2023 15:03:37 +0800
Subject: [PATCH] Add cuda immse, sortrows and unit tests.

---
 src/Function2D.cu             | 107 +++++++++++++++++++++++++++++++++-
 src/Function2D.cuh            |  12 ++++
 test/Function2D_Cuda_Test.cpp |  32 ++++++++++
 3 files changed, 150 insertions(+), 1 deletion(-)
diff --git a/src/Function2D.cu b/src/Function2D.cu
index 1ce8904..c97c6f9 100644
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
@@ -19,8 +19,16 @@
 #include <thrust/functional.h>
 #include <thrust/complex.h>
 #include <cuda_runtime.h>
+#include "Function1D.cuh"
+#include "Matrix.h"
+
 using namespace Aurora;
 
+namespace
+{
+    const int THREADS_PER_BLOCK = 256; // block size (threads per block) used when launching 1D kernels such as immseKernel
+}
+
 
 __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
 {
@@ -878,4 +886,101 @@ CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
 
     }
     
-}
\ No newline at end of file
+}
+
+// Writes (aInputData1[i] - aInputData2[i])^2 to aOutputData[i] for i < aInputSize; one thread per element of a 1D launch.
+__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= aInputSize) return; // threads past the end of the data do nothing
+    const float diff = aInputData1[idx] - aInputData2[idx];
+    aOutputData[idx] = diff * diff; // plain multiply instead of the general-purpose powf()
+}
+
+// Mean squared error of two same-shaped, real-valued 2D device matrices; on rejected input or allocation failure it logs to std::cerr and returns 0.
+float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB)
+{
+    if (aImageA.getDims()!=2|| aImageB.getDims()!=2)
+    {
+        std::cerr<<"Fail! cuda immse args must all 2d matrix!";
+        return 0.0;
+    }
+    if (!aImageB.compareShape(aImageA))
+    {
+        std::cerr<<"Fail! cuda immse args must be same shape!";
+        return 0.0;
+    }
+    if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal)
+    {
+        std::cerr << "Fail! cuda immse args must be normal value type!";
+        return 0.0;
+    }
+    unsigned int size = aImageA.getDataSize();
+    float* data = nullptr; // scratch buffer holding one squared difference per element
+    if (cudaMalloc((void**)&data, sizeof(float) * size) != cudaSuccess)
+    {
+        std::cerr << "Fail! cuda immse can not allocate device memory!";
+        return 0.0;
+    }
+    immseKernel<<<(size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(aImageA.getData(), aImageB.getData(), data, size);
+    float result = thrust::reduce(thrust::device, data, data+size, 0.0f, thrust::plus<float>()) / size; // blocking, and ordered after the kernel on the same default stream
+    cudaFree(data);
+    return result;
+}
+
+// Lexicographic "less than" for two rows, each given as a pointer to mSize contiguous floats:
+// the first element on which the rows differ decides; rows without a difference are not "less".
+struct compareMatrixByRows
+{
+    // aSize: number of floats in each row handed to operator().
+    compareMatrixByRows(unsigned int aSize)
+        : mSize(aSize)
+    {
+    }
+    // Element count compared per row.
+    unsigned int mSize;
+    __host__ __device__
+    bool operator()(const float* aVector1, const float* aVector2) const
+    {
+        unsigned int pos = 0;
+        // Walk past the leading elements on which neither row is smaller nor larger.
+        while (pos < mSize && !(aVector1[pos] < aVector2[pos]) && !(aVector1[pos] > aVector2[pos]))
+        {
+            ++pos;
+        }
+        // Reaching the end means no element decided the order; otherwise pos is the deciding element.
+        return pos < mSize && aVector1[pos] < aVector2[pos];
+    }
+};
+
+// Sorts the rows of aMatrix in ascending lexicographic order (first column first, ties broken by the following columns)
+// and stores in indexMatrix, for every sorted row, the 0-based position that row had in aMatrix.
+// NOTE(review): dims and value type are not validated here; the header documents 3D/complex input as unsupported.
+CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix)
+{
+    // Work on the transpose so that every row of aMatrix becomes one contiguous run of `rows` floats.
+    CudaMatrix transposeMatrix = transpose(aMatrix);
+    size_t rows = transposeMatrix.getDimSize(0);
+    size_t columns = transposeMatrix.getDimSize(1);
+    float* base = transposeMatrix.getData();
+    thrust::device_vector<float*> vector(columns);
+    for(unsigned int i=0; i<columns; ++i)
+    {
+        vector[i] = base + i*rows;
+    }
+    // Sort pointers to the runs instead of moving the data itself.
+    thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows));
+
+    float* data = nullptr;
+    float* indexResult = new float[columns];
+    cudaMalloc((void**)&data, sizeof(float) * rows * columns);
+    for(unsigned int i=0; i<columns; ++i)
+    {
+        float* source = vector[i]; // one device read per sorted row
+        cudaMemcpy(data + i*rows, source, sizeof(float) * rows, cudaMemcpyDeviceToDevice);
+        // The pointer offset identifies the original row directly, so no search over the pointer table is needed.
+        indexResult[i] = rows ? (source - base) / rows : 0;
+    }
+    indexMatrix = Aurora::Matrix::fromRawData(indexResult, columns).toDeviceMatrix();
+    return transpose(CudaMatrix::fromRawData(data, rows, columns));
+}
diff --git a/src/Function2D.cuh b/src/Function2D.cuh
index 5de8a64..b855b3f 100644
--- a/src/Function2D.cuh
+++ b/src/Function2D.cuh
@@ -28,6 +28,18 @@ namespace Aurora
     CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column);
     CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column);
 
+    /** Mean squared error of two real-valued 2D matrices of the same shape; prints an error and returns 0 for other input. */
+    float immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB);
+
+    /**
+     * Sorts the matrix rows in ascending order by the first column, breaking ties with each following column in turn.
+     * @attention 3D and complex matrices are currently not supported
+     * @param aMatrix matrix to sort
+     * @param indexMatrix receives the original index of each sorted row (passed by reference, so always required)
+     * @return the sorted matrix
+     */
+    CudaMatrix sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix);
+
 }
 
 #endif // __FUNCTION2D_CUDA_H__
\ No newline at end of file
diff --git a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp
index e687b76..2801730 100644
--- a/test/Function2D_Cuda_Test.cpp
+++ b/test/Function2D_Cuda_Test.cpp
@@ -613,4 +613,36 @@ TEST_F(Function2D_Cuda_Test, sort)
         }
         
     }
+}
+
+TEST_F(Function2D_Cuda_Test, immse) {
+    auto matrixHost1 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
+    auto matrixHost2 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
+    auto matrixDevice1 = matrixHost1.toDeviceMatrix();
+    auto matrixDevice2 = matrixHost2.toDeviceMatrix(); // must mirror matrixHost2, otherwise the device call compares a matrix with itself
+    auto result1 = Aurora::immse(matrixHost1, matrixHost2);
+    auto result2 = Aurora::immse(matrixDevice1, matrixDevice2);
+    EXPECT_FLOAT_AE(result1, result2);
+}
+
+TEST_F(Function2D_Cuda_Test, sortRows) {
+    auto matrixHost1 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
+    Aurora::Matrix matrixHost2;
+    auto matrixDevice1 = matrixHost1.toDeviceMatrix();
+    Aurora::CudaMatrix matrixDevice2;
+    auto result1 = Aurora::sortrows(matrixHost1, &matrixHost2);
+    auto result2 = Aurora::sortrows(matrixDevice1, matrixDevice2).toHostMatrix();
+    auto result3 = matrixHost2;
+    auto result4 = matrixDevice2.toHostMatrix();
+    ASSERT_EQ(result1.getDataSize(), result2.getDataSize()); // element counts are integers: compare them exactly
+    for (size_t i = 0; i < result1.getDataSize(); i++)
+    {
+        ASSERT_FLOAT_EQ(result1[i], result2[i]);
+    }
+    // The row-index outputs of the host and device versions must agree as well.
+    ASSERT_EQ(result3.getDataSize(), result4.getDataSize());
+    for (size_t i = 0; i < result3.getDataSize(); i++)
+    {
+        ASSERT_FLOAT_EQ(result3[i], result4[i]);
+    }
+}
\ No newline at end of file