Add CUDA immse, sortrows and unit tests.

This commit is contained in:
sunwen
2023-12-07 15:03:37 +08:00
parent cdf4777dbe
commit 42ecc66106
3 changed files with 150 additions and 1 deletions

View File

@@ -19,8 +19,16 @@
#include <thrust/functional.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#include "Function1D.cuh"
#include "Matrix.h"
using namespace Aurora;
namespace
{
    // Block size (threads per block) used when launching the 1-D kernels in this file.
    constexpr int THREADS_PER_BLOCK = 256;
}
__global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
{
@@ -878,4 +886,101 @@ CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
}
}
}
/**
 * Writes the squared element-wise difference of two float buffers:
 * aOutputData[i] = (aInputData1[i] - aInputData2[i])^2.
 * Each thread handles the single element at its flat 1-D global index;
 * threads whose index is not below aInputSize do nothing.
 * @param aInputData1 first input buffer
 * @param aInputData2 second input buffer
 * @param aOutputData output buffer receiving the squared differences
 * @param aInputSize number of elements to process
 */
__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize)
{
    const unsigned int element = threadIdx.x + blockDim.x * blockIdx.x;
    // Tail guard: the grid is rounded up, so trailing threads may have no element.
    if (element >= aInputSize)
    {
        return;
    }
    const float difference = aInputData1[element] - aInputData2[element];
    aOutputData[element] = powf(difference, 2);
}
/**
 * Computes the mean squared error between two device matrices:
 * the sum of squared element-wise differences divided by the element count.
 *
 * Both arguments must be 2-D, have the same shape and hold Normal (non-complex)
 * values. On any invalid argument, empty input, or CUDA failure a message is
 * written to std::cerr and 0.0 is returned.
 *
 * @param aImageA first matrix
 * @param aImageB second matrix, same shape as aImageA
 * @return mean squared error, or 0.0 on failure
 */
float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB)
{
    if (aImageA.getDims()!=2|| aImageB.getDims()!=2)
    {
        std::cerr<<"Fail! cuda immse args must all 2d matrix!";
        return 0.0;
    }
    if (!aImageB.compareShape(aImageA))
    {
        std::cerr<<"Fail! cuda immse args must be same shape!";
        return 0.0;
    }
    if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda immse args must be normal value type!";
        return 0.0;
    }

    const unsigned int size = aImageA.getDataSize();
    if (size == 0)
    {
        // A zero-block launch is an invalid configuration and 0/0 would yield NaN.
        std::cerr << "Fail! cuda immse args must not be empty!" << std::endl;
        return 0.0;
    }

    // Scratch buffer holding one squared difference per element.
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if (status != cudaSuccess)
    {
        std::cerr << "Fail! cuda immse cudaMalloc error: " << cudaGetErrorString(status) << std::endl;
        return 0.0;
    }

    const int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    immseKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aImageA.getData(), aImageB.getData(), data, size);

    // Launch-configuration errors surface via cudaGetLastError, execution errors at the sync.
    status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        std::cerr << "Fail! cuda immse kernel error: " << cudaGetErrorString(status) << std::endl;
        cudaFree(data);
        return 0.0;
    }

    // Float init value matches the float functor (the original passed a double literal).
    const float sum = thrust::reduce(thrust::device, data, data + size, 0.0f, thrust::plus<float>());
    cudaFree(data);
    return sum / size;
}
/**
 * Strict "less than" comparator over two float sequences of equal length.
 * The sequences are compared lexicographically: the first position at which
 * the values differ decides the result; sequences with no ordered difference
 * compare as not-less.
 */
struct compareMatrixByRows
{
    compareMatrixByRows(unsigned int aSize)
        : mSize(aSize)
    {
    }

    // Number of floats compared in each sequence.
    unsigned int mSize;

    __host__ __device__
    bool operator()(const float* aVector1, const float* aVector2) const
    {
        unsigned int position = 0;
        while (position < mSize)
        {
            const float left = aVector1[position];
            const float right = aVector2[position];
            if (left < right)
            {
                return true;
            }
            if (left > right)
            {
                return false;
            }
            // Neither is smaller at this position: move on to the next one.
            ++position;
        }
        return false;
    }
};
/**
 * Sorts the rows of a real-valued device matrix in ascending lexicographic
 * order: rows are ordered by their first element, ties are broken by the
 * next element, and so on.
 *
 * The input is transposed so that every original row is one contiguous run of
 * floats; a table of pointers to those runs is sorted with compareMatrixByRows,
 * the runs are gathered in sorted order, and the result is transposed back.
 *
 * 3-D matrices, non-Normal (complex) value types and empty matrices are
 * rejected: a message is written to std::cerr, an empty CudaMatrix is returned
 * and indexMatrix is left untouched.
 *
 * @param aMatrix matrix whose rows are sorted
 * @param indexMatrix receives, for every sorted row, the zero-based index that
 *        row had in aMatrix (stored as float)
 * @return the row-sorted matrix
 */
CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix)
{
    if (aMatrix.getDims() > 2)
    {
        std::cerr << "Fail! cuda sortrows does not support 3d matrix!" << std::endl;
        return CudaMatrix();
    }
    if (aMatrix.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda sortrows arg must be normal value type!" << std::endl;
        return CudaMatrix();
    }
    if (aMatrix.getDataSize() == 0)
    {
        std::cerr << "Fail! cuda sortrows arg must not be empty!" << std::endl;
        return CudaMatrix();
    }

    CudaMatrix transposeMatrix = transpose(aMatrix);
    const size_t rows = transposeMatrix.getDimSize(0);
    const size_t columns = transposeMatrix.getDimSize(1);
    float* base = transposeMatrix.getData();

    // Build the pointer table on the host and upload it in one transfer
    // instead of one host->device copy per element.
    float** hostPointers = new float*[columns];
    for (size_t i = 0; i < columns; ++i)
    {
        hostPointers[i] = base + i * rows;
    }
    thrust::device_vector<float*> vector(hostPointers, hostPointers + columns);
    thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows));

    // Bring the sorted pointer table back in a single transfer.
    cudaError_t status = cudaMemcpy(hostPointers, vector.data().get(),
                                    sizeof(float*) * columns, cudaMemcpyDeviceToHost);
    float* data = nullptr;
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&data, sizeof(float) * rows * columns);
    }
    if (status != cudaSuccess)
    {
        std::cerr << "Fail! cuda sortrows memory error: " << cudaGetErrorString(status) << std::endl;
        delete[] hostPointers;
        return CudaMatrix();
    }

    float* indexResult = new float[columns];
    for (size_t i = 0; i < columns; ++i)
    {
        const cudaError_t copyStatus = cudaMemcpy(data + i * rows, hostPointers[i],
                                                  sizeof(float) * rows, cudaMemcpyDeviceToDevice);
        if (status == cudaSuccess)
        {
            status = copyStatus;
        }
        // Each pointer is base + k*rows, so the original row index k follows
        // directly from the offset; no per-row search is required.
        indexResult[i] = static_cast<float>(static_cast<size_t>(hostPointers[i] - base) / rows);
    }
    delete[] hostPointers;

    if (status != cudaSuccess)
    {
        std::cerr << "Fail! cuda sortrows copy error: " << cudaGetErrorString(status) << std::endl;
        delete[] indexResult;
        cudaFree(data);
        return CudaMatrix();
    }

    indexMatrix = Aurora::Matrix::fromRawData(indexResult, columns).toDeviceMatrix();
    return transpose(CudaMatrix::fromRawData(data, rows, columns));
}

View File

@@ -28,6 +28,18 @@ namespace Aurora
// Sort overloads taking the input by const reference or by rvalue reference;
// `direction` defaults to Column.
// NOTE(review): the implementations are outside this view - confirm the sort
// order and whether the rvalue overload reuses its argument's storage.
CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column);
CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column);
/**
 * Mean squared error between two device matrices: the mean of the squared
 * element-wise differences.
 * Both arguments must be 2-D, of the same shape and of Normal value type;
 * otherwise a message is written to std::cerr and 0.0 is returned.
 * @param aImageA first matrix
 * @param aImageB second matrix
 * @return mean squared error
 */
float immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB);
/**
 * Sorts the rows of a matrix in ascending order based on the elements of the first column.
 * When the first column contains repeated elements, sortrows sorts by the values in the next
 * column, and repeats this behavior for subsequent equal values.
 * @attention 3-D matrices and complex values are currently not supported.
 * @param aMatrix matrix to sort
 * @param indexMatrix receives the original index of each row after sorting.
 *        NOTE(review): the original comment described this as an optional pointer,
 *        but this overload takes a reference, so a matrix must always be supplied.
 * @return the sorted matrix
 */
CudaMatrix sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix);
}
#endif // __FUNCTION2D_CUDA_H__

View File

@@ -613,4 +613,36 @@ TEST_F(Function2D_Cuda_Test, sort)
}
}
}
// Checks that the CUDA immse agrees with the host immse on the same pair of
// random 50x200 matrices.
TEST_F(Function2D_Cuda_Test, immse) {
    auto matrixHost1 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
    auto matrixHost2 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
    auto matrixDevice1 = matrixHost1.toDeviceMatrix();
    // The second device operand must mirror matrixHost2; building it from
    // matrixHost1 would make the device side compare a matrix with itself.
    auto matrixDevice2 = matrixHost2.toDeviceMatrix();
    auto result1 = Aurora::immse(matrixHost1, matrixHost2);
    auto result2 = Aurora::immse(matrixDevice1, matrixDevice2);
    EXPECT_FLOAT_AE(result1, result2);
}
// Checks that the CUDA sortrows produces the same sorted matrix and the same
// index matrix as the host sortrows for a random 50x200 input.
TEST_F(Function2D_Cuda_Test, sortRows) {
    auto matrixHost1 = Aurora::Matrix::fromRawData(Aurora::random(10000), 50,200);
    Aurora::Matrix matrixHost2;
    auto matrixDevice1 = matrixHost1.toDeviceMatrix();
    Aurora::CudaMatrix matrixDevice2;
    auto result1 = Aurora::sortrows(matrixHost1, &matrixHost2);
    auto result2 = Aurora::sortrows(matrixDevice1, matrixDevice2).toHostMatrix();
    auto result3 = matrixHost2;
    auto result4 = matrixDevice2.toHostMatrix();

    // Element counts are integers: compare them exactly rather than as floats.
    ASSERT_EQ(result1.getDataSize(), result2.getDataSize());
    for (size_t i = 0; i < result1.getDataSize(); i++)
    {
        ASSERT_FLOAT_EQ(result1[i], result2[i]);
    }

    ASSERT_EQ(result3.getDataSize(), result4.getDataSize());
    for (size_t i = 0; i < result3.getDataSize(); i++)
    {
        ASSERT_FLOAT_EQ(result3[i], result4[i]);
    }
}