Add prod and unittest.

2023-12-11 15:01:24 +08:00
parent 8b58d05d90
commit bd4a27a17b
3 changed files with 155 additions and 1 deletions
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
    return CudaMatrix::fromRawData(data, 1, column);
 }

+ __global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK];
+    sharedValue[threadIdx.x] = 1;
+
+    for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
+    {
+        unsigned int indexByRows =  i*blockDim.x + threadIdx.x;
+        if(indexByRows < aInputRowSize)
+        {
+            sharedValue[threadIdx.x] *= aInputData[blockIdx.x*aInputRowSize + indexByRows];
+        }
+    } 
+    __syncthreads();
+    for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
+    {
+        if(threadIdx.x < i)
+        {
+            sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i];           
+        }
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        aOutputData[blockIdx.x] = sharedValue[0];
+    } 
+}
+
+__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK * 2];
+    unsigned int complexIdx = threadIdx.x * 2;
+    
+    sharedValue[complexIdx] = 1;
+    sharedValue[complexIdx + 1] = 0;
+
+    for(unsigned int i=0; i<=(aInputRowSize/blockDim.x); ++i)
+    {
+        unsigned int indexByRows =  i*blockDim.x + threadIdx.x;
+        if(indexByRows < aInputRowSize)
+        {
+            unsigned int index = 2 * (blockIdx.x*aInputRowSize + indexByRows);
+            float real = sharedValue[complexIdx] * aInputData[index] - sharedValue[complexIdx + 1] * aInputData[index + 1];
+            float imag = sharedValue[complexIdx] * aInputData[index + 1] + sharedValue[complexIdx + 1] * aInputData[index];
+            sharedValue[complexIdx] =  real;
+            sharedValue[complexIdx + 1] = imag;
+        }
+    } 
+    __syncthreads();
+    for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
+    {
+        if(threadIdx.x < i)
+        {
+            unsigned int index = 2 * (threadIdx.x + i);
+            float real = sharedValue[complexIdx] * sharedValue[index] - sharedValue[complexIdx + 1] * sharedValue[index + 1];
+            float imag = sharedValue[complexIdx] * sharedValue[index + 1] + sharedValue[complexIdx + 1] * sharedValue[index];
+            sharedValue[complexIdx] = real;
+            sharedValue[complexIdx + 1] =  imag;
+        }
+        __syncthreads();
+    }
+
+    if(threadIdx.x == 0)
+    {
+        aOutputData[2 * blockIdx.x] = sharedValue[0]; 
+        aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
+    }
+ 
+}
+
+CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
+{
+    if (aMatrix.getDimSize(2) > 1 )
+    {
+        std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
+        return CudaMatrix();
+    }
+
+    unsigned int row = aMatrix.getDimSize(0);
+    unsigned int column = aMatrix.getDimSize(1);
+
+    if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
+    {
+        column = 1;
+        row = aMatrix.getDataSize();
+    }
+
+    float* data = nullptr;
+    cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
+    if(aMatrix.isComplex())
+    {
+        prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
+    }
+    else
+    {
+        prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
+    }
+    cudaDeviceSynchronize();
+    return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
+}
+
 /**
 * @brief 
 * 
--- a/src/Function2D.cuh
+++ b/src/Function2D.cuh
@@ -47,6 +47,13 @@ namespace Aurora

    CudaMatrix dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction = Column);

+    /**
+     * prod，支持到2维
+     * @param aMatrix
+     * @return
+     */
+    CudaMatrix prod(const CudaMatrix &aMatrix);
+
    CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
    CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);

--- a/test/Function2D_Cuda_Test.cpp
+++ b/test/Function2D_Cuda_Test.cpp
@@ -807,10 +807,55 @@ TEST_F(Function2D_Cuda_Test, dot) {
    Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix();
    auto result1 = Aurora::dot(matrixHost1, matrixHost2);
    auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix();
-    std::cout<< result1.getDataSize();
    ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
    for (size_t i = 0; i < result1.getDataSize(); i++)
    {
        EXPECT_FLOAT_AE(result1[i], result2[i]);
    }
+}
+
+TEST_F(Function2D_Cuda_Test, prod) {
+    auto matrixHost = Aurora::Matrix::fromRawData(new float[20], 4,5);
+    for(unsigned int i=0; i<20;++i)
+    {
+        matrixHost[i] = i + 1;
+    }
+    auto matrixDevice = matrixHost.toDeviceMatrix();
+    auto result1 = Aurora::prod(matrixHost);
+    auto result2 = Aurora::prod(matrixDevice).toHostMatrix();
+    ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
+    for (size_t i = 0; i < result1.getDataSize(); i++)
+    {
+        EXPECT_FLOAT_AE(result1[i], result2[i]);
+    }
+
+    matrixHost = Aurora::Matrix::fromRawData(new float[20], 20,1);
+    for(unsigned int i=0; i<20;++i)
+    {
+        matrixHost[i] = i + 1;
+    }
+    matrixDevice = matrixHost.toDeviceMatrix();
+    result1 = Aurora::prod(matrixHost);
+    result2 = Aurora::prod(matrixDevice).toHostMatrix();
+    ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
+    for (size_t i = 0; i < result1.getDataSize(); i++)
+    {
+        EXPECT_FLOAT_AE(result1[i], result2[i]);
+    }
+
+    auto matrixHostComplex = Aurora::Matrix::fromRawData(new float[40], 4,5, 1,Aurora::Complex);
+    for(unsigned int i=0; i<40;++i)
+    {
+        matrixHost[i] = i + 1;
+    }
+
+    matrixDevice = matrixHostComplex.toDeviceMatrix();
+    result1 = Aurora::prod(matrixHostComplex);
+    result2 = Aurora::prod(matrixDevice).toHostMatrix();
+    ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
+    ASSERT_FLOAT_EQ(result1.getValueType(), result2.getValueType());
+    for (size_t i = 0; i < result1.getDataSize() * result1.getValueType(); i++)
+    {
+        EXPECT_FLOAT_AE(result1[i], result2[i]);
+    }
 }