Add prod and unittest.

2023-12-11 15:01:24 +08:00
parent 8b58d05d90
commit bd4a27a17b
3 changed files with 155 additions and 1 deletions
--- a/src/Function2D.cu
+++ b/src/Function2D.cu
@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
    return CudaMatrix::fromRawData(data, 1, column);
 }

+/**
+ * @brief Multiplies together the aInputRowSize floats owned by each block.
+ *
+ * Block b reads aInputData[b*aInputRowSize .. (b+1)*aInputRowSize) and writes the
+ * product to aOutputData[b]. The tree reduction below needs blockDim.x to be a
+ * power of two no larger than THREADS_PER_BLOCK.
+ */
+__global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK];
+    // 64-bit offset: blockIdx.x*aInputRowSize wraps in 32-bit arithmetic on large matrices.
+    const float* rowData = aInputData + (size_t)blockIdx.x * aInputRowSize;
+
+    float partial = 1.0f;   // identity, so threads with no element to read stay neutral
+    for (size_t i = threadIdx.x; i < aInputRowSize; i += blockDim.x)
+    {
+        partial *= rowData[i];
+    }
+    sharedValue[threadIdx.x] = partial;
+    __syncthreads();
+    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
+    {
+        if (threadIdx.x < stride) sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + stride];
+        __syncthreads();    // kept outside the branch so every thread reaches the barrier
+    }
+
+    if (threadIdx.x == 0) aOutputData[blockIdx.x] = sharedValue[0];
+}
+
+/**
+ * @brief Complex counterpart of prodKernel; values are interleaved (real, imag) float pairs.
+ * Block b multiplies the aInputRowSize complex values starting at complex index b*aInputRowSize
+ * and stores the result in aOutputData[2*b], aOutputData[2*b + 1]. The tree reduction needs
+ * blockDim.x to be a power of two no larger than THREADS_PER_BLOCK.
+ */
+__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
+{
+    __shared__ float sharedValue[THREADS_PER_BLOCK * 2];
+    const unsigned int complexIdx = threadIdx.x * 2;
+    // 64-bit offset: 2*blockIdx.x*aInputRowSize wraps in 32-bit arithmetic on large matrices.
+    const float* rowData = aInputData + 2 * ((size_t)blockIdx.x * aInputRowSize);
+    float real = 1.0f, imag = 0.0f;   // running product starts at 1 + 0i
+    for (size_t i = threadIdx.x; i < aInputRowSize; i += blockDim.x)
+    {
+        const float otherReal = rowData[2 * i], otherImag = rowData[2 * i + 1];
+        const float newReal = real * otherReal - imag * otherImag;
+        imag = real * otherImag + imag * otherReal;
+        real = newReal;
+    }
+    sharedValue[complexIdx] = real;
+    sharedValue[complexIdx + 1] = imag;
+    __syncthreads();
+    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
+    {
+        if (threadIdx.x < stride)
+        {
+            const unsigned int other = complexIdx + 2 * stride;   // partner's real slot
+            real = sharedValue[complexIdx] * sharedValue[other] - sharedValue[complexIdx + 1] * sharedValue[other + 1];
+            imag = sharedValue[complexIdx] * sharedValue[other + 1] + sharedValue[complexIdx + 1] * sharedValue[other];
+            sharedValue[complexIdx] = real;
+            sharedValue[complexIdx + 1] = imag;
+        }
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+    {
+        aOutputData[2 * (size_t)blockIdx.x] = sharedValue[0];
+        aOutputData[2 * (size_t)blockIdx.x + 1] = sharedValue[1];
+    }
+}
+
+/**
+ * @brief Product of the elements of each column; a row or column vector is reduced to one value.
+ * @param aMatrix 2D real or complex matrix; input whose third dimension exceeds 1 is rejected.
+ * @return fromRawData(data, 1, column, 1, valueType) on success; an empty CudaMatrix for 3D input, zero columns, or a CUDA failure.
+ */
+CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
+{
+    if (aMatrix.getDimSize(2) > 1 )
+    {
+        std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
+        return CudaMatrix();
+    }
+    const bool isVector = aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1;
+    const unsigned int row = isVector ? aMatrix.getDataSize() : aMatrix.getDimSize(0);
+    const unsigned int column = isVector ? 1 : aMatrix.getDimSize(1);
+    if (column == 0) return CudaMatrix();   // a zero-block launch is an invalid configuration
+    float* data = nullptr;
+    cudaError_t status = cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
+    if (status == cudaSuccess)
+    {
+        auto kernel = aMatrix.isComplex() ? prodComplexKernel : prodKernel;   // one block per output value
+        kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
+        status = cudaGetLastError();                                   // launch errors
+        if (status == cudaSuccess) status = cudaDeviceSynchronize();   // execution errors
+    }
+    if (status == cudaSuccess) return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
+    std::cerr << "cuda prod() failed: " << cudaGetErrorString(status) << std::endl;
+    cudaFree(data);   // no-op when the allocation itself failed and data is still nullptr
+    return CudaMatrix();
+}
+
 /**
 * @brief 
 *