diff --git a/src/Function2D.cu b/src/Function2D.cu index 5c15748..dccba5a 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func return CudaMatrix::fromRawData(data, 1, column); } + __global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize) +{ + __shared__ float sharedValue[THREADS_PER_BLOCK]; + sharedValue[threadIdx.x] = 1; + + for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + { + unsigned int indexByRows = i*blockDim.x + threadIdx.x; + if(indexByRows < aInputRowSize) + { + sharedValue[threadIdx.x] *= aInputData[blockIdx.x*aInputRowSize + indexByRows]; + } + } + __syncthreads(); + for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + { + if(threadIdx.x < i) + { + sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i]; + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + aOutputData[blockIdx.x] = sharedValue[0]; + } +} + +__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize) +{ + __shared__ float sharedValue[THREADS_PER_BLOCK * 2]; + unsigned int complexIdx = threadIdx.x * 2; + + sharedValue[complexIdx] = 1; + sharedValue[complexIdx + 1] = 0; + + for(unsigned int i=0; i<=(aInputRowSize/blockDim.x); ++i) + { + unsigned int indexByRows = i*blockDim.x + threadIdx.x; + if(indexByRows < aInputRowSize) + { + unsigned int index = 2 * (blockIdx.x*aInputRowSize + indexByRows); + float real = sharedValue[complexIdx] * aInputData[index] - sharedValue[complexIdx + 1] * aInputData[index + 1]; + float imag = sharedValue[complexIdx] * aInputData[index + 1] + sharedValue[complexIdx + 1] * aInputData[index]; + sharedValue[complexIdx] = real; + sharedValue[complexIdx + 1] = imag; + } + } + __syncthreads(); + for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + { + if(threadIdx.x < i) + { + unsigned int index = 2 * (threadIdx.x + i); + float real = sharedValue[complexIdx] * sharedValue[index] - sharedValue[complexIdx + 1] * sharedValue[index + 1]; + float imag = sharedValue[complexIdx] * sharedValue[index + 1] + sharedValue[complexIdx + 1] * sharedValue[index]; + sharedValue[complexIdx] = real; + sharedValue[complexIdx + 1] = imag; + } + __syncthreads(); + } + + if(threadIdx.x == 0) + { + aOutputData[2 * blockIdx.x] = sharedValue[0]; + aOutputData[2 * blockIdx.x + 1] = sharedValue[1]; + } + +} + +CudaMatrix Aurora::prod(const CudaMatrix &aMatrix) +{ + if (aMatrix.getDimSize(2) > 1 ) + { + std::cerr<< "cuda prod() not support 3D data!"<< std::endl; + return CudaMatrix(); + } + + unsigned int row = aMatrix.getDimSize(0); + unsigned int column = aMatrix.getDimSize(1); + + if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) + { + column = 1; + row = aMatrix.getDataSize(); + } + + float* data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType()); + if(aMatrix.isComplex()) + { + prodComplexKernel<<>>(aMatrix.getData(), data, row); + } + else + { + prodKernel<<>>(aMatrix.getData(), data, row); + } + cudaDeviceSynchronize(); + return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType()); +} + /** * @brief * diff --git a/src/Function2D.cuh b/src/Function2D.cuh index 8d9aba3..d9f39a6 100644 --- a/src/Function2D.cuh +++ b/src/Function2D.cuh @@ -47,6 +47,13 @@ namespace Aurora CudaMatrix dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction = Column); + /** + * prod,支持到2维 + * @param aMatrix + * @return + */ + CudaMatrix prod(const CudaMatrix &aMatrix); + CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1); CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1); diff --git a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp index 97da6fb..5cc3bd6 100644 --- a/test/Function2D_Cuda_Test.cpp +++ b/test/Function2D_Cuda_Test.cpp @@ -807,10 +807,55 @@ TEST_F(Function2D_Cuda_Test, dot) { Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix(); auto result1 = Aurora::dot(matrixHost1, matrixHost2); auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix(); - std::cout<< result1.getDataSize(); ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize()); for (size_t i = 0; i < result1.getDataSize(); i++) { EXPECT_FLOAT_AE(result1[i], result2[i]); } +} + +TEST_F(Function2D_Cuda_Test, prod) { + auto matrixHost = Aurora::Matrix::fromRawData(new float[20], 4,5); + for(unsigned int i=0; i<20;++i) + { + matrixHost[i] = i + 1; + } + auto matrixDevice = matrixHost.toDeviceMatrix(); + auto result1 = Aurora::prod(matrixHost); + auto result2 = Aurora::prod(matrixDevice).toHostMatrix(); + ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize()); + for (size_t i = 0; i < result1.getDataSize(); i++) + { + EXPECT_FLOAT_AE(result1[i], result2[i]); + } + + matrixHost = Aurora::Matrix::fromRawData(new float[20], 20,1); + for(unsigned int i=0; i<20;++i) + { + matrixHost[i] = i + 1; + } + matrixDevice = matrixHost.toDeviceMatrix(); + result1 = Aurora::prod(matrixHost); + result2 = Aurora::prod(matrixDevice).toHostMatrix(); + ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize()); + for (size_t i = 0; i < result1.getDataSize(); i++) + { + EXPECT_FLOAT_AE(result1[i], result2[i]); + } + + auto matrixHostComplex = Aurora::Matrix::fromRawData(new float[40], 4,5, 1,Aurora::Complex); + for(unsigned int i=0; i<40;++i) + { + matrixHost[i] = i + 1; + } + + matrixDevice = matrixHostComplex.toDeviceMatrix(); + result1 = Aurora::prod(matrixHostComplex); + result2 = Aurora::prod(matrixDevice).toHostMatrix(); + ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize()); + ASSERT_FLOAT_EQ(result1.getValueType(), result2.getValueType()); + for (size_t i = 0; i < result1.getDataSize() * result1.getValueType(); i++) + { + EXPECT_FLOAT_AE(result1[i], result2[i]); + } } \ No newline at end of file