Add prod and unittest.
This commit is contained in:
@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
|
||||
return CudaMatrix::fromRawData(data, 1, column);
|
||||
}
|
||||
|
||||
__global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
|
||||
{
|
||||
__shared__ float sharedValue[THREADS_PER_BLOCK];
|
||||
sharedValue[threadIdx.x] = 1;
|
||||
|
||||
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
|
||||
{
|
||||
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
|
||||
if(indexByRows < aInputRowSize)
|
||||
{
|
||||
sharedValue[threadIdx.x] *= aInputData[blockIdx.x*aInputRowSize + indexByRows];
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
|
||||
{
|
||||
if(threadIdx.x < i)
|
||||
{
|
||||
sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i];
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
aOutputData[blockIdx.x] = sharedValue[0];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
|
||||
{
|
||||
__shared__ float sharedValue[THREADS_PER_BLOCK * 2];
|
||||
unsigned int complexIdx = threadIdx.x * 2;
|
||||
|
||||
sharedValue[complexIdx] = 1;
|
||||
sharedValue[complexIdx + 1] = 0;
|
||||
|
||||
for(unsigned int i=0; i<=(aInputRowSize/blockDim.x); ++i)
|
||||
{
|
||||
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
|
||||
if(indexByRows < aInputRowSize)
|
||||
{
|
||||
unsigned int index = 2 * (blockIdx.x*aInputRowSize + indexByRows);
|
||||
float real = sharedValue[complexIdx] * aInputData[index] - sharedValue[complexIdx + 1] * aInputData[index + 1];
|
||||
float imag = sharedValue[complexIdx] * aInputData[index + 1] + sharedValue[complexIdx + 1] * aInputData[index];
|
||||
sharedValue[complexIdx] = real;
|
||||
sharedValue[complexIdx + 1] = imag;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
|
||||
{
|
||||
if(threadIdx.x < i)
|
||||
{
|
||||
unsigned int index = 2 * (threadIdx.x + i);
|
||||
float real = sharedValue[complexIdx] * sharedValue[index] - sharedValue[complexIdx + 1] * sharedValue[index + 1];
|
||||
float imag = sharedValue[complexIdx] * sharedValue[index + 1] + sharedValue[complexIdx + 1] * sharedValue[index];
|
||||
sharedValue[complexIdx] = real;
|
||||
sharedValue[complexIdx + 1] = imag;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
{
|
||||
aOutputData[2 * blockIdx.x] = sharedValue[0];
|
||||
aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
|
||||
{
|
||||
if (aMatrix.getDimSize(2) > 1 )
|
||||
{
|
||||
std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
|
||||
return CudaMatrix();
|
||||
}
|
||||
|
||||
unsigned int row = aMatrix.getDimSize(0);
|
||||
unsigned int column = aMatrix.getDimSize(1);
|
||||
|
||||
if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
|
||||
{
|
||||
column = 1;
|
||||
row = aMatrix.getDataSize();
|
||||
}
|
||||
|
||||
float* data = nullptr;
|
||||
cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
|
||||
if(aMatrix.isComplex())
|
||||
{
|
||||
prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
|
||||
}
|
||||
else
|
||||
{
|
||||
prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief
|
||||
*
|
||||
|
||||
@@ -47,6 +47,13 @@ namespace Aurora
|
||||
|
||||
CudaMatrix dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction = Column);
|
||||
|
||||
/**
|
||||
* prod,支持到2维
|
||||
* @param aMatrix
|
||||
* @return
|
||||
*/
|
||||
CudaMatrix prod(const CudaMatrix &aMatrix);
|
||||
|
||||
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
||||
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
|
||||
|
||||
|
||||
@@ -807,10 +807,55 @@ TEST_F(Function2D_Cuda_Test, dot) {
|
||||
Aurora::CudaMatrix matrixDevice2 = matrixHost2.toDeviceMatrix();
|
||||
auto result1 = Aurora::dot(matrixHost1, matrixHost2);
|
||||
auto result2 = Aurora::dot(matrixDevice1, matrixDevice2).toHostMatrix();
|
||||
std::cout<< result1.getDataSize();
|
||||
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
|
||||
for (size_t i = 0; i < result1.getDataSize(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_AE(result1[i], result2[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(Function2D_Cuda_Test, prod) {
|
||||
auto matrixHost = Aurora::Matrix::fromRawData(new float[20], 4,5);
|
||||
for(unsigned int i=0; i<20;++i)
|
||||
{
|
||||
matrixHost[i] = i + 1;
|
||||
}
|
||||
auto matrixDevice = matrixHost.toDeviceMatrix();
|
||||
auto result1 = Aurora::prod(matrixHost);
|
||||
auto result2 = Aurora::prod(matrixDevice).toHostMatrix();
|
||||
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
|
||||
for (size_t i = 0; i < result1.getDataSize(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_AE(result1[i], result2[i]);
|
||||
}
|
||||
|
||||
matrixHost = Aurora::Matrix::fromRawData(new float[20], 20,1);
|
||||
for(unsigned int i=0; i<20;++i)
|
||||
{
|
||||
matrixHost[i] = i + 1;
|
||||
}
|
||||
matrixDevice = matrixHost.toDeviceMatrix();
|
||||
result1 = Aurora::prod(matrixHost);
|
||||
result2 = Aurora::prod(matrixDevice).toHostMatrix();
|
||||
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
|
||||
for (size_t i = 0; i < result1.getDataSize(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_AE(result1[i], result2[i]);
|
||||
}
|
||||
|
||||
auto matrixHostComplex = Aurora::Matrix::fromRawData(new float[40], 4,5, 1,Aurora::Complex);
|
||||
for(unsigned int i=0; i<40;++i)
|
||||
{
|
||||
matrixHost[i] = i + 1;
|
||||
}
|
||||
|
||||
matrixDevice = matrixHostComplex.toDeviceMatrix();
|
||||
result1 = Aurora::prod(matrixHostComplex);
|
||||
result2 = Aurora::prod(matrixDevice).toHostMatrix();
|
||||
ASSERT_FLOAT_EQ(result1.getDataSize(), result2.getDataSize());
|
||||
ASSERT_FLOAT_EQ(result1.getValueType(), result2.getValueType());
|
||||
for (size_t i = 0; i < result1.getDataSize() * result1.getValueType(); i++)
|
||||
{
|
||||
EXPECT_FLOAT_AE(result1[i], result2[i]);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user