Add prod and unittest.
This commit is contained in:
@@ -1159,6 +1159,108 @@ CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, Func
|
||||
return CudaMatrix::fromRawData(data, 1, column);
|
||||
}
|
||||
|
||||
/**
 * @brief Product reduction of real data: each thread block multiplies one
 *        contiguous run of aInputRowSize floats into a single value.
 *
 * Block b reads aInputData[b * aInputRowSize .. b * aInputRowSize + aInputRowSize - 1]
 * and writes the product of those elements to aOutputData[b]. When
 * aInputRowSize is 0 the block writes 1 (the multiplicative identity).
 *
 * Launch requirements:
 *  - 1D grid, one block per output element.
 *  - blockDim.x must be a power of two and must not exceed THREADS_PER_BLOCK:
 *    the shared buffer holds THREADS_PER_BLOCK floats and the tree reduction
 *    halves the active range on every step.
 *
 * @param aInputData    device pointer to the input floats
 * @param aOutputData   device pointer receiving one float per block
 * @param aInputRowSize number of elements multiplied together by each block
 */
__global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];

    // Offsets are computed in 64 bit so that blockIdx.x * aInputRowSize cannot
    // wrap around for inputs with more than 2^32 elements.
    const size_t blockOffset = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Each thread folds every blockDim.x-th element into a register; threads
    // that find no element keep the identity 1 and do not affect the result.
    float partial = 1.0f;
    for (size_t element = threadIdx.x; element < aInputRowSize; element += blockDim.x)
    {
        partial *= aInputData[blockOffset + element];
    }
    sharedValue[threadIdx.x] = partial;
    __syncthreads();

    // Tree reduction in shared memory. The barrier sits outside the branch so
    // that every thread of the block reaches it on every step.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + stride];
        }
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        aOutputData[blockIdx.x] = sharedValue[0];
    }
}
|
||||
|
||||
/**
 * @brief Product reduction of complex data stored as interleaved
 *        (real, imaginary) float pairs: each thread block multiplies one
 *        contiguous run of aInputRowSize complex values into a single value.
 *
 * Block b reads the complex elements b * aInputRowSize .. b * aInputRowSize +
 * aInputRowSize - 1 (element k occupies floats 2k and 2k + 1) and writes the
 * complex product to aOutputData[2b] (real) and aOutputData[2b + 1]
 * (imaginary). When aInputRowSize is 0 the block writes 1 + 0i.
 *
 * Launch requirements:
 *  - 1D grid, one block per output element.
 *  - blockDim.x must be a power of two and must not exceed THREADS_PER_BLOCK:
 *    the shared buffer holds THREADS_PER_BLOCK complex values and the tree
 *    reduction halves the active range on every step.
 *
 * @param aInputData    device pointer to the interleaved complex input
 * @param aOutputData   device pointer receiving two floats per block
 * @param aInputRowSize number of complex elements multiplied by each block
 */
__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK * 2];
    const unsigned int complexIdx = threadIdx.x * 2;

    // Offsets are computed in 64 bit: with two floats per element the 32-bit
    // expression 2 * (blockIdx.x * aInputRowSize + row) wraps even earlier
    // than in the real-valued kernel.
    const size_t blockOffset = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Per-thread partial product kept in registers, starting at 1 + 0i.
    float partialReal = 1.0f;
    float partialImag = 0.0f;
    for (size_t element = threadIdx.x; element < aInputRowSize; element += blockDim.x)
    {
        const size_t index = 2 * (blockOffset + element);
        const float inReal = aInputData[index];
        const float inImag = aInputData[index + 1];
        // (a + bi)(c + di) = (ac - bd) + (ad + bc)i; both parts are computed
        // from the old partial before either is overwritten.
        const float real = partialReal * inReal - partialImag * inImag;
        const float imag = partialReal * inImag + partialImag * inReal;
        partialReal = real;
        partialImag = imag;
    }
    sharedValue[complexIdx] = partialReal;
    sharedValue[complexIdx + 1] = partialImag;
    __syncthreads();

    // Tree reduction in shared memory. The barrier sits outside the branch so
    // that every thread of the block reaches it on every step.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            const unsigned int other = 2 * (threadIdx.x + stride);
            const float real = sharedValue[complexIdx] * sharedValue[other] - sharedValue[complexIdx + 1] * sharedValue[other + 1];
            const float imag = sharedValue[complexIdx] * sharedValue[other + 1] + sharedValue[complexIdx + 1] * sharedValue[other];
            sharedValue[complexIdx] = real;
            sharedValue[complexIdx + 1] = imag;
        }
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        const size_t outIndex = 2 * static_cast<size_t>(blockIdx.x);
        aOutputData[outIndex] = sharedValue[0];
        aOutputData[outIndex + 1] = sharedValue[1];
    }
}
|
||||
|
||||
/**
 * @brief Product of the elements of a 1D or 2D matrix on the GPU.
 *
 * For a matrix with more than one row and more than one column, one product
 * is computed per column (dimension 1) over the elements of dimension 0, and
 * the result has shape 1 x column. When either of the first two dimensions
 * is 1 the input is treated as a vector and all getDataSize() elements are
 * multiplied into a single 1 x 1 result. Real and complex inputs are
 * dispatched to prodKernel and prodComplexKernel respectively; the result has
 * the same value type as the input.
 *
 * @param aMatrix input matrix; its third dimension must not exceed 1
 * @return the product matrix, or a default-constructed CudaMatrix when the
 *         input is 3D, has no columns, or a CUDA call fails (a message is
 *         written to std::cerr for the 3D and CUDA-failure cases)
 */
CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
{
    if (aMatrix.getDimSize(2) > 1 )
    {
        std::cerr<< "cuda prod() not support 3D data!"<< std::endl;
        return CudaMatrix();
    }

    unsigned int row = aMatrix.getDimSize(0);
    unsigned int column = aMatrix.getDimSize(1);

    // Row and column vectors are both reduced to a single scalar.
    if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1)
    {
        column = 1;
        row = aMatrix.getDataSize();
    }

    // A grid with zero blocks is an invalid launch configuration, and there
    // is nothing to compute anyway.
    if(column == 0)
    {
        return CudaMatrix();
    }

    // NOTE(review): the size relies on getValueType() being the number of
    // floats per element (1 for real, 2 for complex) - confirm against the enum.
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
    if(status != cudaSuccess)
    {
        std::cerr<< "cuda prod() failed to allocate output: "<< cudaGetErrorString(status)<< std::endl;
        return CudaMatrix();
    }

    // One block per output element; every block reduces `row` input elements.
    if(aMatrix.isComplex())
    {
        prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
    }
    else
    {
        prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
    }

    // Launch-configuration errors are reported by cudaGetLastError(); faults
    // raised while the kernel runs surface at the synchronization point.
    status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        std::cerr<< "cuda prod() kernel failed: "<< cudaGetErrorString(status)<< std::endl;
        cudaFree(data);
        return CudaMatrix();
    }

    // Presumably fromRawData() takes ownership of `data`, so it is not freed
    // here - verify against CudaMatrix.
    return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
}
|
||||
|
||||
/**
|
||||
* @brief
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user