Add vecnorm and unit test
This commit is contained in:
@@ -1118,3 +1118,97 @@ CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix
|
||||
return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
|
||||
}
|
||||
|
||||
/**
 * Computes the 1-norm (sum of element magnitudes) of one vector per thread block.
 *
 * Block blockIdx.x reads the aInputRowSize elements that start at element
 * offset blockIdx.x * aInputRowSize and writes their summed magnitude to
 * aOutput[blockIdx.x]. When aIsComplex is true every element is an interleaved
 * (real, imaginary) pair of floats and its magnitude is sqrt(re^2 + im^2);
 * otherwise the magnitude is the absolute value of the float.
 *
 * Launch requirements:
 *  - blockDim.x must not exceed THREADS_PER_BLOCK (size of the shared buffer);
 *  - blockDim.x must be a power of two, because the tree reduction below halves
 *    the active range on every step.
 *
 * @param aInputData    input values (interleaved re/im pairs when aIsComplex).
 * @param aInputRowSize number of elements in each vector.
 * @param aOutput       receives one float per block.
 * @param aIsComplex    true when aInputData holds complex values.
 */
__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];

    // Offsets are computed in 64 bits: blockIdx.x * aInputRowSize (and 2 * idx
    // for complex data) can exceed the 32-bit range for large matrices.
    const size_t columnStart = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Each thread accumulates every blockDim.x-th element of the vector.
    float partialSum = 0.0f;
    for (size_t row = threadIdx.x; row < aInputRowSize; row += blockDim.x)
    {
        const size_t idx = columnStart + row;
        if (aIsComplex)
        {
            const float re = aInputData[2 * idx];
            const float im = aInputData[2 * idx + 1];
            partialSum += sqrtf(re * re + im * im);
        }
        else
        {
            partialSum += fabsf(aInputData[idx]);
        }
    }
    sharedValue[threadIdx.x] = partialSum;
    __syncthreads();

    // Tree reduction of the per-thread partial sums in shared memory.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + stride];
        }
        // Outside the branch so that every thread of the block reaches the barrier.
        __syncthreads();
    }

    // A single writer is sufficient; sharedValue[0] now holds the block total.
    if (threadIdx.x == 0)
    {
        aOutput[blockIdx.x] = sharedValue[0];
    }
}
|
||||
|
||||
/**
 * Computes the 2-norm (square root of the sum of squared magnitudes) of one
 * vector per thread block.
 *
 * Block blockIdx.x reads the aInputRowSize elements that start at element
 * offset blockIdx.x * aInputRowSize and writes the result to
 * aOutput[blockIdx.x]. When aIsComplex is true every element is an interleaved
 * (real, imaginary) pair of floats and contributes re^2 + im^2; otherwise it
 * contributes the square of the float.
 *
 * Launch requirements:
 *  - blockDim.x must not exceed THREADS_PER_BLOCK (size of the shared buffer);
 *  - blockDim.x must be a power of two, because the tree reduction below halves
 *    the active range on every step.
 *
 * @param aInputData    input values (interleaved re/im pairs when aIsComplex).
 * @param aInputRowSize number of elements in each vector.
 * @param aOutput       receives one float per block.
 * @param aIsComplex    true when aInputData holds complex values.
 */
__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];

    // Offsets are computed in 64 bits: blockIdx.x * aInputRowSize (and 2 * idx
    // for complex data) can exceed the 32-bit range for large matrices.
    const size_t columnStart = static_cast<size_t>(blockIdx.x) * aInputRowSize;

    // Each thread accumulates the squares of every blockDim.x-th element.
    float partialSum = 0.0f;
    for (size_t row = threadIdx.x; row < aInputRowSize; row += blockDim.x)
    {
        const size_t idx = columnStart + row;
        if (aIsComplex)
        {
            const float re = aInputData[2 * idx];
            const float im = aInputData[2 * idx + 1];
            partialSum += re * re;
            partialSum += im * im;
        }
        else
        {
            const float value = aInputData[idx];
            partialSum += value * value;
        }
    }
    sharedValue[threadIdx.x] = partialSum;
    __syncthreads();

    // Tree reduction of the per-thread partial sums in shared memory.
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + stride];
        }
        // Outside the branch so that every thread of the block reaches the barrier.
        __syncthreads();
    }

    // A single writer is sufficient; sharedValue[0] now holds the sum of squares.
    if (threadIdx.x == 0)
    {
        aOutput[blockIdx.x] = sqrtf(sharedValue[0]);
    }
}
|
||||
|
||||
/**
 * Computes a vector norm over the first dimension of aMatrix on the GPU.
 *
 * One thread block is launched per index of dimension 1; each block reduces
 * getDimSize(0) elements with vecnorm1Kernel (Norm1) or vecnorm2Kernel (Norm2).
 *
 * @param aMatrix     input matrix; real or complex (see isComplex()).
 * @param aNormMethod Norm1 or Norm2. NormF is not supported.
 * @param aDim        dimension to reduce over; only 1 is supported for now.
 * @return a CudaMatrix built from a device buffer holding getDimSize(1) floats,
 *         or an empty CudaMatrix when the arguments are unsupported, the input
 *         is null, or a CUDA allocation/launch/execution error occurs.
 */
CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim)
{
    //only support aDim = 1 for now.
    if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull())
    {
        return CudaMatrix();
    }

    // Reject any other method up front so the output buffer is never returned
    // without a kernel having written to it.
    const bool useNorm1 = (aNormMethod == Aurora::Norm1);
    const bool useNorm2 = (aNormMethod == Aurora::Norm2);
    if(!useNorm1 && !useNorm2)
    {
        return CudaMatrix();
    }

    unsigned int column = aMatrix.getDimSize(1);
    float* data = nullptr;
    if(cudaMalloc((void**)&data, sizeof(float) * column) != cudaSuccess || data == nullptr)
    {
        // Allocation failed (or nothing to allocate): there is no result to wrap.
        return CudaMatrix();
    }

    if(useNorm1)
    {
        vecnorm1Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
    }
    else
    {
        vecnorm2Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
    }

    // Launch-configuration errors are reported by cudaGetLastError(); faults
    // raised while the kernel runs surface at the synchronizing call.
    cudaError_t status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        cudaFree(data);
        return CudaMatrix();
    }

    return Aurora::CudaMatrix::fromRawData(data,column);
}
|
||||
|
||||
@@ -69,6 +69,8 @@ namespace Aurora
|
||||
|
||||
CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
|
||||
|
||||
/**
 * Vector norm of aMatrix computed on the GPU.
 *
 * The current implementation handles only aDim == 1 with Norm1 or Norm2 and
 * produces one value per index of dimension 1. An empty CudaMatrix is returned
 * when aDim != 1, when aNormMethod is NormF, or when aMatrix is null.
 */
CudaMatrix vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim);
|
||||
|
||||
// ------compareSet----------------------------------------------------
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user