Add vecnorm and unit test

This commit is contained in:
sunwen
2023-11-30 17:55:02 +08:00
parent a8c65f21b1
commit 7db741502e
3 changed files with 134 additions and 0 deletions

View File

@@ -1118,3 +1118,97 @@ CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix
return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
}
// Computes the 1-norm of each column of a column-major matrix: the sum of
// absolute values (real data) or complex magnitudes (interleaved re/im pairs).
// Launch layout: one block per column (gridDim.x == column count),
// blockDim.x == THREADS_PER_BLOCK, which must be a power of two for the
// tree reduction below.
// aInputData    device pointer to the matrix data.
// aInputRowSize number of rows per column.
// aOutput       device pointer receiving one float per column.
// aIsComplex    true when aInputData holds interleaved (re, im) pairs.
__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
__shared__ float sharedValue[THREADS_PER_BLOCK];
sharedValue[threadIdx.x] = 0;
if(aIsComplex)
{
// Each thread accumulates a strided partial sum of |z| = sqrt(re^2 + im^2).
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
{
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
if(indexByRows < aInputRowSize)
{
unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
// sqrtf keeps the math in single precision (sqrt would promote to double).
sharedValue[threadIdx.x] += sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
}
}
}
else
{
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
{
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
if(indexByRows < aInputRowSize)
{
// fabsf: explicit float absolute value (abs risks the integer overload).
sharedValue[threadIdx.x] += fabsf(aInputData[blockIdx.x*aInputRowSize + indexByRows]);
}
}
}
__syncthreads();
// Power-of-two tree reduction of the per-thread partial sums.
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
{
if(threadIdx.x < i)
{
sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
}
__syncthreads();
}
// Only thread 0 publishes the column result; the original let every thread
// store the same value, which is a redundant (benign but racy) write.
if(threadIdx.x == 0)
{
aOutput[blockIdx.x] = sharedValue[0];
}
}
// Computes the 2-norm (Euclidean norm) of each column of a column-major
// matrix: sqrt of the sum of squares (for complex data, re^2 + im^2 of each
// interleaved pair). Launch layout: one block per column
// (gridDim.x == column count), blockDim.x == THREADS_PER_BLOCK, which must
// be a power of two for the tree reduction below.
// aInputData    device pointer to the matrix data.
// aInputRowSize number of rows per column.
// aOutput       device pointer receiving one float per column.
// aIsComplex    true when aInputData holds interleaved (re, im) pairs.
__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
__shared__ float sharedValue[THREADS_PER_BLOCK];
sharedValue[threadIdx.x] = 0;
// Each thread accumulates a strided partial sum of squares.
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
{
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
if(indexByRows < aInputRowSize)
{
if(aIsComplex)
{
unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
sharedValue[threadIdx.x] += aInputData[2 * idx] * aInputData[2 * idx];
sharedValue[threadIdx.x] += aInputData[2 * idx + 1] * aInputData[2 * idx + 1];
}
else
{
sharedValue[threadIdx.x] += aInputData[blockIdx.x*aInputRowSize + indexByRows] * aInputData[blockIdx.x*aInputRowSize + indexByRows];
}
}
}
__syncthreads();
// Power-of-two tree reduction of the per-thread partial sums.
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
{
if(threadIdx.x < i)
{
sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
}
__syncthreads();
}
// Only thread 0 publishes the column result; sqrtf keeps the final square
// root in single precision (the original used double-precision sqrt and
// let every thread perform the same redundant store).
if(threadIdx.x == 0)
{
aOutput[blockIdx.x] = sqrtf(sharedValue[0]);
}
}
// Computes per-column vector norms of aMatrix on the GPU.
// aMatrix     input matrix (real or complex); must not be null.
// aNormMethod Norm1 or Norm2 (NormF is rejected).
// aDim        reduction dimension; only aDim == 1 (one result per column)
//             is supported for now.
// Returns a real matrix holding one norm per column, or a null CudaMatrix
// on unsupported arguments or device-allocation failure.
CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim)
{
//only support aDim = 1 for now.
if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull())
{
return CudaMatrix();
}
unsigned int column = aMatrix.getDimSize(1);
float* data = nullptr;
// Check the allocation; the original ignored the cudaMalloc status and
// would have launched kernels on a null output buffer on failure.
if(cudaMalloc((void**)&data, sizeof(float) * column) != cudaSuccess)
{
return CudaMatrix();
}
// One block per column; each kernel reduces its column internally.
if(aNormMethod == Aurora::Norm1)
{
vecnorm1Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
}
else if(aNormMethod == Aurora::Norm2)
{
vecnorm2Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
}
else
{
// Any other norm method: release the buffer instead of wrapping
// uninitialized device memory (the original leaked and returned garbage).
cudaFree(data);
return CudaMatrix();
}
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, column);
}

View File

@@ -69,6 +69,8 @@ namespace Aurora
CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
CudaMatrix vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim);
// ------compareSet----------------------------------------------------