Add cuda convertfp16tofloat

This commit is contained in:
kradchen
2023-12-18 13:48:35 +08:00
parent b09257298c
commit fd7c71f7e9
4 changed files with 59 additions and 3 deletions

View File

@@ -1590,3 +1590,46 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
delete [] indexResult;
return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
}
/**
 * Decodes packed 16-bit samples into 32-bit floats, one element per thread.
 *
 * Each input word is read with the following layout (as used by the code
 * below; note this is NOT the IEEE 754 binary16 layout):
 *   bit  15     : sign (the word is negative when interpreted as short)
 *   bits 14..11 : 4-bit exponent
 *   bits 10..0  : 11-bit fraction
 *
 * The mantissa is built as a signed integer:
 *   - non-negative input: fraction, plus 2048 when the exponent is non-zero;
 *   - negative input:     fraction with the upper 20 bits set (two's
 *                         complement), plus 2048 when the exponent is zero.
 * For exponents greater than 1 the mantissa is shifted left by
 * (exponent - 1). The resulting signed integer is converted to float.
 *
 * Launch layout: 1D grid of 1D blocks; threads whose global index is
 * >= size do nothing, so the grid may overshoot the element count.
 * No shared memory is used.
 *
 * @param aSrc  device pointer to `size` packed 16-bit input words
 * @param aDes  device pointer to `size` floats receiving the decoded values
 * @param size  number of elements to convert
 */
__global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
    // Decode constants. These are compile-time values, so they are kept as
    // locals instead of per-block shared memory guarded by a barrier.
    const unsigned int kExponentMask = 15u;         // 4 exponent bits
    const unsigned int kFractionMask = 2047u;       // 11 fraction bits
    const unsigned int kHiddenBit    = 2048u;       // bit 11 of the mantissa
    const unsigned int kNegativeFill = 0xFFFFF000u; // UINT32_MAX - 4095: upper 20 bits set

    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx >= size) return;

    const short value = aSrc[idx];
    // Reinterpret as unsigned so the right shift does not sign-extend.
    const unsigned int raw      = (unsigned short)value;
    const unsigned int exponent = (raw >> 11) & kExponentMask;
    const unsigned int fraction = raw & kFractionMask;
    const bool negative         = value < 0;

    // The hidden bit is added for (negative, exponent == 0) and for
    // (non-negative, exponent != 0); it is omitted in the other two cases.
    const bool addHiddenBit = (negative == (exponent == 0u));

    // Unsigned arithmetic: the additions wrap modulo 2^32 by design so that
    // negative inputs end up as a two's-complement bit pattern.
    unsigned int mantissa = fraction
                          + (addHiddenBit ? kHiddenBit : 0u)
                          + (negative ? kNegativeFill : 0u);

    // Exponents 0 and 1 leave the mantissa unscaled.
    if (exponent > 1u) {
        mantissa <<= (exponent - 1u);
    }

    // Reinterpret the bit pattern as signed before converting to float.
    aDes[idx] = (float)(int)mantissa;
}
/**
 * Uploads a host array of packed 16-bit samples to the device, decodes every
 * element to float with convertValueKernel, and wraps the device result in a
 * CudaMatrix.
 *
 * @param aData     host pointer to aRows * aColumns packed 16-bit words
 * @param aRows     number of rows of the resulting matrix
 * @param aColumns  number of columns of the resulting matrix
 * @return CudaMatrix built by CudaMatrix::fromRawData from the device output
 *         buffer (presumably the matrix takes ownership of that buffer —
 *         verify against CudaMatrix::fromRawData).
 *
 * NOTE(review): CUDA return codes are not checked here; wire this into the
 * project's error-handling convention if one exists.
 */
CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns)
{
    unsigned int size = aRows*aColumns;
    // Byte counts are kept in size_t so large inputs are not truncated to 32 bits.
    size_t inputBytes = (size_t)size * sizeof(short);
    size_t outputBytes = (size_t)size * sizeof(float);

    // Stage the packed input on the device.
    short* input = nullptr;
    cudaMalloc((void**)&input, inputBytes);
    cudaMemcpy(input, aData, inputBytes, cudaMemcpyHostToDevice);

    // Converting uint16 to float (32-bit) doubles the size of the output.
    float* output = nullptr;
    cudaMalloc((void**)&output, outputBytes);

    // A grid of zero blocks is an invalid launch configuration, so skip the
    // launch entirely for an empty matrix.
    if (size > 0) {
        int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
        convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(input, output, size);
    }

    // The staging buffer is only needed by the kernel; release it so it does
    // not leak on every call. cudaFree waits for the outstanding kernel
    // before freeing the memory.
    cudaFree(input);

    return CudaMatrix::fromRawData(output, aRows, aColumns);
}