diff --git a/src/Function1D.cu b/src/Function1D.cu
index ea42b5a..5186021 100644
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
@@ -1590,3 +1590,84 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
     delete [] indexResult;
     return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
 }
+
+/**
+ * Decodes packed 16-bit samples into floats, one thread per element.
+ *
+ * Each input word is split into a sign (bit 15), a 4-bit exponent
+ * (bits 11-14) and an 11-bit fraction (bits 0-10). Bit 11 of the mantissa
+ * (the hidden bit) is set for positive words with a non-zero exponent and
+ * for negative words with a zero exponent; negative words also get bits
+ * 12-31 set. The mantissa is shifted left by (exponent - 1) when
+ * exponent > 1, reinterpreted as a signed int and stored as a float.
+ *
+ * Expects a 1D launch covering at least `size` threads; threads past the
+ * end return without touching memory.
+ *
+ * @param aSrc device pointer to `size` packed input words
+ * @param aDes device pointer to `size` output floats
+ * @param size number of elements to convert
+ */
+__global__ void convertValueKernel(const short* __restrict__ aSrc, float* __restrict__ aDes, size_t size){
+    constexpr unsigned int kExponentMask = 0xFu;        // exponent field after the shift
+    constexpr unsigned int kFractionMask = 0x7FFu;      // low 11 bits
+    constexpr unsigned int kHiddenBit    = 0x800u;      // bit 11
+    constexpr unsigned int kNegativeFill = 0xFFFFF000u; // UINT32_MAX - 4095
+
+    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+    if (idx >= size) return;
+
+    // Work on the raw bit pattern so the shifts are not affected by the sign.
+    const unsigned int raw = static_cast<unsigned short>(aSrc[idx]);
+    const unsigned int exponent = (raw >> 11) & kExponentMask;
+    const unsigned int fraction = raw & kFractionMask;
+    const bool negative = (raw >> 15) != 0u;
+    const bool hidden = negative ? (exponent == 0u) : (exponent != 0u);
+
+    unsigned int mantissa = fraction;
+    if (hidden) mantissa += kHiddenBit;
+    if (negative) mantissa += kNegativeFill;
+    if (exponent > 1u) mantissa <<= (exponent - 1u);
+
+    // Convert through int so negative samples keep their sign.
+    aDes[idx] = static_cast<float>(static_cast<int>(mantissa));
+}
+
+/**
+ * Uploads packed 16-bit samples to the GPU and decodes them to floats.
+ *
+ * @param aData    host pointer to aRows * aColumns packed samples
+ * @param aRows    row count passed on to the resulting matrix
+ * @param aColumns column count passed on to the resulting matrix
+ * @return the result of CudaMatrix::fromRawData on the device output buffer
+ */
+CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns)
+{
+    // Multiply in size_t so the element count cannot overflow int.
+    const size_t count = (aRows > 0 && aColumns > 0)
+        ? static_cast<size_t>(aRows) * static_cast<size_t>(aColumns) : 0;
+    const size_t inputBytes = count * sizeof(short);
+
+    short* input = nullptr;
+    cudaMalloc((void**)&input, inputBytes);
+    cudaMemcpy(input, aData, inputBytes, cudaMemcpyHostToDevice);
+
+    // Each 16-bit word becomes a 32-bit float, so the output is twice as large.
+    float* output = nullptr;
+    cudaMalloc((void**)&output, count * sizeof(float));
+
+    // A launch with zero blocks is an invalid configuration; skip empty input.
+    if (count > 0) {
+        const unsigned int blocksPerGrid =
+            static_cast<unsigned int>((count + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
+        convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(input, output, count);
+        // Wait for the kernel before releasing the buffer it reads from.
+        cudaDeviceSynchronize();
+    }
+
+    // The staging copy is only read by the kernel, so release it here.
+    cudaFree(input);
+    // NOTE(review): CUDA return codes are not checked; confirm the project's
+    // error-reporting convention before adding checks.
+    return CudaMatrix::fromRawData(output, aRows, aColumns);
+}
diff --git a/src/Function1D.cuh b/src/Function1D.cuh
index 5934c75..a0f3380 100644
--- a/src/Function1D.cuh
+++ b/src/Function1D.cuh
@@ -114,6 +114,8 @@ namespace Aurora
 
     CudaMatrix auroraNot(CudaMatrix&& aMatrix);
 
+    CudaMatrix convertfp16tofloatCuda(short* aData, int aRows, int aColumns);
+
     // ------compareSet----------------------------------------------------
     void compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op);
     void compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,float compareValue, float newValue,CompareOp op);
diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp
index 769cc4e..89f7142 100644
--- a/test/Function1D_Cuda_Test.cpp
+++ b/test/Function1D_Cuda_Test.cpp
@@ -1,5 +1,6 @@
 #include <gtest/gtest.h>
 
+#include "MatlabReader.h"
 #include "CudaMatrix.h"
 #include "Matrix.h"
 #include "TestUtility.h"
@@ -1149,3 +1150,16 @@ TEST_F(Function1D_Cuda_Test, uniqueByRows) {
         EXPECT_FLOAT_AE(indexResult1[i], indexResult2[i]);
     }
 }
+
+TEST_F(Function1D_Cuda_Test, convertfp16tofloat) {
+    MatlabReader m("/home/krad/TestData/convertReal.mat");
+
+    size_t count = 0;
+    auto input = m.readint16("input",count);
+    auto resultM = Aurora::convertfp16tofloat(input.get(),count,1);
+    auto resultC = Aurora::convertfp16tofloatCuda(input.get(),count,1);
+    for (size_t i = 0; i<100; i++) {
+ EXPECT_FLOAT_EQ(resultC.getValue(i), resultM[i])<<",index:"<