diff --git a/src/Function1D.cu b/src/Function1D.cu index c99e1ba..84c30c6 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -1617,7 +1617,7 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes return transpose(CudaMatrix::fromRawData(resultData, rows, columns)); } -__global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){ +__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){ __shared__ ushort CONVERT_AND_VALUE; __shared__ ushort CONVERT_AND_VALUE_2; __shared__ ushort CONVERT_MUL_VALUE; @@ -1645,18 +1645,14 @@ __global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){ aDes[idx] = (float)ret; } -CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns) +CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns) { unsigned int size = aRows*aColumns; - unsigned int short_size = size*sizeof(short); - short* input = nullptr; - cudaMalloc((void**)&input, short_size); - cudaMemcpy(input, aData, short_size, cudaMemcpyHostToDevice); //uint16变换为float(32位)输出大小翻倍 float* output = nullptr; cudaMalloc((void**)&output,size*sizeof(float)); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - convertValueKernel<<>>(input,output, size); - cudaFree(input); + convertValueKernel<<>>(aData.getData(), output, size); + cudaDeviceSynchronize(); return CudaMatrix::fromRawData(output, aRows, aColumns); } diff --git a/src/Function1D.cuh b/src/Function1D.cuh index 24761bc..8ead088 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -116,7 +116,7 @@ namespace Aurora CudaMatrix auroraNot(CudaMatrix&& aMatrix); - CudaMatrix convertfp16tofloatCuda(short* aData, int aRows, int aColumns); + CudaMatrix convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns); // ------compareSet---------------------------------------------------- void compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op); diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp index 688fe79..715f524 100644 --- a/test/Function1D_Cuda_Test.cpp +++ b/test/Function1D_Cuda_Test.cpp @@ -1165,10 +1165,12 @@ TEST_F(Function1D_Cuda_Test, convertfp16tofloat) { size_t count = 0; auto input = m.readint16("input",count); + float* inputFloat = new float[count]; + std::copy(input.get(),input.get() + count, inputFloat); + Aurora::CudaMatrix inputDevice = Aurora::Matrix::fromRawData(inputFloat, count).toDeviceMatrix(); auto resultM = Aurora::convertfp16tofloat(input.get(),count,1); - auto resultC = Aurora::convertfp16tofloatCuda(input.get(),count,1); + auto resultC = Aurora::convertfp16tofloatCuda(inputDevice,count,1); for (size_t i = 0; i<100; i++) { - EXPECT_FLOAT_EQ(resultC.getValue(i), resultM[i])<<",index:"<