Change convertfp16tofloatCuda to take a CudaMatrix input instead of a host short* buffer.

2023-12-22 11:09:58 +08:00
parent 3629a9f08c
commit cf247e7299
3 changed files with 9 additions and 11 deletions
--- a/src/Function1D.cu
+++ b/src/Function1D.cu
@@ -1617,7 +1617,7 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
    return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
 }

-__global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
+__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
    __shared__ ushort CONVERT_AND_VALUE;
    __shared__ ushort CONVERT_AND_VALUE_2;
    __shared__ ushort CONVERT_MUL_VALUE;
@@ -1645,18 +1645,14 @@ __global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
    aDes[idx] = (float)ret;
 }

-CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns)
+CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns)
 {
    unsigned int size = aRows*aColumns;
-    unsigned int short_size = size*sizeof(short);
-    short* input = nullptr;
-    cudaMalloc((void**)&input, short_size);
-    cudaMemcpy(input, aData, short_size, cudaMemcpyHostToDevice);
    //uint16变换为float(32位)输出大小翻倍
    float* output = nullptr;
    cudaMalloc((void**)&output,size*sizeof(float));
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(input,output, size);
-    cudaFree(input);
+    convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aData.getData(), output, size);
+    cudaDeviceSynchronize();
    return CudaMatrix::fromRawData(output, aRows, aColumns);
 }