Change convertfp16tofloatCuda to accept a CudaMatrix as input.
This commit is contained in:
@@ -1617,7 +1617,7 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
|
||||
return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
|
||||
}
|
||||
|
||||
__global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
|
||||
__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
|
||||
__shared__ ushort CONVERT_AND_VALUE;
|
||||
__shared__ ushort CONVERT_AND_VALUE_2;
|
||||
__shared__ ushort CONVERT_MUL_VALUE;
|
||||
@@ -1645,18 +1645,14 @@ __global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
|
||||
aDes[idx] = (float)ret;
|
||||
}
|
||||
|
||||
CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns)
|
||||
CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns)
|
||||
{
|
||||
unsigned int size = aRows*aColumns;
|
||||
unsigned int short_size = size*sizeof(short);
|
||||
short* input = nullptr;
|
||||
cudaMalloc((void**)&input, short_size);
|
||||
cudaMemcpy(input, aData, short_size, cudaMemcpyHostToDevice);
|
||||
// Converting uint16 to float (32-bit) doubles the output size
|
||||
float* output = nullptr;
|
||||
cudaMalloc((void**)&output,size*sizeof(float));
|
||||
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
||||
convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(input,output, size);
|
||||
cudaFree(input);
|
||||
convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aData.getData(), output, size);
|
||||
cudaDeviceSynchronize();
|
||||
return CudaMatrix::fromRawData(output, aRows, aColumns);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user