Change convertfp16tofloatCuda to take a CudaMatrix as input.
This commit is contained in:
@@ -1617,7 +1617,7 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
|
|||||||
return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
|
return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
|
__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
|
||||||
__shared__ ushort CONVERT_AND_VALUE;
|
__shared__ ushort CONVERT_AND_VALUE;
|
||||||
__shared__ ushort CONVERT_AND_VALUE_2;
|
__shared__ ushort CONVERT_AND_VALUE_2;
|
||||||
__shared__ ushort CONVERT_MUL_VALUE;
|
__shared__ ushort CONVERT_MUL_VALUE;
|
||||||
@@ -1645,18 +1645,14 @@ __global__ void convertValueKernel(short* aSrc ,float* aDes, unsigned int size){
|
|||||||
aDes[idx] = (float)ret;
|
aDes[idx] = (float)ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
CudaMatrix Aurora::convertfp16tofloatCuda(short* aData, int aRows, int aColumns)
|
CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns)
|
||||||
{
|
{
|
||||||
unsigned int size = aRows*aColumns;
|
unsigned int size = aRows*aColumns;
|
||||||
unsigned int short_size = size*sizeof(short);
|
|
||||||
short* input = nullptr;
|
|
||||||
cudaMalloc((void**)&input, short_size);
|
|
||||||
cudaMemcpy(input, aData, short_size, cudaMemcpyHostToDevice);
|
|
||||||
//uint16 is converted to float (32-bit), so the output is twice the size
|
//uint16 is converted to float (32-bit), so the output is twice the size
|
||||||
float* output = nullptr;
|
float* output = nullptr;
|
||||||
cudaMalloc((void**)&output,size*sizeof(float));
|
cudaMalloc((void**)&output,size*sizeof(float));
|
||||||
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
||||||
convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(input,output, size);
|
convertValueKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aData.getData(), output, size);
|
||||||
cudaFree(input);
|
cudaDeviceSynchronize();
|
||||||
return CudaMatrix::fromRawData(output, aRows, aColumns);
|
return CudaMatrix::fromRawData(output, aRows, aColumns);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -116,7 +116,7 @@ namespace Aurora
|
|||||||
|
|
||||||
CudaMatrix auroraNot(CudaMatrix&& aMatrix);
|
CudaMatrix auroraNot(CudaMatrix&& aMatrix);
|
||||||
|
|
||||||
CudaMatrix convertfp16tofloatCuda(short* aData, int aRows, int aColumns);
|
CudaMatrix convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns);
|
||||||
|
|
||||||
// ------compareSet----------------------------------------------------
|
// ------compareSet----------------------------------------------------
|
||||||
void compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op);
|
void compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op);
|
||||||
|
|||||||
@@ -1165,10 +1165,12 @@ TEST_F(Function1D_Cuda_Test, convertfp16tofloat) {
|
|||||||
|
|
||||||
size_t count = 0;
|
size_t count = 0;
|
||||||
auto input = m.readint16("input",count);
|
auto input = m.readint16("input",count);
|
||||||
|
float* inputFloat = new float[count];
|
||||||
|
std::copy(input.get(),input.get() + count, inputFloat);
|
||||||
|
Aurora::CudaMatrix inputDevice = Aurora::Matrix::fromRawData(inputFloat, count).toDeviceMatrix();
|
||||||
auto resultM = Aurora::convertfp16tofloat(input.get(),count,1);
|
auto resultM = Aurora::convertfp16tofloat(input.get(),count,1);
|
||||||
auto resultC = Aurora::convertfp16tofloatCuda(input.get(),count,1);
|
auto resultC = Aurora::convertfp16tofloatCuda(inputDevice,count,1);
|
||||||
for (size_t i = 0; i<100; i++) {
|
for (size_t i = 0; i<100; i++) {
|
||||||
|
|
||||||
EXPECT_FLOAT_EQ(resultC.getValue(i), resultM[i])<<",index:"<<i;
|
EXPECT_FLOAT_EQ(resultC.getValue(i), resultM[i])<<",index:"<<i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user