diff --git a/src/common/convertfp16tofloat.cpp b/src/common/convertfp16tofloat.cpp new file mode 100644 index 0000000..6ecfa1c --- /dev/null +++ b/src/common/convertfp16tofloat.cpp @@ -0,0 +1,77 @@ +#include "convertfp16tofloat.h" + +#include "Function.h" +#include +#include +#include + +Aurora::Matrix Recon::convertfp16tofloat(Aurora::Matrix aMatrix) { + auto input = aMatrix.getData(); + // uint16变换为float(32位)输出大小翻倍 + auto output = Aurora::malloc(aMatrix.getDataSize() * 4); + size_t rows = aMatrix.getDataSize() * sizeof(double) / sizeof(short); + size_t total_count = aMatrix.getDataSize(); + const ushort CONVERT_AND_VALUE = 15; + // andblack + __m128i andBlock = _mm_set_epi16(15, 15, 15, 15, 15, 15, 15, 15); + __m128i andBlock2 = + _mm_set_epi16(2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047); + __m128i zeroBlock = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + __m128i oneBlock = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + __m128i twokBlock = + _mm_set_epi16(2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048); + uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; + +#pragma omp parallel for + for (size_t i = 0; i < total_count; i += 2) { + // 循环展开以避免过度的线程调用 + if (i + 2 < total_count) { + auto ptr = (short *)(input + i); + // 初始化值 + auto value = _mm_set_epi16(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], + ptr[6], ptr[7]); + auto uvalue = _mm_set_epi16( + (ushort)ptr[0], (ushort)ptr[1], (ushort)ptr[2], (ushort)ptr[3], + (ushort)ptr[4], (ushort)ptr[5], (ushort)ptr[6], (ushort)ptr[7]); + // 位移 + auto sign_bit = _mm_srli_epi16(value, 15); // 右移16位取符号位 + auto exponent = _mm_srli_epi16(uvalue, 11); + // and + exponent = _mm_and_si128(exponent, andBlock); + // and ,then convert to int 32 bits + auto fraction3 = _mm256_cvtepi16_epi32(_mm_and_si128(uvalue, andBlock2)); + auto hidden_bit_mask = + (_mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ) & + _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_EQ)) | + (_mm_cmp_epi16_mask(sign_bit, zeroBlock, _MM_CMPINT_EQ) & + _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_NE)); + auto hidden_bit16 = _mm_maskz_set1_epi16(hidden_bit_mask, 2048); + auto hidden_bit32 = _mm256_cvtepi16_epi32(hidden_bit16); + auto outputBlock = _mm256_add_epi32(fraction3, hidden_bit32); + auto sign_bit_add_value = _mm256_maskz_set1_epi32( + _mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ), + CONVERT_ADD_VALUE); + outputBlock = _mm256_add_epi32(outputBlock, sign_bit_add_value); + auto exponent_mask = + _mm_cmp_epi16_mask(oneBlock, exponent, _MM_CMPINT_LT); + exponent = _mm_sub_epi16(exponent, oneBlock); + auto exponent32 = _mm256_cvtepi16_epi32(exponent); + auto zeroBlock32 = _mm256_cvtepi16_epi32(zeroBlock); + auto offsetCount = + _mm256_mask_blend_epi32(exponent_mask, zeroBlock32, exponent32); + + outputBlock = _mm256_sllv_epi32(outputBlock, offsetCount); + double *des = output + i * 4; + des[7] = (double)(int)_mm256_extract_epi32(outputBlock, 0); + des[6] = (double)(int)_mm256_extract_epi32(outputBlock, 1); + des[5] = (double)(int)_mm256_extract_epi32(outputBlock, 2); + des[4] = (double)(int)_mm256_extract_epi32(outputBlock, 3); + des[3] = (double)(int)_mm256_extract_epi32(outputBlock, 4); + des[2] = (double)(int)_mm256_extract_epi32(outputBlock, 5); + des[1] = (double)(int)_mm256_extract_epi32(outputBlock, 6); + des[0] = (double)(int)_mm256_extract_epi32(outputBlock, 7); + } + } + return Aurora::Matrix::New(output, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2)); +} \ No newline at end of file diff --git a/src/common/convertfp16tofloat.h b/src/common/convertfp16tofloat.h new file mode 100644 index 0000000..f018169 --- /dev/null +++ b/src/common/convertfp16tofloat.h @@ -0,0 +1,10 @@ +#ifndef __CONVERTFP16TOFLOAT_H__ +#define __CONVERTFP16TOFLOAT_H__ +#include "Matrix.h" +namespace Recon { + + Aurora::Matrix convertfp16tofloat(Aurora::Matrix aMatrix); + +} + +#endif // __CONVERTFP16TOFLOAT_H__ \ No newline at end of file diff --git a/test/Common_Test.cpp b/test/Common_Test.cpp index cc8332f..45e0c11 100644 --- a/test/Common_Test.cpp +++ b/test/Common_Test.cpp @@ -1,6 +1,7 @@ #include #include "common/ceMatchedFilterHandling.h" +#include "common/convertfp16tofloat.h" #include "MatlabReader.h" inline double fourDecimalRound(double src){ @@ -40,3 +41,18 @@ TEST_F(Common_Test, adaptFrequency) { } } + +TEST_F(Common_Test, convertfp16tofloat) { + MatlabReader m("/home/krad/TestData/convertReal.mat"); + + size_t count = 0; + auto input = m.readint16("input",count); + auto ma = Aurora::Matrix::copyFromRawData((double*)input.get(),count/4); + auto resultM = Recon::convertfp16tofloat(ma); + auto result = resultM.getData(); + auto output = m.read("output"); + for (size_t i = 0; i<10; i++) { + EXPECT_EQ(result[i], output.getData()[i])<<"index:"<