refactor convertfp16tofloat

This commit is contained in:
kradchen
2023-05-09 17:44:06 +08:00
parent b04c5ff58f
commit 04608db7e0
2 changed files with 91 additions and 66 deletions

View File

@@ -4,35 +4,25 @@
#include <emmintrin.h> #include <emmintrin.h>
#include <immintrin.h> #include <immintrin.h>
#include <sys/types.h> #include <sys/types.h>
namespace {
Aurora::Matrix Recon::convertfp16tofloat(Aurora::Matrix aMatrix) {
auto input = aMatrix.getData();
// uint16变换为float(32位)输出大小翻倍
auto output = Aurora::malloc(aMatrix.getDataSize() * 4);
size_t rows = aMatrix.getDataSize() * sizeof(double) / sizeof(short);
size_t total_count = aMatrix.getDataSize();
const ushort CONVERT_AND_VALUE = 15; const ushort CONVERT_AND_VALUE = 15;
// andblack // andblack
__m128i andBlock = _mm_set_epi16(15, 15, 15, 15, 15, 15, 15, 15); const __m128i andBlock = _mm_set_epi16(15, 15, 15, 15, 15, 15, 15, 15);
__m128i andBlock2 = const __m128i andBlock2 =
_mm_set_epi16(2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047); _mm_set_epi16(2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047);
__m128i zeroBlock = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); const __m128i zeroBlock = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0);
__m128i oneBlock = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); const __m128i oneBlock = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1);
__m128i twokBlock = const __m128i twokBlock =
_mm_set_epi16(2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048); _mm_set_epi16(2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048);
uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; const uint CONVERT_ADD_VALUE = UINT32_MAX - 4095;
void convert(short * ptr, double* des,bool single = false){
#pragma omp parallel for
for (size_t i = 0; i < total_count; i += 2) {
// 循环展开以避免过度的线程调用
if (i + 2 < total_count) {
auto ptr = (short *)(input + i);
// 初始化值 // 初始化值
auto value = _mm_set_epi16(ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], ptr[5], auto value = _mm_set_epi16(ptr[0], ptr[1], ptr[2], ptr[3], single?ptr[0]:ptr[4], single?ptr[0]:ptr[5],
ptr[6], ptr[7]); single?ptr[0]:ptr[6], single?ptr[0]:ptr[7]);
auto uvalue = _mm_set_epi16( auto uvalue = _mm_set_epi16(
(ushort)ptr[0], (ushort)ptr[1], (ushort)ptr[2], (ushort)ptr[3], (ushort)ptr[0], (ushort)ptr[1], (ushort)ptr[2], (ushort)ptr[3],
(ushort)ptr[4], (ushort)ptr[5], (ushort)ptr[6], (ushort)ptr[7]); (ushort)(single?ptr[0]:ptr[4]), (ushort)(single?ptr[0]:ptr[5]),
(ushort)(single?ptr[0]:ptr[6]), (ushort)(single?ptr[0]:ptr[7]));
// 位移 // 位移
auto sign_bit = _mm_srli_epi16(value, 15); // shift right by 15 bits to extract the sign bit auto sign_bit = _mm_srli_epi16(value, 15); // shift right by 15 bits to extract the sign bit
auto exponent = _mm_srli_epi16(uvalue, 11); auto exponent = _mm_srli_epi16(uvalue, 11);
@@ -61,15 +51,50 @@ Aurora::Matrix Recon::convertfp16tofloat(Aurora::Matrix aMatrix) {
_mm256_mask_blend_epi32(exponent_mask, zeroBlock32, exponent32); _mm256_mask_blend_epi32(exponent_mask, zeroBlock32, exponent32);
outputBlock = _mm256_sllv_epi32(outputBlock, offsetCount); outputBlock = _mm256_sllv_epi32(outputBlock, offsetCount);
des[3] = _mm256_extract_epi32(outputBlock, 4);
des[2] = _mm256_extract_epi32(outputBlock, 5);
des[1] = _mm256_extract_epi32(outputBlock, 6);
des[0] = _mm256_extract_epi32(outputBlock, 7);
if(single) return;
des[7] = _mm256_extract_epi32(outputBlock, 0);
des[6] = _mm256_extract_epi32(outputBlock, 1);
des[5] = _mm256_extract_epi32(outputBlock, 2);
des[4] = _mm256_extract_epi32(outputBlock, 3);
}
}
Aurora::Matrix Recon::convertfp16tofloat(Aurora::Matrix aMatrix) {
auto input = aMatrix.getData();
// uint16变换为float(32位)输出大小翻倍
auto output = Aurora::malloc(aMatrix.getDataSize() * 4);
size_t rows = aMatrix.getDataSize() * sizeof(double) / sizeof(short);
size_t total_count = aMatrix.getDataSize();
#pragma omp parallel for
for (size_t i = 0; i < total_count; i += 8) {
// 循环展开以避免过度的线程调用
if (i < total_count) {
auto ptr = (short *)(input + i);
double *des = output + i * 4; double *des = output + i * 4;
des[7] = (double)(int)_mm256_extract_epi32(outputBlock, 0); ::convert(ptr, des,i+1>total_count);
des[6] = (double)(int)_mm256_extract_epi32(outputBlock, 1); }
des[5] = (double)(int)_mm256_extract_epi32(outputBlock, 2); if (i+2 < total_count) {
des[4] = (double)(int)_mm256_extract_epi32(outputBlock, 3); auto ptr = (short *)(input + i + 2);
des[3] = (double)(int)_mm256_extract_epi32(outputBlock, 4); double *des = output + (i+2) * 4;
des[2] = (double)(int)_mm256_extract_epi32(outputBlock, 5); ::convert(ptr, des,i+3>total_count);
des[1] = (double)(int)_mm256_extract_epi32(outputBlock, 6); }
des[0] = (double)(int)_mm256_extract_epi32(outputBlock, 7); if (i+4 < total_count) {
auto ptr = (short *)(input + i + 4);
double *des = output + (i+4) * 4;
::convert(ptr, des,i+5>total_count);
}
if (i+6 < total_count) {
auto ptr = (short *)(input + i + 6);
double *des = output + (i+6) * 4;
::convert(ptr, des,i+7>total_count);
} }
} }
return Aurora::Matrix::New(output, aMatrix.getDimSize(0), return Aurora::Matrix::New(output, aMatrix.getDimSize(0),

View File

@@ -51,7 +51,7 @@ TEST_F(Common_Test, convertfp16tofloat) {
auto resultM = Recon::convertfp16tofloat(ma); auto resultM = Recon::convertfp16tofloat(ma);
auto result = resultM.getData(); auto result = resultM.getData();
auto output = m.read("output"); auto output = m.read("output");
for (size_t i = 0; i<10; i++) { for (size_t i = 0; i<count; i++) {
EXPECT_EQ(result[i], output.getData()[i])<<"index:"<<i<<",input:"<< ((short*)ma.getData())[i]<<",input2:"<<input.get()[i]; EXPECT_EQ(result[i], output.getData()[i])<<"index:"<<i<<",input:"<< ((short*)ma.getData())[i]<<",input2:"<<input.get()[i];
} }