From 410d657fe7f29325ffc61766cc3b937a22779c3f Mon Sep 17 00:00:00 2001 From: sunwen Date: Fri, 22 Dec 2023 11:17:18 +0800 Subject: [PATCH 1/2] commit dev-sun. --- CMakeLists.txt | 10 +- src/Aurora.cu | 201 +++++++++++ src/Aurora.h | 24 ++ src/common/convertfp16tofloat.cpp | 177 +++++----- .../getAScanBlockPreprocessed.cpp | 100 +++++- .../getAScanBlockPreprocessed.h | 18 + .../dataBlockCreation/getAscanBlock.cpp | 6 +- src/common/fileHelper.h | 4 +- src/common/preprocessAscanBlock.cpp | 11 + src/common/preprocessAscanBlock.h | 2 + src/config/config.cpp | 10 +- src/main.cxx | 79 ++++- .../startReflectionReconstruction.cpp.orig | 137 ++++++++ src/startReconstructions.cpp | 18 +- .../dataPreperation.cpp | 2 +- .../dataPreperation.h | 4 +- .../detection/detection.cpp | 40 ++- .../detection/detection.cu | 323 ++++++++++++++++++ .../detection/detection.cuh | 53 +++ .../detection/detection.h | 4 +- .../detection/getTransmissionData.cpp | 253 +++++++------- .../detection/getTransmissionData.cu | 60 ++++ .../detection/getTransmissionData.cuh | 10 + .../reconstruction/reconstruction.cpp | 48 ++- .../solvingEquationSystem/solve.cpp | 8 +- .../solvingEquationSystem/solve.h | 2 +- .../startTransmissionReconstruction.cpp | 9 +- 27 files changed, 1346 insertions(+), 267 deletions(-) create mode 100644 src/Aurora.cu create mode 100644 src/Aurora.h create mode 100644 src/reflectionReconstruction/startReflectionReconstruction.cpp.orig create mode 100644 src/transmissionReconstruction/detection/detection.cu create mode 100644 src/transmissionReconstruction/detection/detection.cuh create mode 100644 src/transmissionReconstruction/detection/getTransmissionData.cu create mode 100644 src/transmissionReconstruction/detection/getTransmissionData.cuh diff --git a/CMakeLists.txt b/CMakeLists.txt index 0739b5a..0f8080e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ project(UR) set(CMAKE_CXX_STANDARD 14) set(CMAKE_INCLUDE_CURRENT_DIR ON) - +add_definitions(-DUSE_CUDA) set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) @@ -20,8 +20,9 @@ file(GLOB_RECURSE cpp_files ./src/*.cpp) file(GLOB_RECURSE cu_files ./src/*.cu) file(GLOB_RECURSE cxx_files ./src/*.cxx) file(GLOB_RECURSE header_files ./src/*.h) -add_executable(UR ${cpp_files} ${cxx_files} ${header_files} ${Aurora_Source}) -target_compile_options(UR PUBLIC ${Aurora_Complie_Options} "-march=native") +add_executable(UR ${cpp_files} ${cu_files} ${cxx_files} ${header_files} ${Aurora_Source} ${Aurora_Source_Cu} ./src/Aurora.cu ${Aurora_DIR}/src/CudaMatrixPrivate.cu) + +#target_compile_options(UR PUBLIC ${Aurora_Complie_Options} "-march=native") target_include_directories(UR PUBLIC ./src/) target_include_directories(UR PUBLIC ${Aurora_INCLUDE_DIRS}) target_include_directories(UR PUBLIC ${DCMTK_INCLUDE_DIRS}) @@ -51,8 +52,9 @@ set_target_properties(UR PROPERTIES CUDA_SEPARABLE_COMPILATION ON) #                        -arch sm_75 #                        >) target_compile_options(UR PRIVATE $<$: - -arch=sm_75 + -arch=sm_75 --expt-extended-lambda >) + target_link_libraries(UR PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) # target_link_libraries(UR PUBLIC URDepends::SaftATT) diff --git a/src/Aurora.cu b/src/Aurora.cu new file mode 100644 index 0000000..eea20de --- /dev/null +++ b/src/Aurora.cu @@ -0,0 +1,201 @@ +#include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cufft.h" +#include +#include +#include +#include +#include + +#include "Aurora.h" +#include "AuroraDefs.h" +#include "CudaMatrix.h" +#include +#include "log/log.h" +#include + +__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + output[idx].x = input[idx]; + output[idx].y = 0; + } +} + +void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size) +{ + int threadsPerBlock = 1024; + int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + doubleToComplexKernel<<>>(input, output, size); + cudaDeviceSynchronize(); // 等待GPU完成操作 +} + +__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize) +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x*blockDim.x; + float maxResult = aInput[0]; + while (index < aSize) + { + if(maxResult < aInput[index]) + { + maxResult = aInput[index]; + } + index += stride; + } + +} + +void Aurora::max(const float* aInput, const float* aOutput, int aSize) +{ + int threadsPerBlock = 1024; + int blocksPerGrid = 68; + //max<<>>(aInput, aOutput, aSize); + cudaDeviceSynchronize(); +} + +__global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount) +{ + int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + int dataIndex = (int)aValid[threadIndex]; + if(threadIndex < aOutputColumnCount) + { + for(int i=0; i < aOutputRowCount; ++i) + { + aOutput[threadIndex * aOutputRowCount + i] = aData[dataIndex * aOutputRowCount + i]; + } + } +} + +// __global__ void validSubKernel(const double* aValid, double* aOutput, unsigned int* aCount, int aValidSize) +// { +// int index = blockIdx.x * blockDim.x + threadIdx.x; +// if(index == 0) +// { +// for(int i=0;i>>(aData.getData(), validProcessedDevice, result, rowCount, validColumnCount); + cudaDeviceSynchronize(); + + cudaFree(validProcessedDevice); + delete[] hostValid; + delete[] validProcessed; + return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount); +} + +texture tex; +cudaArray* array; + +__global__ void testKernel(float* aData,cudaTextureObject_t aTexObj, cudaSurfaceObject_t aSurface) +{ + float a = tex2D(tex,5.5,5.5); + float b = tex2D(aTexObj,5.5,5.5); + float2 c = tex2D(aSurface,1,1); + printf("%f\n",a); + printf("%f\n",b); + printf("%f\n",c.x); + printf("%f\n",c.y); +} + +__global__ void writeSurfaceKernel( cudaSurfaceObject_t aSurface) +{ + float2 value; + value.x = 100; + value.y = 99; + surf2Dwrite(value, aSurface, 1, 1 ); + +} + +void subTest(cudaTextureObject_t& aTexture) +{ + cudaResourceDesc resourceDesc; + cudaTextureDesc textureDesc; + memset(&resourceDesc, 0, sizeof(resourceDesc)); + resourceDesc.resType = cudaResourceTypeArray; + resourceDesc.res.array.array = array; // 指向设备端的 CUDA 数组 + +// 在 textureDesc 中设置纹理描述 + memset(&textureDesc, 0, sizeof(textureDesc)); + textureDesc.addressMode[0] = cudaAddressModeClamp; + textureDesc.addressMode[1] = cudaAddressModeClamp; + textureDesc.filterMode = cudaFilterModeLinear; + textureDesc.readMode = cudaReadModeElementType; + textureDesc.normalizedCoords = false; + //textureDesc.channelDesc = texChannelDescSpeedOfSoundField; + cudaCreateTextureObject(&aTexture, &resourceDesc, &textureDesc, nullptr); +} + +void Aurora::test(float* aData) +{ + tex.addressMode[0] = cudaAddressModeClamp; // Texturreferenz beschreiben + tex.addressMode[1] = cudaAddressModeClamp; + tex.filterMode = cudaFilterModeLinear; + tex.normalized = 0; + cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + cudaMallocArray(&array, &texChannelDescSpeedOfSoundField, 10, 9); + cudaMemcpyToArray(array, 0, 0, aData,10 * 9 *sizeof(float), cudaMemcpyHostToDevice); + cudaBindTextureToArray ( &tex, array, &texChannelDescSpeedOfSoundField ); + + cudaTextureObject_t textureObj; + subTest(textureObj); + + + + struct cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypeArray; + // Create the surface objects + resDesc.res.array.array = array; + cudaSurfaceObject_t inputSurfObj = 0; + cudaCreateSurfaceObject(&inputSurfObj, &resDesc); + + writeSurfaceKernel<<<1,1>>>(inputSurfObj); + cudaDeviceSynchronize(); + testKernel<<<1, 1>>>(aData,textureObj, inputSurfObj); + + + cudaDeviceSynchronize(); + cudaUnbindTexture(&tex); +} + +void Aurora::sort(const Aurora::Matrix& aMatrix) +{ + RECON_INFO("cuda start"); + thrust::sort(thrust::device, aMatrix.getData(), aMatrix.getData()+aMatrix.getDataSize(), thrust::greater()); + RECON_INFO("cuda end"); +} + + diff --git a/src/Aurora.h b/src/Aurora.h new file mode 100644 index 0000000..e6de35b --- /dev/null +++ b/src/Aurora.h @@ -0,0 +1,24 @@ +#ifndef SUM_MATRIX_CU_H +#define SUM_MATRIX_CU_H +#include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cufft.h" +#include + +#include "Matrix.h" +namespace Aurora +{ + //__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size); + void doubleToComplex(const double* input, cufftDoubleComplex* output, int size); + + //__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize); + void max(const float* aInput, const float* aOutput, int aSize); + + Aurora::CudaMatrix valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid); + void test(float* aData); + + void sort(const Aurora::Matrix& aMatrix); + + //Aurora::CudaMatrix getTransmissionDataSubFunction(const Aurora::CudaMatrix& aFxMatrix, const Aurora::CudaMatrix& aFhMatrix); +} + + +#endif \ No newline at end of file diff --git a/src/common/convertfp16tofloat.cpp b/src/common/convertfp16tofloat.cpp index 62d5fed..39123c8 100644 --- a/src/common/convertfp16tofloat.cpp +++ b/src/common/convertfp16tofloat.cpp @@ -5,98 +5,103 @@ #include #include namespace { - const ushort CONVERT_AND_VALUE = 15; - // andblack - const __m128i andBlock = _mm_set_epi16(15, 15, 15, 15, 15, 15, 15, 15); - const __m128i andBlock2 = - _mm_set_epi16(2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047); - const __m128i zeroBlock = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); - const __m128i oneBlock = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); - const __m128i twokBlock = - _mm_set_epi16(2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048); - const uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; - void convert(short * ptr, float* des,bool single = false){ - // 初始化值 - auto value = _mm_set_epi16(ptr[0], ptr[1], ptr[2], ptr[3], single?ptr[0]:ptr[4], single?ptr[0]:ptr[5], - single?ptr[0]:ptr[6], single?ptr[0]:ptr[7]); - auto uvalue = _mm_set_epi16( - (ushort)ptr[0], (ushort)ptr[1], (ushort)ptr[2], (ushort)ptr[3], - (ushort)(single?ptr[0]:ptr[4]), (ushort)(single?ptr[0]:ptr[5]), - (ushort)(single?ptr[0]:ptr[6]), (ushort)(single?ptr[0]:ptr[7])); - // 位移 - auto sign_bit = _mm_srli_epi16(value, 15); // 右移16位取符号位 - auto exponent = _mm_srli_epi16(uvalue, 11); - // and - exponent = _mm_and_si128(exponent, andBlock); - // and ,then convert to int 32 bits - auto fraction3 = _mm256_cvtepi16_epi32(_mm_and_si128(uvalue, andBlock2)); - auto hidden_bit_mask = - (_mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ) & - _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_EQ)) | - (_mm_cmp_epi16_mask(sign_bit, zeroBlock, _MM_CMPINT_EQ) & - _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_NE)); - auto hidden_bit16 = _mm_maskz_set1_epi16(hidden_bit_mask, 2048); - auto hidden_bit32 = _mm256_cvtepi16_epi32(hidden_bit16); - auto outputBlock = _mm256_add_epi32(fraction3, hidden_bit32); - auto sign_bit_add_value = _mm256_maskz_set1_epi32( - _mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ), - CONVERT_ADD_VALUE); - outputBlock = _mm256_add_epi32(outputBlock, sign_bit_add_value); - auto exponent_mask = - _mm_cmp_epi16_mask(oneBlock, exponent, _MM_CMPINT_LT); - exponent = _mm_sub_epi16(exponent, oneBlock); - auto exponent32 = _mm256_cvtepi16_epi32(exponent); - auto zeroBlock32 = _mm256_cvtepi16_epi32(zeroBlock); - auto offsetCount = - _mm256_mask_blend_epi32(exponent_mask, zeroBlock32, exponent32); - outputBlock = _mm256_sllv_epi32(outputBlock, offsetCount); + // const ushort CONVERT_AND_VALUE = 15; + // // andblack + // const __m128i andBlock = _mm_set_epi16(15, 15, 15, 15, 15, 15, 15, 15); + // const __m128i andBlock2 = + // _mm_set_epi16(2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047); + // const __m128i zeroBlock = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, 0); + // const __m128i oneBlock = _mm_set_epi16(1, 1, 1, 1, 1, 1, 1, 1); + // const __m128i twokBlock = + // _mm_set_epi16(2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048); + // const uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; + // void convert(short * ptr, double* des,bool single = false){ + // // 初始化值 + // auto value = _mm_set_epi16(ptr[0], ptr[1], ptr[2], ptr[3], single?ptr[0]:ptr[4], single?ptr[0]:ptr[5], + // single?ptr[0]:ptr[6], single?ptr[0]:ptr[7]); + // auto uvalue = _mm_set_epi16( + // (ushort)ptr[0], (ushort)ptr[1], (ushort)ptr[2], (ushort)ptr[3], + // (ushort)(single?ptr[0]:ptr[4]), (ushort)(single?ptr[0]:ptr[5]), + // (ushort)(single?ptr[0]:ptr[6]), (ushort)(single?ptr[0]:ptr[7])); + // // 位移 + // auto sign_bit = _mm_srli_epi16(value, 15); // 右移16位取符号位 + // auto exponent = _mm_srli_epi16(uvalue, 11); + // // and + // exponent = _mm_and_si128(exponent, andBlock); + // // and ,then convert to int 32 bits + // auto fraction3 = _mm256_cvtepi16_epi32(_mm_and_si128(uvalue, andBlock2)); + // auto hidden_bit_mask = + // (_mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ) & + // _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_EQ)) | + // (_mm_cmp_epi16_mask(sign_bit, zeroBlock, _MM_CMPINT_EQ) & + // _mm_cmp_epi16_mask(exponent, zeroBlock, _MM_CMPINT_NE)); + // auto hidden_bit16 = _mm_maskz_set1_epi16(hidden_bit_mask, 2048); + // auto hidden_bit32 = _mm256_cvtepi16_epi32(hidden_bit16); + // auto outputBlock = _mm256_add_epi32(fraction3, hidden_bit32); + // auto sign_bit_add_value = _mm256_maskz_set1_epi32( + // _mm_cmp_epi16_mask(sign_bit, oneBlock, _MM_CMPINT_EQ), + // CONVERT_ADD_VALUE); + // outputBlock = _mm256_add_epi32(outputBlock, sign_bit_add_value); + // auto exponent_mask = + // _mm_cmp_epi16_mask(oneBlock, exponent, _MM_CMPINT_LT); + // exponent = _mm_sub_epi16(exponent, oneBlock); + // auto exponent32 = _mm256_cvtepi16_epi32(exponent); + // auto zeroBlock32 = _mm256_cvtepi16_epi32(zeroBlock); + // auto offsetCount = + // _mm256_mask_blend_epi32(exponent_mask, zeroBlock32, exponent32); + + + // outputBlock = _mm256_sllv_epi32(outputBlock, offsetCount); - des[3] = _mm256_extract_epi32(outputBlock, 4); - des[2] = _mm256_extract_epi32(outputBlock, 5); - des[1] = _mm256_extract_epi32(outputBlock, 6); - des[0] = _mm256_extract_epi32(outputBlock, 7); - if(single) return; - des[7] = _mm256_extract_epi32(outputBlock, 0); - des[6] = _mm256_extract_epi32(outputBlock, 1); - des[5] = _mm256_extract_epi32(outputBlock, 2); - des[4] = _mm256_extract_epi32(outputBlock, 3); + // des[3] = _mm256_extract_epi32(outputBlock, 4); + // des[2] = _mm256_extract_epi32(outputBlock, 5); + // des[1] = _mm256_extract_epi32(outputBlock, 6); + // des[0] = _mm256_extract_epi32(outputBlock, 7); + // if(single) return; + // des[7] = _mm256_extract_epi32(outputBlock, 0); + // des[6] = _mm256_extract_epi32(outputBlock, 1); + // des[5] = _mm256_extract_epi32(outputBlock, 2); + // des[4] = _mm256_extract_epi32(outputBlock, 3); - } + // } } Aurora::Matrix Recon::convertfp16tofloat(Aurora::Matrix aMatrix) { - auto input = aMatrix.getData(); - // uint16变换为float(32位)输出大小翻倍 - auto output = Aurora::malloc(aMatrix.getDataSize() * 4); - size_t rows = aMatrix.getDataSize() * sizeof(float) / sizeof(short); - size_t total_count = aMatrix.getDataSize(); - #pragma omp parallel for - for (size_t i = 0; i < total_count; i += 8) { - // 循环展开以避免过度的线程调用 - if (i < total_count) { - auto ptr = (short *)(input + i); - float *des = output + i * 4; - ::convert(ptr, des,i+1>total_count); - } - if (i+2 < total_count) { - auto ptr = (short *)(input + i + 2); - float *des = output + (i+2) * 4; - ::convert(ptr, des,i+3>total_count); - } - if (i+4 < total_count) { - auto ptr = (short *)(input + i + 4); - float *des = output + (i+4) * 4; - ::convert(ptr, des,i+5>total_count); - } - if (i+6 < total_count) { - auto ptr = (short *)(input + i + 6); - float *des = output + (i+6) * 4; - ::convert(ptr, des,i+7>total_count); - } - } - return Aurora::Matrix::New(output, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2)); + // auto input = aMatrix.getData(); + // // uint16变换为float(32位)输出大小翻倍 + // auto output = Aurora::malloc(aMatrix.getDataSize() * 4); + // size_t rows = aMatrix.getDataSize() * sizeof(double) / sizeof(short); + // size_t total_count = aMatrix.getDataSize(); + + + // #pragma omp parallel for + // for (size_t i = 0; i < total_count; i += 8) { + // // 循环展开以避免过度的线程调用 + // if (i < total_count) { + // auto ptr = (short *)(input + i); + // double *des = output + i * 4; + // ::convert(ptr, des,i+1>total_count); + // } + // if (i+2 < total_count) { + // auto ptr = (short *)(input + i + 2); + // double *des = output + (i+2) * 4; + // ::convert(ptr, des,i+3>total_count); + // } + // if (i+4 < total_count) { + // auto ptr = (short *)(input + i + 4); + // double *des = output + (i+4) * 4; + // ::convert(ptr, des,i+5>total_count); + // } + // if (i+6 < total_count) { + // auto ptr = (short *)(input + i + 6); + // double *des = output + (i+6) * 4; + // ::convert(ptr, des,i+7>total_count); + // } + // } + // return Aurora::Matrix::New(output, aMatrix.getDimSize(0), + // aMatrix.getDimSize(1), aMatrix.getDimSize(2)); + } \ No newline at end of file diff --git a/src/common/dataBlockCreation/getAScanBlockPreprocessed.cpp b/src/common/dataBlockCreation/getAScanBlockPreprocessed.cpp index 4f37937..f8bfb18 100644 --- a/src/common/dataBlockCreation/getAScanBlockPreprocessed.cpp +++ b/src/common/dataBlockCreation/getAScanBlockPreprocessed.cpp @@ -1,5 +1,6 @@ #include "getAScanBlockPreprocessed.h" +#include "CudaMatrix.h" #include "Matrix.h" #include "blockingGeometryInfo.h" #include "removeDataFromArrays.h" @@ -10,15 +11,36 @@ #include "src/transmissionReconstruction/dataFilter/dataFilter.h" #include "src/reflectionReconstruction/dataFilter.h" +#include "Aurora.h" + using namespace Aurora; using namespace Recon; +#include +#include +void printTime() +{ + struct timeval tpend; + gettimeofday(&tpend,NULL); + int secofday = (tpend.tv_sec + 3600 * 8 ) % 86400; + int hours = secofday / 3600; + int minutes = (secofday - hours * 3600 ) / 60; + int seconds = secofday % 60; + int milliseconds = tpend.tv_usec/1000; + std::cout<< hours << ":" < 0) @@ -62,6 +88,72 @@ AscanBlockPreprocessed Recon::getAscanBlockPreprocessed(Parser* aParser, const A { result.ascanBlockPreprocessed = ascanBlock.ascanBlock; } - +//printTime(); return result; -} \ No newline at end of file +} + + +AscanBlockPreprocessedCuda Recon::getAscanBlockPreprocessedCuda(Parser* aParser, const Aurora::Matrix& aMp, const Aurora::Matrix& aSl, const Aurora::Matrix& aSn, + const Aurora::Matrix& aRl, const Aurora::Matrix& aRn, GeometryInfo& aGeom, const MeasurementInfo& aMeasInfo, + bool aApplyFilter, bool aTransReco) +{ +//std::cout<<"strart"< 0) + { + result.ascanBlockPreprocessed = preprocessAscanBlockCuda(result.ascanBlockPreprocessed, aMeasInfo); + } + // else + // { + // result.ascanBlockPreprocessed = ascanBlock.ascanBlock; + // } +//printTime(); +//std::cout<<"end"<getOneTasAscanDataOfMotorPosition(aSl[slIndex]); - + #pragma omp parallel for for(int snIndex=0; snIndex Aurora::Matrix Recon::preprocessAscanBlock(const Aurora::Matrix& aAscans, const MeasurementInfo& aMeasInfo) @@ -20,4 +21,14 @@ Aurora::Matrix Recon::preprocessAscanBlock(const Aurora::Matrix& aAscans, const // end return result; +} + +Aurora::CudaMatrix Recon::preprocessAscanBlockCuda(const Aurora::CudaMatrix& aAscans, const MeasurementInfo& aMeasInfo) +{ + if(aMeasInfo.ascanDataType == "float16") + { + return Aurora::convertfp16tofloatCuda(aAscans, aAscans.getDimSize(0), aAscans.getDimSize(1)); + } + + return aAscans; } \ No newline at end of file diff --git a/src/common/preprocessAscanBlock.h b/src/common/preprocessAscanBlock.h index 961ffa0..30ed66c 100644 --- a/src/common/preprocessAscanBlock.h +++ b/src/common/preprocessAscanBlock.h @@ -7,6 +7,8 @@ namespace Recon { Aurora::Matrix preprocessAscanBlock(const Aurora::Matrix& aAscans, const MeasurementInfo& aMeasInfo); + + Aurora::CudaMatrix preprocessAscanBlockCuda(const Aurora::CudaMatrix& aAscans, const MeasurementInfo& aMeasInfo); } #endif \ No newline at end of file diff --git a/src/config/config.cpp b/src/config/config.cpp index be71b83..dd89991 100644 --- a/src/config/config.cpp +++ b/src/config/config.cpp @@ -393,23 +393,23 @@ namespace Recon nlohmann::json dataSelection = params.at("dataSelection"); if(dataSelection.contains("senderTasList")) { - reflectParams::senderTasList = readToMatrix(dataSelection.at("senderTasList"), false); + transParams::senderTasList = readToMatrix(dataSelection.at("senderTasList"), false); } if(dataSelection.contains("receiverTasList")) { - reflectParams::receiverTasList = readToMatrix(dataSelection.at("receiverTasList"), false); + transParams::receiverTasList = readToMatrix(dataSelection.at("receiverTasList"), false); } if(dataSelection.contains("senderElementList")) { - reflectParams::senderElementList = readToMatrix(dataSelection.at("senderElementList"), false); + transParams::senderElementList = readToMatrix(dataSelection.at("senderElementList"), false); } if(dataSelection.contains("receiverElementList")) { - reflectParams::receiverElementList = readToMatrix(dataSelection.at("receiverElementList"), false); + transParams::receiverElementList = readToMatrix(dataSelection.at("receiverElementList"), false); } if(dataSelection.contains("motorPos")) { - reflectParams::motorPos = readToMatrix(dataSelection.at("motorPos"), false); + transParams::motorPos = readToMatrix(dataSelection.at("motorPos"), false); } if(dataSelection.contains("filterSensitivity")) { diff --git a/src/main.cxx b/src/main.cxx index 52fc52e..f7bdfa0 100644 --- a/src/main.cxx +++ b/src/main.cxx @@ -1,6 +1,23 @@ + +#include "AuroraDefs.h" +#include "CudaMatrix.h" #include "Parser.h" #include "config/config.h" #include "log/notify.h" + +#include "Data/ElementIndex.h" +#include "Matrix.h" +#include "Parser.h" +#include "ShotList/ShotList.h" +#include "config/config.h" +#include +#include +#include + +#include +#include +#include +#include #include #define EIGEN_USE_MKL_ALL @@ -12,15 +29,71 @@ 2 is output path. 3 is config file path. */ +#include "Data/OneTasAScanData.h" +#include "Data/AScanData.h" +#include "Data/TasElementIndex.h" +#include "Data/ElectricIndex.h" +#include "Data/PatientData.h" +#include +#include "Function.h" +#include "Function1D.h" +#include "Function2D.h" +#include "Function1D.cuh" +#include "Function2D.cuh" +#include "MatlabWriter.h" +#include "MatlabReader.h" +#include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cufft.h" +#include +#include "Aurora.h" +#include "common/dataBlockCreation/removeDataFromArrays.h" +#include + +#include +#include +#include + + +#include +#include +#include +#include "reflectionReconstruction/preprocessData/imageExtrapolation.h" +#include +#include "transmissionReconstruction/detection/getTransmissionData.cuh" + + std::mutex mutex; + std::condition_variable condition; + std::vector vector; + + int main(int argc, char *argv[]) { - int argNum = 4; + // Aurora::CudaMatrix fxDevice = Aurora::Matrix::fromRawData(new float[10]{1,2,3,4,5,6,7,8,9,10}, 5,1,1,Aurora::Complex).toDeviceMatrix(); + // Aurora::CudaMatrix fhDevice = Aurora::Matrix::fromRawData(new float[10]{2,3,4,3,2,5,2,1,2,4}, 5,1,1,Aurora::Complex).toDeviceMatrix(); + // Aurora::CudaMatrix fxReal = Aurora::real(fxDevice); + // Aurora::CudaMatrix fhReal = Aurora::real(fhDevice); + // Aurora::CudaMatrix fxImag = Aurora::imag(fxDevice); + // Aurora::CudaMatrix fhImag = Aurora::imag(fhDevice); + // Aurora::CudaMatrix real = fxReal *fhReal + fxImag * fhImag; + // Aurora::CudaMatrix image = fxImag * fhReal - fxReal * fhImag; + // Aurora::Matrix result = Aurora::complex(real, image).toHostMatrix(); + // result.printf(); + // result = getTransmissionDataSubFunction(fxDevice, fhDevice).toHostMatrix(); + // result.printf(); + // return 0; + + + int argNum = 4; std::vector args(argNum); - args[0] = "/home/sun/20230418T141000/"; - args[1] = "/home/sun/20230418T145123/"; + //args[0] = "/DataCenter/科正测试/00e04b741e9f_20231130T091019"; + //args[1] = "/DataCenter/科正测试/00e04b741e9f_20231130T091019"; + args[0] = "/home/sun/20230418T145123/"; + args[1] = "/home/sun/20230418T141000/"; + //args[0] = "/home/krad/TestStore/00e04b741e9f_20231123T153138/"; + //args[1] = "/home/krad/TestStore/00e04b741e9f_20231123T152045/"; args[2] = Recon::DEFAULT_OUTPUT_PATH; args[3] = Recon::DEFAULT_CONFIG_PATH; + // args[3] = "/home/UR/"; argc = argc <= argNum? argc-1 : argNum; for (int i = 0; i < argc; i++) { diff --git a/src/reflectionReconstruction/startReflectionReconstruction.cpp.orig b/src/reflectionReconstruction/startReflectionReconstruction.cpp.orig new file mode 100644 index 0000000..15b20ec --- /dev/null +++ b/src/reflectionReconstruction/startReflectionReconstruction.cpp.orig @@ -0,0 +1,137 @@ +#include "startReflectionReconstruction.h" +#include "Function.h" +#include "Function2D.h" +#include "Function3D.h" +#include "Matrix.h" +#include "MatlabWriter.h" + +#include "common/getGeometryInfo.h" +#include "common/precalculateChannelList.h" +#include "common/dataBlockCreation/getAScanBlockPreprocessed.h" +#include "common/dataBlockCreation/removeDataFromArrays.h" +#include "log/notify.h" +#include "reflectionReconstruction/preprocessData/determineOptimalPulse.h" +#include "reflectionReconstruction/reconstructionSAFT/reconstructionSAFT.h" +#include "src/reflectionReconstruction/preprocessData/preprocessAScanBlockForReflection.h" +#include "config/config.h" +#include "log/log.h" + +#include "CudaEnvInit.h" + +#include +#include +#include +#include +#include + +using namespace Aurora; +using namespace Recon; + +namespace +{ + std::queue PRODUCER_PROCESSDATAS; + std::queue PRODUCER_BLOCKDATAS; + std::mutex PRODUCER_MUTEX; + std::condition_variable PRODUCER_CONDITION; + std::mutex CUSTOMER_MUTEX; + std::condition_variable CUSTOMER_CONDITION; +} + +void producerThread( Parser* aParser, const Aurora::Matrix& aMotorPos, + const Aurora::Matrix& aSlList, const Aurora::Matrix& aSnList, + const Aurora::Matrix& aRlList, const Aurora::Matrix& aRnList, + GeometryInfo& aGeom, MeasurementInfo& aExpInfo, PreComputes& aPreComputes) +{ + if(reflectParams::useOptPulse==1 && reflectParams::runReflectionReco) + { + aPreComputes.sincPeak_ft = determineOptimalPulse(aPreComputes.timeInterval, aExpInfo.expectedAScanLength); + } + printf(" - channel list"); + auto channelList = precalculateChannelList(aRlList, aRnList, aExpInfo, aPreComputes); + size_t numScans = aMotorPos.getDataSize() * aSlList.getDataSize() * + aSnList.getDataSize() * aRlList.getDataSize() * + aRnList.getDataSize(); + int numTakenScans = 0,numProcessedScans = 0,numPossibleScans = 0; + for(int i=0; i +#include +#include +#include "Function1D.cuh" +#include "Function2D.cuh" +#include "Function3D.h" + +__global__ void copyMatrixKernel(float* aStartPos1, float* aEndPos1, + float* aStartPos2, float* aEndPos2, + float* aIn1, float* aOut1, + float* aIn2, float* aOut2, + unsigned int aColElements){ + unsigned int i = blockIdx.x; + unsigned int base_offset = i * aColElements; + unsigned int j = aStartPos1[i]-1+threadIdx.x; + for (int offset = 0; j+offset>>(startPos.getData(),endPos.getData(), + startPosRef.getData(), endPosRef.getData(), + ascans.getData(), ascans2.getData(), + ascansRef.getData(), ascansRef2.getData(), + ascans2.getDimSize(0)); + + auto pulseEnergy = Aurora::sum(ascans2^2); + auto pulseEnergyEmpty = Aurora::sum(ascansRef2^2); + + return Aurora::log(pulseEnergyEmpty/pulseEnergy); +} + +CudaMatrix +Recon::detectAttVectorizedCuda(const CudaMatrix &Ascan, const CudaMatrix &AscanRef, + const CudaMatrix &distRef, + const CudaMatrix &sosWaterRef, + const CudaMatrix &tof, int aScanReconstructionFrequency, + float offsetElectronic, int detectionWindowATT) { + auto sizeAscan = size(Ascan); + + auto sampleRate = aScanReconstructionFrequency; + float offsetElectronicSamples = offsetElectronic * sampleRate; + + auto envelopeOfAScan = Aurora::abs(Aurora::hilbert(Ascan)); + auto envelopeOfReferenceAScan = Aurora::abs(hilbert(AscanRef)); + auto tof2 = (distRef / sosWaterRef) * sampleRate + offsetElectronicSamples; + + auto startPos = zerosCuda(Ascan.getDimSize(1), 1); + auto endPos = zerosCuda(Ascan.getDimSize(1), 1); + auto startPosRef = zerosCuda(Ascan.getDimSize(1), 1); + auto endPosRef = zerosCuda(Ascan.getDimSize(1), 1); + + setPosKernel<<>>(startPos.getData(), + endPos.getData(), startPosRef.getData(), endPosRef.getData(), + tof.getData(), tof2.getData(), sizeAscan.getData(), sampleRate, + offsetElectronicSamples,detectionWindowATT); + return calculateAttenuationCuda(envelopeOfAScan, startPos, endPos, + envelopeOfReferenceAScan, startPosRef, + endPosRef); +} + +CudaMatrix Recon::calculateSOSOffset(const CudaMatrix &sosBlock, float referenceSOS, const CudaMatrix &distBlock, float sampleRate){ + auto offset = (distBlock / sosBlock - distBlock / referenceSOS) * sampleRate; + return offset; +} + +Recon::SearchPositionC Recon::calculateStarEndSearchPosition(const CudaMatrix &aVDistBlock, + float minSpeedOfSound, float maxSpeedOfSound, + float sampleRate, float maxSample, + const CudaMatrix &aVSosOffsetBlock, + float startOffset, float segmentLenOffset) +{ + auto startSearch = floor((aVDistBlock / maxSpeedOfSound)*sampleRate+aVSosOffsetBlock+startOffset); + auto lambda = [=] __device__ (const float& v){ + float a = (uint)(v); + if(a<1) return 1.0f; + else if(a>maxSample) return maxSample; + return a; + }; + thrust::transform(thrust::device, startSearch.getData(),startSearch.getData()+startSearch.getDataSize(), + startSearch.getData(),lambda); + + auto endSearch = ceil(aVDistBlock/minSpeedOfSound*sampleRate+aVSosOffsetBlock+startOffset+segmentLenOffset); + + thrust::transform(thrust::device, endSearch.getData(),endSearch.getData()+endSearch.getDataSize(), + endSearch.getData(),lambda); + Recon::SearchPositionC result; + result.startSearch = startSearch; + result.endSearch = endSearch; + return result; +} + +__global__ void copyMatrixKernel(float* aStartPos1, float* aEndPos1, + float* aIn1, float* aOut1, + unsigned int aColElements){ + unsigned int i = blockIdx.x; + unsigned int base_offset = i * aColElements; + unsigned int j = aStartPos1[i]-1+threadIdx.x; + for (int offset = 0; j+offset=begin && threadAccIdx>>(calcResult.startSearch.getData(), + calcResult.endSearch.getData(), AscanBlock.getData(), AscanBlockProcessed.getData(), + expectedPosWater.getData(), AscanBlock.getDimSize(0)); + } + else{ + copyMatrixKernel<<>>(calcResult.startSearch.getData(), + calcResult.endSearch.getData(),AscanBlock.getData(),AscanBlockProcessed.getData(), + AscanBlock.getDimSize(0)); + } + Recon::TimeWindowResultC result; + result.startSearch = calcResult.startSearch; + result.AscanBlockProcessed = AscanBlockProcessed; + return result; + } + +__device__ struct KVPair{ + __device__ KVPair(){} + __device__ KVPair(int aIndex,float aValue):index(aIndex),value(aValue){}; + __device__ KVPair(const KVPair& other){ + index = other.index; + value = other.value; + } + int index; + float value; +}; + +__device__ KVPair maxKV(const KVPair& x, const KVPair& y){ + return x.value>y.value ? x:y; +}; + +__global__ void findMaxIndexKernel(float* aInputData, unsigned int aColSize,float* aOutput, int aMaxlag){ + //确定每个thread的index + unsigned int idx = blockIdx.x * aColSize + threadIdx.x; + __shared__ KVPair shared_data[256]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? KVPair(threadIdx.x,aInputData[idx]) : KVPair(0,-FLT_MAX); + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset0; i>>=1) { + + if (threadIdx.x < i) { + shared_data[threadIdx.x] = maxKV(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0].index - aMaxlag; + } +} +int nextpow2(unsigned int value){ + unsigned int compareValue = 2; + int result = 1; + while(compareValue<=value){ + compareValue=compareValue<<1; + result++; + } + return result; +} +Recon::DetectResultC Recon::detectTofAndAtt( + const Aurora::CudaMatrix &AscanBlock, const Aurora::CudaMatrix &AscanRefBlock, + const Aurora::CudaMatrix &distBlock, const Aurora::CudaMatrix &distRefBlock, + const Aurora::CudaMatrix &sosWaterBlock, + const Aurora::CudaMatrix &sosWaterRefBlock, + int resampleFactor,int nthreads, float expectedSOSWater, + int useTimeWindowing, int aScanReconstructionFrequency,int detectionWindowATT, + float offsetElectronic, int detectionWindowSOS, float minSpeedOfSound, + float maxSpeedOfSound, bool gaussWindow) + { + auto sampleRate = aScanReconstructionFrequency; + float offsetElectronicSamples = offsetElectronic * sampleRate; + CudaMatrix diffStartSearch; + Recon::TimeWindowResultC timeResult1; + timeResult1.AscanBlockProcessed = AscanBlock; + Recon::TimeWindowResultC timeResult2; + timeResult2.AscanBlockProcessed = AscanRefBlock; + if (useTimeWindowing == 1) { + timeResult1 = applyTimeWindowing( + AscanBlock, sampleRate, distBlock, sosWaterBlock, + expectedSOSWater, offsetElectronicSamples, detectionWindowSOS, + minSpeedOfSound, maxSpeedOfSound, gaussWindow); + timeResult2 = applyTimeWindowing( + AscanRefBlock, sampleRate, distRefBlock, sosWaterRefBlock, + expectedSOSWater, offsetElectronicSamples, detectionWindowSOS, + minSpeedOfSound, maxSpeedOfSound, gaussWindow); + + diffStartSearch = timeResult1.startSearch - timeResult2.startSearch; + } + auto _AscanBlock = timeResult1.AscanBlockProcessed; + auto _AscanRefBlock = timeResult2.AscanBlockProcessed; + int maxlag = 0; + CudaMatrix c, c1; + { + auto m = std::max(size(_AscanBlock, 1), size(_AscanRefBlock, 1)); + maxlag = + std::max(size(_AscanBlock, 1), size(_AscanRefBlock, 1)) - 1; + auto mxl = std::min(maxlag, m - 1); + auto ceilLog2 = nextpow2(2 * m - 1); + auto m2 = pow(2, ceilLog2); + { + CudaMatrix c_1_2; + { + CudaMatrix c_1_1; + { + auto x = fft(_AscanBlock, m2); + auto y = fft(_AscanRefBlock, m2); + c_1_1 = x * conj(y); + } + c_1_2 = ifft(c_1_1); + } + c1 = real(c_1_2); + } + c = zerosCuda(mxl + mxl + 1, c1.getDimSize(1)); + c.setBlock(0, 0, mxl-1, c1.block(0, m2-mxl,m2-1)); + c.setBlock(0, mxl, mxl*2, c1.block(0, 0, mxl)); + } + auto shiftInSamples = zerosCuda(1, c1.getDimSize(1)); + findMaxIndexKernel<<>>(c.getData(),c.getDimSize(0),shiftInSamples.getData(),maxlag); + if (useTimeWindowing) { + shiftInSamples = shiftInSamples - diffStartSearch; + } + auto tof = shiftInSamples / sampleRate + distBlock / sosWaterBlock; + auto sosValue = distBlock / tof; + Recon::DetectResultC result; + result.sosValue = sosValue; + auto tofRel = tof - distBlock / sosWaterBlock; + result.att = detectAttVectorizedCuda( + _AscanBlock, _AscanRefBlock, distRefBlock, sosWaterRefBlock, + tof, aScanReconstructionFrequency, offsetElectronic, + detectionWindowATT); + result.tof = tofRel; + return result; + } \ No newline at end of file diff --git a/src/transmissionReconstruction/detection/detection.cuh b/src/transmissionReconstruction/detection/detection.cuh new file mode 100644 index 0000000..0767e9e --- /dev/null +++ b/src/transmissionReconstruction/detection/detection.cuh @@ -0,0 +1,53 @@ +#ifndef __DETECTION_CUH__ +#define __DETECTION_CUH__ +#include "CudaMatrix.h" +using namespace Aurora; +namespace Recon { +struct SearchPositionC { + Aurora::CudaMatrix startSearch; + Aurora::CudaMatrix endSearch; +}; +struct DetectResultC { + Aurora::CudaMatrix tof; + Aurora::CudaMatrix sosValue; + Aurora::CudaMatrix att; +}; +struct TimeWindowResultC { + Aurora::CudaMatrix startSearch; + Aurora::CudaMatrix AscanBlockProcessed; +}; + CudaMatrix calculateAttenuationCuda(const CudaMatrix &ascans, + const CudaMatrix &startPos, + const CudaMatrix &endPos, + const CudaMatrix &ascansRef, + const CudaMatrix &startPosRef, + const CudaMatrix &endPosRef); + CudaMatrix + detectAttVectorizedCuda(const CudaMatrix &Ascan, const CudaMatrix &AscanRef, + const CudaMatrix &distRef, + const CudaMatrix &sosWaterRef, + const CudaMatrix &tof, int aScanReconstructionFrequency, + float offsetElectronic, int detectionWindowATT); + + TimeWindowResultC applyTimeWindowing(const Aurora::CudaMatrix &AscanBlock, float sampleRate, + const Aurora::CudaMatrix &distBlock, const Aurora::CudaMatrix &sosBlock, + float expectedSOSWater, float startOffset, float segmentLenOffset, + float minSpeedOfSound, float maxSpeedOfSound, bool gaussWindow); + SearchPositionC calculateStarEndSearchPosition(const CudaMatrix &aVDistBlock, + float minSpeedOfSound, float maxSpeedOfSound, + float sampleRate, float maxSample, + const CudaMatrix &aVSosOffsetBlock, + float startOffset, float segmentLenOffset); + CudaMatrix calculateSOSOffset(const CudaMatrix &sosBlock, float referenceSOS, const CudaMatrix &distBlock, float sampleRate); + DetectResultC detectTofAndAtt( + const Aurora::CudaMatrix &AscanBlock, const Aurora::CudaMatrix &AscanRefBlock, + const Aurora::CudaMatrix &distBlock, const Aurora::CudaMatrix &distRefBlock, + const Aurora::CudaMatrix &sosWaterBlock, + const Aurora::CudaMatrix &sosWaterRefBlock, + int resampleFactor,int nthreads, float expectedSOSWater, + int useTimeWindowing, int aScanReconstructionFrequency,int detectionWindowATT, + float offsetElectronic, int detectionWindowSOS, float minSpeedOfSound, + float maxSpeedOfSound, bool gaussWindow); +}; + +#endif // __DETECTION_H__ \ No newline at end of file diff --git a/src/transmissionReconstruction/detection/detection.h b/src/transmissionReconstruction/detection/detection.h index b30a360..76a5292 100644 --- a/src/transmissionReconstruction/detection/detection.h +++ b/src/transmissionReconstruction/detection/detection.h @@ -75,8 +75,8 @@ detectTofAndAttMex( DetectResult transmissionDetection( - const Aurora::Matrix &AscanBlock, const Aurora::Matrix &AscanRefBlock, - const Aurora::Matrix &distBlock, const Aurora::Matrix &distRefBlock, + const Aurora::CudaMatrix &AscanBlock, const Aurora::CudaMatrix &AscanRefBlock, + const Aurora::CudaMatrix &distBlock, const Aurora::CudaMatrix &distRefBlock, const Aurora::Matrix &sosWaterBlock, const Aurora::Matrix &sosWaterRefBlock, float expectedSOSWater); } // namespace Recon diff --git a/src/transmissionReconstruction/detection/getTransmissionData.cpp b/src/transmissionReconstruction/detection/getTransmissionData.cpp index 5446a22..b78718e 100644 --- a/src/transmissionReconstruction/detection/getTransmissionData.cpp +++ b/src/transmissionReconstruction/detection/getTransmissionData.cpp @@ -1,4 +1,7 @@ #include "getTransmissionData.h" +#include "getTransmissionData.cuh" +#include "AuroraDefs.h" +#include "CudaMatrix.h" #include "Function.h" #include "Function1D.h" #include "Function2D.h" @@ -28,6 +31,11 @@ #include #include +#include +#include "Function1D.cuh" +#include "Function2D.cuh" +#include + using namespace Recon; using namespace Aurora; @@ -42,8 +50,8 @@ namespace Matrix waterTempBlock; MetaInfos metaInfos; - Matrix ascanBlock; - Matrix ascanBlockRef; + CudaMatrix ascanBlock; + CudaMatrix ascanBlockRef; Matrix dists; Matrix distRefBlock; Matrix waterTempRefBlock; @@ -56,28 +64,59 @@ namespace std::mutex PROCESS_BUFFER_MUTEX; std::condition_variable PROCESS_BUFFER_CONDITION; int BUFFER_COUNT = 0; - int BUFFER_SIZE = 3; + int BUFFER_SIZE = 4;//<=8 - Matrix prepareAScansForTransmissionDetection(const Matrix& aAscanBlock, const Matrix& aGainBlock) +void printTime() +{ + struct timeval tpend; + gettimeofday(&tpend,NULL); + int secofday = (tpend.tv_sec + 3600 * 8 ) % 86400; + int hours = secofday / 3600; + int minutes = (secofday - hours * 3600 ) / 60; + int seconds = secofday % 60; + int milliseconds = tpend.tv_usec/1000; + std::cout<< hours << ":" < lockBufferCount(CREATE_BUFFER_MUTEX); BLOCK_OF_TRANSIMISSIONDARA_BUFFER.erase(std::to_string(index)); @@ -371,7 +392,7 @@ TransmissionData Recon::getTransmissionData(const Aurora::Matrix& aMotorPos, con std::cout<<"Remove: "< +#include + +const int THREADS_PER_BLOCK = 256; + +__global__ void getTransmissionDataSubFunctionKernel(float* aFx, float* aFh, float* aOutput, unsigned int aSize) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aSize) + { + unsigned int realIndex = idx * 2; + unsigned int imagIndex = realIndex + 1; + float fxReal = aFx[realIndex]; + float fhReal = aFh[realIndex]; + float fxImag = aFx[imagIndex]; + float fhImag = aFh[imagIndex]; + aOutput[realIndex] = fxReal * fhReal + fxImag * fhImag; + aOutput[2*idx + 1] = fxImag * fhReal - fxReal * fhImag; + } +} + +Aurora::CudaMatrix getTransmissionDataSubFunction(const Aurora::CudaMatrix& aFxMatrix, const Aurora::CudaMatrix& aFhMatrix) +{ + size_t size = aFxMatrix.getDataSize(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size * Aurora::Complex); + int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + getTransmissionDataSubFunctionKernel<<>>(aFxMatrix.getData(), aFhMatrix.getData(), data, size); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aFxMatrix.getDimSize(0), aFxMatrix.getDimSize(1), aFxMatrix.getDimSize(2), Aurora::Complex); +} + +__global__ void createFhMatrixKernel(float* aMatchedFilter, float* aChannelListBlock, float* aOutput, unsigned int aRowSize) +{ + for(unsigned int i=0; i< std::ceil((float)aRowSize / (float)THREADS_PER_BLOCK); ++i) + { + unsigned int index = i * THREADS_PER_BLOCK + threadIdx.x; + if(index < aRowSize) + { + unsigned int outPutComplexIndex = 2 * (blockIdx.x * aRowSize + index); + unsigned int inputComplexIndex = 2 * ((aChannelListBlock[blockIdx.x] - 1) * aRowSize + index); + aOutput[outPutComplexIndex] = aMatchedFilter[inputComplexIndex]; + aOutput[outPutComplexIndex + 1] = aMatchedFilter[inputComplexIndex + 1]; + } + } +} + +Aurora::CudaMatrix createFhMatrix(const Aurora::CudaMatrix& aMatchedFilter, const Aurora::CudaMatrix& aChannelListBlock) +{ + size_t columnSize = aChannelListBlock.getDataSize(); + size_t rowSize = aMatchedFilter.getDimSize(0); + float* fh = nullptr; + cudaMalloc((void**)&fh, sizeof(float) * rowSize * columnSize * Aurora::Complex); + createFhMatrixKernel<<>>(aMatchedFilter.getData(), aChannelListBlock.getData(), fh, rowSize); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(fh, rowSize, columnSize, 1, Aurora::Complex); +} diff --git a/src/transmissionReconstruction/detection/getTransmissionData.cuh b/src/transmissionReconstruction/detection/getTransmissionData.cuh new file mode 100644 index 0000000..3e69a75 --- /dev/null +++ b/src/transmissionReconstruction/detection/getTransmissionData.cuh @@ -0,0 +1,10 @@ +#ifndef GETTRANSMISSIONDATA_CU_H +#define GETTRANSMISSIONDATA_CU_H + +#include "CudaMatrix.h" + +Aurora::CudaMatrix getTransmissionDataSubFunction(const Aurora::CudaMatrix& aFxMatrix, const Aurora::CudaMatrix& aFhMatrix); + +Aurora::CudaMatrix createFhMatrix(const Aurora::CudaMatrix& aMatchedFilter, const Aurora::CudaMatrix& aChannelListBlock); + +#endif \ No newline at end of file diff --git a/src/transmissionReconstruction/reconstruction/reconstruction.cpp b/src/transmissionReconstruction/reconstruction/reconstruction.cpp index 41207a7..1fee8d8 100644 --- a/src/transmissionReconstruction/reconstruction/reconstruction.cpp +++ b/src/transmissionReconstruction/reconstruction/reconstruction.cpp @@ -15,7 +15,12 @@ #include #include #include +#include +#include using namespace Aurora; +using solveParameterIteratorFunctionType = std::vector> (*)(Aurora::Sparse M, Aurora::Matrix &b, + const Aurora::Matrix &dims, bool oneIter, bool nonNeg, int aDevice); +using slownessToSOSFunctionType = Matrix (*)(Aurora::Matrix & aVF1, float aSOS_IN_WATER); namespace Recon { Aurora::Matrix calculateMinimalMaximalTransducerPositions( const Aurora::Matrix &aMSenderList, const Aurora::Matrix &aMReceiverList) { @@ -189,6 +194,7 @@ namespace Recon { BuildMatrixResult buildMatrixR; for(int iter=1; iter<=numIter; ++iter) { + //1200ms buildMatrixR = buildMatrix(senderList, receiverList, res, dims, bentRecon && (iter!=1), potentialMap); if(!data.isNull() && bentRecon && iter != numIter) { @@ -223,17 +229,51 @@ namespace Recon { allHitMaps.push_back(buildMatrixR.hitmap); } + std::promise sosPromise; + std::future sosFuture = sosPromise.get_future(); + + std::promise attPromise; + std::future attFuture = attPromise.get_future(); + solveParameterIteratorFunctionType solveParameterIteratorFunctionPointer = &solveParameterIterator; + slownessToSOSFunctionType slownessToSOSFunctionPointer = &slownessToSOS; + std::thread sosThread; + std::thread attThread; + if(!data.isNull()) { - Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0]; - result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ; + //1500ms + sosThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &b, &dims, &SOS_IN_WATER, &sosPromise]() + { + //auto bCopy = b.deepCopy(); + //auto dimCopy = dims.deepCopy(); + Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, b, dims, false, transParams::nonNeg, 0)[0][0]; + result = slownessToSOSFunctionPointer(result, SOS_IN_WATER); + sosPromise.set_value(result); + }); + //Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0]; + //1ms + //result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ; } if(!dataAtt.isNull()) { - Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0]; - result.outATT = attValue/100 ; + attThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &bAtt, &dims, &attPromise]() + { + //auto bAttCopy = bAtt.deepCopy(); + //auto dimCopy = dims.deepCopy(); + Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg, 1)[0][0]; + result = result / 100; + attPromise.set_value(result); + }); + //1500ms + //Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0]; + //1ms + //result.outATT = attValue/100 ; } + sosThread.join(); + attThread.join(); + result.outSOS = sosFuture.get(); + result.outATT = attFuture.get(); Recon::notifyProgress(10 + 10 * (iter/numIter)); } diff --git a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.cpp b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.cpp index 291acdb..d7b8c08 100644 --- a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.cpp +++ b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.cpp @@ -21,7 +21,7 @@ namespace Recon bool nonNeg = false; }; - Aurora::Matrix solve( Aurora::Sparse& M, Aurora::Matrix& b, const Aurora::Matrix& dims, int niter, TVAL3SolverOptions solverOptions){ + Aurora::Matrix solve( Aurora::Sparse& M, Aurora::Matrix& b, const Aurora::Matrix& dims, int niter, TVAL3SolverOptions solverOptions, int aDevice){ if (Recon::transParams::name.empty()){ Recon::transParams::name = "TVAL3"; } @@ -39,7 +39,7 @@ namespace Recon opt.mu = solverOptions.TVAL3MU; opt.beta = solverOptions.TVAL3Beta; opt.beta0 = solverOptions.TVAL3Beta0; - int device = (int)solverOptions.gpuSelectionList[0]; + int device = aDevice; return callTval3(M, b, dims, device, opt); } //SART @@ -50,7 +50,7 @@ namespace Recon } std::vector> solveParameterIterator(Aurora::Sparse M, Aurora::Matrix &b, - const Aurora::Matrix &dims, bool oneIter, bool nonNeg) + const Aurora::Matrix &dims, bool oneIter, bool nonNeg, int aDevice) { if (Recon::transParams::name == "TVAL3"){ std::vector> result(Recon::transParams::muValues.getDataSize()); @@ -82,7 +82,7 @@ namespace Recon options.TVAL3Beta = Recon::transParams::betaValues[i]; options.TVAL3Beta0 = Recon::transParams::betaValues[i]; options.nonNeg = nonNeg; - solveResult[j] = solve(M, b, dims, transParams::maxIter, options); + solveResult[j] = solve(M, b, dims, transParams::maxIter, options, aDevice); solveResult[j].forceReshape(dims[0], dims[1], dims.getDataSize()<3?1:dims[2]); } result[i] = solveResult; diff --git a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h index 06b0df8..f572ede 100644 --- a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h +++ b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h @@ -6,7 +6,7 @@ namespace Recon { std::vector> solveParameterIterator(Aurora::Sparse M, Aurora::Matrix &b, - const Aurora::Matrix &dims, bool oneIter = true, bool nonNeg = false); + const Aurora::Matrix &dims, bool oneIter = true, bool nonNeg = false, int aDevice = 0); } // namespace Recon #endif // __SOLVE_H__ \ No newline at end of file diff --git a/src/transmissionReconstruction/startTransmissionReconstruction.cpp b/src/transmissionReconstruction/startTransmissionReconstruction.cpp index 5cdb976..cef6c53 100644 --- a/src/transmissionReconstruction/startTransmissionReconstruction.cpp +++ b/src/transmissionReconstruction/startTransmissionReconstruction.cpp @@ -1,6 +1,7 @@ #include "startTransmissionReconstruction.h" #include "./detection/getTransmissionData.h" #include "Matrix.h" +#include "CudaMatrix.h" #include "log/log.h" #include "common/dataBlockCreation/removeDataFromArrays.h" #include "log/notify.h" @@ -29,22 +30,22 @@ TransmissionReconstructionResult Recon::startTransmissionReconstruction(const Au aGeom, aGeomRef, aExpInfo, aExpInfoRef, aPreComputes, aParser, aParserRef); Matrix dists = Recon::distanceBetweenTwoPoints(transmissionData.senderList, transmissionData.receiverList); Matrix sosRef = Recon::temperatureToSoundSpeed(transmissionData.waterTempList, "marczak"); - Recon::notifyProgress(17); + //Recon::notifyProgress(17); Matrix valid = Recon::checkTofDetections(transmissionData.tofDataTotal, dists, sosRef, Recon::transParams::minSpeedOfSound,Recon::transParams::maxSpeedOfSound).valid; - Recon::notifyProgress(18); + //Recon::notifyProgress(18); if(transParams::qualityCheck) { qualityReview(sum(valid,Aurora::All)[0], transmissionData.dataInfo.numPossibleScans); } - Recon::notifyProgress(19); + //Recon::notifyProgress(19); DiscretizePositionValues positionValues = Recon::discretizePositions(transmissionData.senderList, transmissionData.receiverList, Recon::transParams::numPixelXY); Matrix tofData = removeDataFromArrays(transmissionData.tofDataTotal, valid); Matrix attData = removeDataFromArrays(transmissionData.attDataTotal, valid); Matrix senderList = removeDataFromArrays(positionValues.senderCoordList, valid); Matrix reveiverList = removeDataFromArrays(positionValues.receiverCoordList, valid); - Recon::notifyProgress(20); + //Recon::notifyProgress(20); RECON_INFO("Start reconstructArt."); auto transmissionReon = reconstructArt(tofData, attData, positionValues.dims, senderList, reveiverList, positionValues.res, aTemp.expectedSOSWater[0]); From c40dc8e938d5497893643dd31f4201b9d1a07784 Mon Sep 17 00:00:00 2001 From: sunwen Date: Tue, 26 Dec 2023 15:21:27 +0800 Subject: [PATCH 2/2] Update buildMatrix and solveParameterIterator speed up. --- .../reconstruction/buildMatrix/buildMatrix.cu | 246 ++++++++++++++++++ .../buildMatrix/buildMatrix.cuh | 12 + .../reconstruction/reconstruction.cpp | 42 +-- .../solvingEquationSystem/TVAL/TVAL.cpp | 51 ++-- .../solvingEquationSystem/TVAL/TVAL.h | 2 +- 5 files changed, 299 insertions(+), 54 deletions(-) create mode 100644 src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu create mode 100644 src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh diff --git a/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu new file mode 100644 index 0000000..4807c0f --- /dev/null +++ b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu @@ -0,0 +1,246 @@ +#include "Function2D.cuh" + + +#include +#include +#include +#include +#include + +#include "CudaMatrix.h" +#include "buildMatrix.cuh" +#include "Sparse.h" + + +namespace { + const int THREADS_PER_BLOCK = 256; +} + + + +__device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc) +{ + float temp1; + float temp2; + float temp3; + + temp1 = aEndPt[0]- aStartPt[0]; + temp2 = aEndPt[1]- aStartPt[1]; + temp3 = aEndPt[2]- aStartPt[2]; + temp1 *= aRes[0]; + temp2 *= aRes[1]; + temp3 *= aRes[2]; + temp1 = temp1*temp1; + temp2 = temp2*temp2; + temp3 = temp3*temp3; + return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc; +} + +__global__ struct b3dStruct{ + int max, f, pathlen; + float x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight; +}; +__device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) { + aOut.x = startPoint[0]; + aOut.y = startPoint[1]; + aOut.z = startPoint[2]; + + float dx = endPoint[0] - aOut.x; + float dy = endPoint[1] - aOut.y; + float dz = endPoint[2] - aOut.z; + + aOut.x_inc = (dx < 0) ? -1 : 1; + float l = abs(dx); + aOut.y_inc = (dy < 0) ? -1 : 1; + float m = abs(dy); + aOut.z_inc = (dz < 0) ? -1 : 1; + float n = abs(dz); + aOut.dx2 = l*2;// << 1; + aOut.dy2 = m*2;// << 1; + aOut.dz2 = n*2;// << 1; + float vmax ; + if ((l >= m) && (l >= n)){ + aOut.max =ceilf(l); + vmax = l; + aOut.d = aOut.dx2; + aOut.dx2 = 0; + aOut.f = 0; + } + else if((m >= l) && (m >= n)){ + aOut.max = ceilf(m); + vmax = m; + aOut.d = aOut.dy2; + aOut.dy2 = 0; + aOut.f = 1; + } + else{ + aOut.max = ceilf(n); + vmax = n; + aOut.d = aOut.dz2; + aOut.dz2 = 0; + aOut.f = 2; + } + aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax); + aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax); + aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax); +} + +/** + * @brief 计算traceStraightRayBresenham的pathlen和weight的核函数 + * + * @param aStratPt + * @param aEndPt + * @param aSize + * @param aRes + * @param aOuts + * @return _ + */ +__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){ + unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; + if (idx > aSize)return; + getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]); + + aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max); + if (idx == 196){ + int v=0; + for (int i =0;i<196; i++) { + v+=aOuts[i].max; + } + } +}; + +__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){ + aPath[0] = aStructs[0].max; + for (int i=1; i aSize)return; + b3dStruct* in = aInstruct+idx; + int f; + float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z; + x = in->x; + y = in->y; + z = in->z; + x_inc = in->x_inc; + y_inc = in->y_inc; + z_inc = in->z_inc; + + dx2 = in->dx2; + dy2 = in->dy2; + dz2 = in->dz2; + d = in->d; + + err_0 = in->err_0; + err_1 = in->err_1; + err_2 = in->err_2; + max = in->max; + f = in->f; + + float3 dims3{dims[0],dims[1],dims[2]}; + float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]); + for (float i = 0; i < max; i++) { + output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)}); + // output_ptr[0]=i; + + output_ptr ++; + if (err_0 > 0) { + x += x_inc; + err_0 -= d; + } + if (err_1 > 0) { + y += y_inc; + err_1 -= d; + } + if (err_2 > 0) { + z += z_inc; + err_2 -= d; + } + err_0 += dx2; + err_1 += dy2; + err_2 += dz2; + if (f == 0) x += x_inc; + if (f == 1) y += y_inc; + if (f == 2) z += z_inc; + } +} + +__global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){ + b3dStruct& s = aInstructs[blockIdx.x]; + __shared__ unsigned int beginIdx; + __shared__ unsigned int length; + __shared__ float weight; + + if (threadIdx.x == 0){ + beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1]; + length = s.max; + weight = s.weight; + } + __syncthreads(); + + for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) { + aIOut[beginIdx + threadIdx.x + i] = blockIdx.x; + aSOut[beginIdx + threadIdx.x + i] = weight; + } +} + +Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList, + const Aurora::CudaMatrix &receiverList, + Aurora::CudaMatrix& res, + const Aurora::CudaMatrix &dims, bool BENT, + const Aurora::CudaMatrix &potentialMap){ + Recon::BuildMatrixResult result; + unsigned int numDims = senderList.getDimSize(0); + unsigned int nTotalRays = senderList.getDimSize(1); + b3dStruct* structList= nullptr; + cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays); + int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + calcLenWeightKernel<<>>( + senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList); + cudaDeviceSynchronize(); + + unsigned int* offset= nullptr; + cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays); + calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays); + cudaDeviceSynchronize(); + + unsigned int size; + cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost); + + float* iData = nullptr; + float* jData = nullptr; + float* sData = nullptr; + + cudaMalloc((void**)&iData, sizeof(float)*size); + cudaMalloc((void**)&jData, sizeof(float)*size); + cudaMalloc((void**)&sData, sizeof(float)*size); + + calcRayBLPathKernel<<>>(structList, dims.getData(), offset, jData, nTotalRays); + fillIAndSKernel<<>>(structList, offset, iData, sData); + cudaDeviceSynchronize(); + cudaFree(structList); + + auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1); + auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1); + auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1); + result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0)); + return result; + +} + + + diff --git a/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh new file mode 100644 index 0000000..61a2c06 --- /dev/null +++ b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh @@ -0,0 +1,12 @@ +#ifndef __BUILDMATRIX_CUH__ +#define __BUILDMATRIX_CUH__ +#include "buildMatrix.h" +namespace Recon { + +BuildMatrixResult buildMatrix(const Aurora::CudaMatrix &senderList, + const Aurora::CudaMatrix &receiverList, + Aurora::CudaMatrix& res, + const Aurora::CudaMatrix &dims, bool BENT, + const Aurora::CudaMatrix &potentialMap); +} +#endif // __BUILDMATRIX_H__ \ No newline at end of file diff --git a/src/transmissionReconstruction/reconstruction/reconstruction.cpp b/src/transmissionReconstruction/reconstruction/reconstruction.cpp index 1fee8d8..908bb13 100644 --- a/src/transmissionReconstruction/reconstruction/reconstruction.cpp +++ b/src/transmissionReconstruction/reconstruction/reconstruction.cpp @@ -9,6 +9,7 @@ #include "CudaEnvInit.h" #include "log/notify.h" #include "transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.h" +#include "transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh" #include "src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h" #include @@ -194,8 +195,9 @@ namespace Recon { BuildMatrixResult buildMatrixR; for(int iter=1; iter<=numIter; ++iter) { + auto resDevice = res.toDeviceMatrix(); //1200ms - buildMatrixR = buildMatrix(senderList, receiverList, res, dims, bentRecon && (iter!=1), potentialMap); + buildMatrixR = buildMatrix(senderList.toDeviceMatrix(), receiverList.toDeviceMatrix(), resDevice, dims.toDeviceMatrix(), bentRecon && (iter!=1), potentialMap.toDeviceMatrix()); if(!data.isNull() && bentRecon && iter != numIter) { //与默认配置bentRecon不符,暂不实现todo @@ -229,51 +231,21 @@ namespace Recon { allHitMaps.push_back(buildMatrixR.hitmap); } - std::promise sosPromise; - std::future sosFuture = sosPromise.get_future(); - - std::promise attPromise; - std::future attFuture = attPromise.get_future(); - solveParameterIteratorFunctionType solveParameterIteratorFunctionPointer = &solveParameterIterator; - slownessToSOSFunctionType slownessToSOSFunctionPointer = &slownessToSOS; - std::thread sosThread; - std::thread attThread; - if(!data.isNull()) { //1500ms - sosThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &b, &dims, &SOS_IN_WATER, &sosPromise]() - { - //auto bCopy = b.deepCopy(); - //auto dimCopy = dims.deepCopy(); - Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, b, dims, false, transParams::nonNeg, 0)[0][0]; - result = slownessToSOSFunctionPointer(result, SOS_IN_WATER); - sosPromise.set_value(result); - }); - //Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0]; + Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0]; //1ms - //result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ; + result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ; } if(!dataAtt.isNull()) { - attThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &bAtt, &dims, &attPromise]() - { - //auto bAttCopy = bAtt.deepCopy(); - //auto dimCopy = dims.deepCopy(); - Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg, 1)[0][0]; - result = result / 100; - attPromise.set_value(result); - }); //1500ms - //Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0]; + Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0]; //1ms - //result.outATT = attValue/100 ; + result.outATT = attValue/100 ; } - sosThread.join(); - attThread.join(); - result.outSOS = sosFuture.get(); - result.outATT = attFuture.get(); Recon::notifyProgress(10 + 10 * (iter/numIter)); } diff --git a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.cpp b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.cpp index c2c03bf..9fd82a2 100644 --- a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.cpp +++ b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.cpp @@ -16,28 +16,33 @@ #include #include #include +#include + +#include namespace Recon { + bool isEigsFinished = false; + bool isEigsStatus = false; - double eigs(Aurora::Sparse& aM) + float eigs(Aurora::Sparse& aM) { - double result = NAN; size_t size = aM.getM(); if(size < aM.getN()) { size = aM.getN(); } - Eigen::SparseMatrix M(size, size); - std::vector> triplets; + Eigen::SparseMatrix M(size,size); Aurora::Matrix rows = aM.getRowVector(); Aurora::Matrix columns = aM.getColVector(); Aurora::Matrix values = aM.getValVector(); + std::vector> triplets(rows.getDataSize()); + #pragma omp parallel for for (int i = 0; i < rows.getDataSize(); ++i) { - triplets.push_back(Eigen::Triplet(rows[i], columns[i], values[i])); + triplets[i] = Eigen::Triplet(rows[i], columns[i], values[i]); } - M.setFromTriplets(triplets.begin(), triplets.end()); + float result; Spectra::SparseGenMatProd op(M); Spectra::GenEigsSolver> eigs(op, 1, 6); eigs.init(); @@ -49,14 +54,24 @@ namespace Recon std::complex complex = evalues[0]; result = complex.real(); } - + isEigsFinished = true; + if (result> 1 + 1e-10) + { + isEigsStatus = true; + } return result; } - void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n){ - bool isreal = M.getValVector().getValueType() == Aurora::Normal; - auto s2 = eigs(M); - if (s2> 1 + 1e-10) + void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n) + { + float s2 = 0.0; + if(!isEigsFinished) + { + s2 = eigs(M); + return;; + } + + if (isEigsStatus) { b = b / sqrt(s2); M.getValVector() = M.getValVector() / sqrt( s2); @@ -65,23 +80,23 @@ namespace Recon Aurora::Matrix callTval3(Aurora::Sparse& M, Aurora::Matrix& b,const Aurora::Matrix& dims,int device, TVALOptions& opt) { - checkAndScale(M,b,(size_t)Aurora::prod(dims).getScalar()); + checkAndScale(M,b,(size_t)Aurora::prod(dims).getScalar()); int * yIdxs = new int[M.getColVector().getDataSize()]; std::copy(M.getColVector().getData(),M.getColVector().getData()+M.getColVector().getDataSize(),yIdxs); int * xIdxs = new int[M.getRowVector().getDataSize()]; std::copy(M.getRowVector().getData(),M.getRowVector().getData()+M.getRowVector().getDataSize(),xIdxs); Aurora::Matrix values = M.getValVector(); - size_t cols = M.getM(), rows = M.getN(); - int nz = std::max(M.getColVector().getDataSize(),std::max(M.getRowVector().getDataSize(),M.getValVector().getDataSize())); + size_t cols = M.getM(), rows = M.getN(); + int nz = std::max(M.getColVector().getDataSize(),std::max(M.getRowVector().getDataSize(),M.getValVector().getDataSize())); sparse_matrix_t A; sparse_matrix_t csrA; - mkl_sparse_s_create_coo(&A, sparse_index_base_t::SPARSE_INDEX_BASE_ZERO, rows, cols, nz, yIdxs, xIdxs,values.getData()); - mkl_sparse_convert_csr(A, sparse_operation_t::SPARSE_OPERATION_NON_TRANSPOSE, &csrA); + mkl_sparse_s_create_coo(&A, sparse_index_base_t::SPARSE_INDEX_BASE_ZERO, rows, cols, nz, yIdxs, xIdxs,values.getData()); + mkl_sparse_convert_csr(A, sparse_operation_t::SPARSE_OPERATION_NON_TRANSPOSE, &csrA); int n_rows,n_cols; int *rows_start,*rows_end,*col_indx; float * csrValues; sparse_index_base_t index; - mkl_sparse_s_export_csr(csrA, &index, &n_rows, &n_cols, &rows_start, &rows_end, &col_indx, &csrValues); + mkl_sparse_s_export_csr(csrA, &index, &n_rows, &n_cols, &rows_start, &rows_end, &col_indx, &csrValues); mkl_sparse_destroy(A); delete [] xIdxs; delete [] yIdxs; @@ -101,4 +116,4 @@ namespace Recon return Aurora::Matrix::fromRawData(result.data, result.dims[0],result.dims[1],result.dims[2]); // return Aurora::Matrix(); } -} \ No newline at end of file +} diff --git a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.h b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.h index 35a8630..64b634a 100644 --- a/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.h +++ b/src/transmissionReconstruction/reconstruction/solvingEquationSystem/TVAL/TVAL.h @@ -7,7 +7,7 @@ namespace Recon { void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n); -double eigs(Aurora::Sparse& aM); +float eigs(Aurora::Sparse& aM); Aurora::Matrix callTval3(Aurora::Sparse &M, Aurora::Matrix &b, const Aurora::Matrix &dims, int device,