From 82a2a9e132c512d292ee1f53de266402868bde31 Mon Sep 17 00:00:00 2001 From: kradchen Date: Thu, 21 Nov 2024 09:49:34 +0800 Subject: [PATCH] feat: refactor & clean cpp code in SAFT_TOFI --- SAFT_TOFI/CMakeLists.txt | 5 +- SAFT_TOFI/src/SAFT_TOFI.cpp | 2771 ++++++++++----------------- SAFT_TOFI/src/kernel/rayTracing.cuh | 2 +- SAFT_TOFI/src/processAScans.cpp | 2597 +++++++------------------ SAFT_TOFI/src/saft.cpp | 788 +++----- SAFT_TOFI/src/saft.cu | 15 - SAFT_TOFI/src/saft.hpp | 625 +++--- 7 files changed, 2210 insertions(+), 4593 deletions(-) delete mode 100644 SAFT_TOFI/src/saft.cu diff --git a/SAFT_TOFI/CMakeLists.txt b/SAFT_TOFI/CMakeLists.txt index aed4a1a..644cd3a 100644 --- a/SAFT_TOFI/CMakeLists.txt +++ b/SAFT_TOFI/CMakeLists.txt @@ -4,7 +4,8 @@ set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) enable_language(CUDA) find_package (OpenMP REQUIRED) file(GLOB_RECURSE cu_files ./src/*.cu) -file(GLOB_RECURSE cuh_files ./src/*.cuh) +file(GLOB_RECURSE cuh_files ./src/*.cuh) + add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/processAScans.cpp ./src/saft.cpp ${cu_files} ${cuh_files}) target_include_directories(SaftTofi PRIVATE ../SAFT ./src /usr/local/cuda/include ) set_target_properties(SaftTofi PROPERTIES CUDA_SEPARABLE_COMPILATION ON) @@ -13,7 +14,7 @@ target_compile_options(SaftTofi PRIVATE $<$: --use_fast_math --ptxas-options=-v -arch compute_30 -code compute_30,sm_30 - >) + >) target_link_libraries(SaftTofi PRIVATE ${CUDA_RUNTIME_LIBRARY} ) target_link_libraries(SaftTofi PRIVATE OpenMP::OpenMP_CXX ) diff --git a/SAFT_TOFI/src/SAFT_TOFI.cpp b/SAFT_TOFI/src/SAFT_TOFI.cpp index c2f73b0..5ebd929 100644 --- a/SAFT_TOFI/src/SAFT_TOFI.cpp +++ b/SAFT_TOFI/src/SAFT_TOFI.cpp @@ -1,879 +1,512 @@ #include "SAFT_TOFI.h" -#include "saft.hpp" + +#include +#include +#include #include -#include -#include +#include "saft.hpp" -#include // For German printf output format: Float with , instead of . +#include +#include + +#include +#include +#include #include #include #include -#include - -#include -#include -#include - -#include #include "spdlog/sinks/stdout_color_sinks.h" + // TODO: Blockgroesse (z > 1) fuehrt zu Kernelabbruechen -//pthread handle -typedef struct thread_handle_t { - //pthread_t pthread; +// pthread handle +typedef struct thread_handle_t +{ + // pthread_t pthread; - int deviceId; - int deviceIndex; - float *aScan_ptr; - double *output_ptr; - double *Duration_ptr; - unsigned short *receiver_index_ptr; - unsigned short *emitter_index_ptr; - float *receiver_list_ptr; - int receiver_list_Size; - float *emitter_list_ptr; - int emitter_list_Size; - float *speed_vec_ptr; - int3 SOSGrid_XYZ; - float3 sosOffset; - float SOS_RESOLUTION; + int deviceId; + int deviceIndex; + float *aScan_ptr; + double *output_ptr; + double *Duration_ptr; + unsigned short *receiver_index_ptr; + unsigned short *emitter_index_ptr; + float *receiver_list_ptr; + int receiver_list_Size; + float *emitter_list_ptr; + int emitter_list_Size; + float *speed_vec_ptr; + int3 SOSGrid_XYZ; + float3 sosOffset; + float SOS_RESOLUTION; - float *att_vec_ptr; + float *att_vec_ptr; - int aScanCount; - int aScanLength; + int aScanCount; + int aScanLength; - float inc; - int3 res; - float sampleRate; - float3 volposition; + float inc; + int3 res; + float sampleRate; + float3 volposition; - int num_threads; - dim3 fixedBlockDimensions; + int num_threads; + dim3 fixedBlockDimensions; - float debugMode; - float debugModeParameter; - bool SOSMode_3DVolume; - bool ATTMode_3DVolume; - int SAFT_MODE; - int *SAFT_VARIANT; - int SAFT_VARIANT_Size; - int *Abort_ptr; + float debugMode; + float debugModeParameter; + bool SOSMode_3DVolume; + bool ATTMode_3DVolume; + int SAFT_MODE; + int *SAFT_VARIANT; + int SAFT_VARIANT_Size; + int *Abort_ptr; } thread_handle; - -//Convenient typedefs for GPU-DeviceProperties container +// Convenient typedefs for GPU-DeviceProperties container typedef std::vector DeviceProperties; /** Load CUDA devices and write them to a container. */ -void loadDevices( - DeviceProperties & output ///< This argument is written to. Container in which the device data are stored. - ) +void loadDevices(DeviceProperties &output ///< This argument is written to. Container in which the device data are stored. +) { - #ifdef debug_OutputFunctions - printf( "==> loadDevices - Start\n"); - #endif - int deviceCount; - CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); - - #ifdef debug_OutputInfo - printf( "There are %i devices present:\n", deviceCount); - #endif + int deviceCount; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); output.reserve(static_cast(deviceCount)); - for(int i = 0; i < deviceCount; i++) + for (int i = 0; i < deviceCount; i++) { - cudaDeviceProp & device = output[i]; + cudaDeviceProp &device = output[i]; CUDA_CHECK(cudaGetDeviceProperties(&device, i)); - #ifdef debug_OutputInfo - std::cout << " " << (i + 1) << ". " << device.name << std::endl; - std::cout << " " << "Byte Total Global Mem: " << device.totalGlobalMem <deviceId, // deviceId + pthread_handle->deviceIndex, // deviceIndex -//pthread call function -void thread_function (void *arg) { + pthread_handle->aScan_ptr, // aScan_ptr, + pthread_handle->output_ptr, // output_ptr, + pthread_handle->Duration_ptr, // Duration_ptr, - thread_handle *pthread_handle = (thread_handle*)arg; + pthread_handle->receiver_index_ptr, // receiver_index_ptr ///< + pthread_handle->emitter_index_ptr, // emitter_index_ptr ///< + pthread_handle->receiver_list_ptr, // receiver_list_ptr ///< + pthread_handle->receiver_list_Size, // receiver_list_Size ///< + pthread_handle->emitter_list_ptr, // emitter_list_ptr ///< + pthread_handle->emitter_list_Size, // emitter_list_Size ///< - #ifdef debug_OutputSAFTHandlerThreadPerformance - struct timeval startSAFTHandler, stopSAFTHandler; - gettimeofday(&startSAFTHandler, NULL); - #endif + pthread_handle->speed_vec_ptr, // speed_vec_ptr, - // Create Instance of SAFT-Handler and call constructor - SAFTHandler saft( - pthread_handle->deviceId, // deviceId - pthread_handle->deviceIndex, // deviceIndex + pthread_handle->SOSGrid_XYZ, // SOSGrid_XYZ, + pthread_handle->sosOffset, // sosOffset, ///< Startpoint of SoSGrid + pthread_handle->SOS_RESOLUTION, // SOS_RESOLUTION, ///< Resolution of SoSGrid - pthread_handle->aScan_ptr, // aScan_ptr, - pthread_handle->output_ptr, // output_ptr, - pthread_handle->Duration_ptr, // Duration_ptr, + pthread_handle->att_vec_ptr, // att_vec_ptr - pthread_handle->receiver_index_ptr, // receiver_index_ptr ///< - pthread_handle->emitter_index_ptr, // emitter_index_ptr ///< - pthread_handle->receiver_list_ptr, // receiver_list_ptr ///< - pthread_handle->receiver_list_Size, // receiver_list_Size ///< - pthread_handle->emitter_list_ptr, // emitter_list_ptr ///< - pthread_handle->emitter_list_Size, // emitter_list_Size ///< + pthread_handle->aScanCount, // aScanCount, + pthread_handle->aScanLength, // aScanLength + pthread_handle->res, // resolution => IMAGE_SIZE_XYZ, + pthread_handle->sampleRate, // sampleRate + pthread_handle->volposition, // => regionOfInterestOffset, + pthread_handle->inc, // IMAGE_RESOLUTION, - pthread_handle->speed_vec_ptr, // speed_vec_ptr, + pthread_handle->fixedBlockDimensions, // fixedBlockDimensions + pthread_handle->debugMode, // debugMode + pthread_handle->debugModeParameter, // Parameter for DebugMode - pthread_handle->SOSGrid_XYZ, // SOSGrid_XYZ, - pthread_handle->sosOffset, // sosOffset, ///< Startpoint of SoSGrid - pthread_handle->SOS_RESOLUTION, // SOS_RESOLUTION, ///< Resolution of SoSGrid + pthread_handle->SOSMode_3DVolume, pthread_handle->ATTMode_3DVolume, - pthread_handle->att_vec_ptr, // att_vec_ptr + pthread_handle->SAFT_MODE, pthread_handle->SAFT_VARIANT, pthread_handle->SAFT_VARIANT_Size, + pthread_handle->Abort_ptr // If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch; - pthread_handle->aScanCount, // aScanCount, - pthread_handle->aScanLength, // aScanLength - pthread_handle->res, // resolution => IMAGE_SIZE_XYZ, - pthread_handle->sampleRate, // sampleRate - pthread_handle->volposition, // => regionOfInterestOffset, - pthread_handle->inc, // IMAGE_RESOLUTION, - - pthread_handle->fixedBlockDimensions, // fixedBlockDimensions - pthread_handle->debugMode, // debugMode - pthread_handle->debugModeParameter, // Parameter for DebugMode - - pthread_handle->SOSMode_3DVolume, - pthread_handle->ATTMode_3DVolume, - - pthread_handle->SAFT_MODE, - pthread_handle->SAFT_VARIANT, - pthread_handle->SAFT_VARIANT_Size, - pthread_handle->Abort_ptr // If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch; - - ); + ); saft.performReconstruction(); - - #ifdef debug_OutputSAFTHandlerThreadPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopSAFTHandler, NULL); - double diff_time = (double)((stopSAFTHandler.tv_sec * 1000000.0 + stopSAFTHandler.tv_usec) - (startSAFTHandler.tv_sec * 1000000.0 + startSAFTHandler.tv_usec)); - printf ("{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}\n"); - printf ("{ ~~~> } Device (%i) - SAFTHandler-Thread : %8.0f µs\n", pthread_handle->deviceId, diff_time); - - double performance_Thread = (((double)pthread_handle->aScanCount * (double)pthread_handle->res.x * (double)pthread_handle->res.y * (double)pthread_handle->res.z) / diff_time )/ 1000.0; - printf ("{ ~~~> } Device (%i) - SAFTHandler-Thread Performance: %.6lf A-Scan * GVoxel/s\n", pthread_handle->deviceId, performance_Thread); - printf ("{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}\n"); - #endif - - //pthread_exit(NULL); - + // pthread_exit(NULL); } - - - /** Check amount of GPUs and divide Volume ins parts with same size Abfrage der Anzahl an GPU-Devices und Einteilung des Volumens in möglichst gleichgroße Volumen in Z-Richtung (3D-Volumen) oder Y-Richtung (2D-Volumen) auf. */ -void multithreaded_processing( - float *aScan_ptr, ///< AScan-Daten - double *output_ptr, ///< OutputDaten der Voxel +void multithreaded_processing(float *aScan_ptr, ///< AScan-Daten + double *output_ptr, ///< OutputDaten der Voxel - unsigned short *receiver_index_ptr, ///< Index Receiver per Ascan - unsigned short *emitter_index_ptr, ///< Index Emitter per Ascan - float *receiver_list_ptr, ///< Positionskoordinaten Receiver - int receiver_list_Size, ///< Menge an Receiver - float *emitter_list_ptr, ///< Positionskoordinaten Emitter - int emitter_list_Size, ///< Menge an Emitter + unsigned short *receiver_index_ptr, ///< Index Receiver per Ascan + unsigned short *emitter_index_ptr, ///< Index Emitter per Ascan + float *receiver_list_ptr, ///< Positionskoordinaten Receiver + int receiver_list_Size, ///< Menge an Receiver + float *emitter_list_ptr, ///< Positionskoordinaten Emitter + int emitter_list_Size, ///< Menge an Emitter - float *speed_vec_ptr, ///< SoS Daten im Blockmode oder als SoSGrid + float *speed_vec_ptr, ///< SoS Daten im Blockmode oder als SoSGrid - int3 SOSGrid_XYZ, ///< Size of SoSGrid - float3 sosOffset, ///< Startpoint of SoSGrid - float SOS_RESOLUTION, ///< Aufloesung des SoSGrid + int3 SOSGrid_XYZ, ///< Size of SoSGrid + float3 sosOffset, ///< Startpoint of SoSGrid + float SOS_RESOLUTION, ///< Aufloesung des SoSGrid - float *att_vec_ptr, ///< Attenuation Daten als ATTGrid + float *att_vec_ptr, ///< Attenuation Daten als ATTGrid - int aScanCount, ///< Anzahl der AScans die im Blockmode verarbeitet werden sollen - int aScanLength, ///< Laenge der AscanDaten (normal 3000) - float3 regionOfInterestOffset, - int3 IMAGE_SIZE_XYZ, ///< Groesse des Bildbereichs in Voxel - float IMAGE_RESOLUTION, ///< Aufloesung des Bildbereichs - float sampleRate, ///< Samplerate für AScans - int3 BlockDim_XYZ, ///< BlockDimension für GPU - double *Duration_ptr, ///< Rückgabepointer an Matlab für Laufzeit des SAFT-Kernels - int selectedNumberGPUs, ///< Anzahl der ausgewählten GPUs bzw. auf maximale Anzhal vorhandener begrenzt - int *enableGPUs_ptr, ///< Gibt an welche GPUs genutzt werden und welche nicht - float debugMode, ///< Ausgabe im Debugmode -> Verschiedene Werte können ausgegeben werden - float debugModeParameter, ///< Parameter der mit fuer Debugmode uebermittelt werden kann + int aScanCount, ///< Anzahl der AScans die im Blockmode verarbeitet werden sollen + int aScanLength, ///< Laenge der AscanDaten (normal 3000) + float3 regionOfInterestOffset, + int3 IMAGE_SIZE_XYZ, ///< Groesse des Bildbereichs in Voxel + float IMAGE_RESOLUTION, ///< Aufloesung des Bildbereichs + float sampleRate, ///< Samplerate für AScans + int3 BlockDim_XYZ, ///< BlockDimension für GPU + double *Duration_ptr, ///< Rückgabepointer an Matlab für Laufzeit des SAFT-Kernels + int selectedNumberGPUs, ///< Anzahl der ausgewählten GPUs bzw. auf maximale Anzhal vorhandener begrenzt + int *enableGPUs_ptr, ///< Gibt an welche GPUs genutzt werden und welche nicht + float debugMode, ///< Ausgabe im Debugmode -> Verschiedene Werte können ausgegeben werden + float debugModeParameter, ///< Parameter der mit fuer Debugmode uebermittelt werden kann - bool SOSMode_3DVolume, ///< Wird 3D Volumen für SOS-Korrektur genutzt? - bool ATTMode_3DVolume, ///< Wird 3D Volumen für ATT-Korrektur genutzt? + bool SOSMode_3DVolume, ///< Wird 3D Volumen für SOS-Korrektur genutzt? + bool ATTMode_3DVolume, ///< Wird 3D Volumen für ATT-Korrektur genutzt? - int SAFT_MODE, ///< Modus für SAFT-Rekonstruktion - int *SAFT_VARIANT, ///< Verschiedene Parameter der Rekonstruktion - int SAFT_VARIANT_Size, ///< Menge der verschiedenen Parameter für Rekonstruktion - int *Abort_ptr ///< FehlerArray - ) + int SAFT_MODE, ///< Modus für SAFT-Rekonstruktion + int *SAFT_VARIANT, ///< Verschiedene Parameter der Rekonstruktion + int SAFT_VARIANT_Size, ///< Menge der verschiedenen Parameter für Rekonstruktion + int *Abort_ptr ///< FehlerArray +) { - #ifdef debug_OutputFunctions - printf( "==> multithreaded_processing - Start\n"); - #endif - dim3 fixedBlockDimensions( // convert int3 to dim3 - BlockDim_XYZ.x, - BlockDim_XYZ.y, - BlockDim_XYZ.z - ); + dim3 fixedBlockDimensions( // convert int3 to dim3 + BlockDim_XYZ.x, BlockDim_XYZ.y, BlockDim_XYZ.z); - // Divide workload and show Information ------------------------------------------------------------------------------------------------------------------ - // Divide workload in pieces with the same size for all available GPUs. - // If the workload can not be divided in pieces with the same size, the last piece will be the one with little less workload. + // Divide workload and show Information ------------------------------------------------------------------------------------------------------------------ + // Divide workload in pieces with the same size for all available GPUs. + // If the workload can not be divided in pieces with the same size, the last piece will be the one with little less workload. - // Testfall simuliert die mehrfache Anzahl an GPUs - int num_devices_factor = 1; // Vielfache an GPUs simulieren bzw. kleinere Pakete erzeugen - int num_workingPackages = selectedNumberGPUs*num_devices_factor; + // Testfall simuliert die mehrfache Anzahl an GPUs + int num_devices_factor = 1; // Vielfache an GPUs simulieren bzw. kleinere Pakete erzeugen + int num_workingPackages = selectedNumberGPUs * num_devices_factor; - float3 *position = (float3*)malloc(num_workingPackages * sizeof(float3)); - int3 *resolution = (int3*) malloc(num_workingPackages * sizeof(int3)); - int3 *volumeStartpoint = (int3*) malloc(num_workingPackages * sizeof(int3)); - size_t *volumePtr = (size_t*)malloc(num_workingPackages * sizeof(size_t)); - //int *Abort_ptr = (int*) malloc(num_workingPackages * sizeof(int)); + float3 *position = (float3 *)malloc(num_workingPackages * sizeof(float3)); + int3 *resolution = (int3 *)malloc(num_workingPackages * sizeof(int3)); + int3 *volumeStartpoint = (int3 *)malloc(num_workingPackages * sizeof(int3)); + size_t *volumePtr = (size_t *)malloc(num_workingPackages * sizeof(size_t)); + // int *Abort_ptr = (int*) malloc(num_workingPackages * sizeof(int)); - float3 volposition = regionOfInterestOffset; // Uebergabe der Parameter - float inc = IMAGE_RESOLUTION; // koennte auch direkt umbenannt werden + float3 volposition = regionOfInterestOffset; // Uebergabe der Parameter + float inc = IMAGE_RESOLUTION; // koennte auch direkt umbenannt werden - #ifdef debug_OutputMultiGpu - printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - printf("~ Dividing workload between %i devices ~\n", num_devices ); - printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - #endif + size_t volume_size = (size_t)IMAGE_SIZE_XYZ.x * (size_t)IMAGE_SIZE_XYZ.y * (size_t)IMAGE_SIZE_XYZ.z * (size_t)sizeof(double); // = Groesse des Outputvolumens in Byte - size_t volume_size = (size_t)IMAGE_SIZE_XYZ.x * (size_t)IMAGE_SIZE_XYZ.y * (size_t)IMAGE_SIZE_XYZ.z * (size_t)sizeof(double); // = Groesse des Outputvolumens in Byte + if ((IMAGE_SIZE_XYZ.y == 1) && (IMAGE_SIZE_XYZ.z == 1)) + { + selectedNumberGPUs = 1; + num_workingPackages = 1; + } + volumeStartpoint[0].x = 0; + volumeStartpoint[0].y = 0; + volumeStartpoint[0].z = 0; - if (( IMAGE_SIZE_XYZ.y == 1 ) && ( IMAGE_SIZE_XYZ.z == 1 )){ - selectedNumberGPUs = 1; - num_workingPackages = 1; - #ifdef debug_OutputMultiGpu - printf( "IMAGE_SIZE_XYZ.y = IMAGE_SIZE_XYZ.z = 1 ==> Use only one GPU\n", num_devices); - #endif - } + position[0].x = volposition.x; // Startposition + position[0].y = volposition.y; + position[0].z = volposition.z; + std::vector resolutionZs(num_workingPackages, IMAGE_SIZE_XYZ.z / num_workingPackages); + for (size_t i = 0; i < IMAGE_SIZE_XYZ.z % num_workingPackages; i++) resolutionZs[i]++; - volumeStartpoint[0].x = 0; - volumeStartpoint[0].y = 0; - volumeStartpoint[0].z = 0; + int i, j, k; - position[0].x = volposition.x; // Startposition - position[0].y = volposition.y; - position[0].z = volposition.z; + for (i = 0; i < num_workingPackages; i++) + { + if (IMAGE_SIZE_XYZ.z > 1) + { // Divide in Z-Direction - std::vector resolutionZs(num_workingPackages, IMAGE_SIZE_XYZ.z / num_workingPackages); - for (size_t i = 0; i < IMAGE_SIZE_XYZ.z % num_workingPackages; i++) - resolutionZs[i]++; + resolution[i].x = IMAGE_SIZE_XYZ.x; // Initialization + resolution[i].y = IMAGE_SIZE_XYZ.y; + resolution[i].z = resolutionZs[i]; - int i,j,k; + if (i > 0) + { + volumeStartpoint[i].x = 0; // Koordinaten als Startpunkt für Layer in einzelnen GPUs + volumeStartpoint[i].y = 0; + volumeStartpoint[i].z = volumeStartpoint[i - 1].z + resolution[i - 1].z; + } - for (i=0; i < num_workingPackages; i++ ) { - #ifdef debug_OutputMultiGpu - printf("Working Package [%i]\n", i); - #endif + volumePtr[i] = (size_t)((size_t)resolution[0].x * (size_t)resolution[0].y * (size_t)volumeStartpoint[i].z); // Startpunkt der Speicherstellen fuer das Outputvolumen + } + else + { // Divide in Y-Direction - if ( IMAGE_SIZE_XYZ.z > 1 ) { // Divide in Z-Direction - #ifdef debug_OutputMultiGpu - printf( "( IMAGE_SIZE_XYZ.z > 1 ) => Divide through %i WP in Z-Direction for %i GPUs\n", num_workingPackages, selectedNumberGPUs); - #endif + resolution[i].x = IMAGE_SIZE_XYZ.x; // Initialization + resolution[i].z = IMAGE_SIZE_XYZ.z; - resolution[i].x = IMAGE_SIZE_XYZ.x; // Initialization - resolution[i].y = IMAGE_SIZE_XYZ.y; - resolution[i].z = resolutionZs[i]; + if (IMAGE_SIZE_XYZ.y % num_workingPackages == 0) + { // Volume is divisible - if (i>0) - { - volumeStartpoint[i].x = 0; // Koordinaten als Startpunkt für Layer in einzelnen GPUs - volumeStartpoint[i].y = 0; - volumeStartpoint[i].z = volumeStartpoint[i-1].z + resolution[i-1].z; - } + resolution[i].y = IMAGE_SIZE_XYZ.y / num_workingPackages; + } + else + { // if not divisible, - volumePtr[i] = (size_t)((size_t)resolution[0].x * (size_t)resolution[0].y * (size_t)volumeStartpoint[i].z); //Startpunkt der Speicherstellen fuer das Outputvolumen + if (i != (num_workingPackages - 1)) + { // increment each GPU slice by one + resolution[i].y = IMAGE_SIZE_XYZ.y / num_workingPackages + 1; + } + else + { // except the last one, which get the remaining Layers + resolution[i].y = IMAGE_SIZE_XYZ.y % resolution[0].y; + } + } + } - #ifdef debug_OutputMultiGpu - printf(" - volumeStartpoint[%i] = [%d %d %d]\n", i, volumeStartpoint[i].x, volumeStartpoint[i].y, volumeStartpoint[i].z); - printf(" => volume_size[%i] = [%i %i %i]*double = %lld kB\n", i, resolution[i].x, resolution[i].y, resolution[i].z, ((uint64_t)resolution[i].x * (uint64_t)resolution[i].y * (uint64_t)resolution[i].z * (uint64_t)sizeof(double)) / ((uint64_t)1024)); - printf(" => volumePtr[%i] = [%lld]\n", i, (uint64_t)volumePtr[i]); - #endif + position[i].x = volposition.x; // Startposition + position[i].y = volposition.y; + position[i].z = volposition.z; - } - else { // Divide in Y-Direction - #ifdef debug_OutputMultiGpu - printf( "( IMAGE_SIZE_XYZ.z = 1 ) => Divide through %i in Y-Direction for %i GPUs\n", num_workingPackages, selectedNumberGPUs); - #endif + if (IMAGE_SIZE_XYZ.z > 1) + position[i].z += i * inc * resolution[0].z; // Calculate Startpositions for the workload-pieces + else + position[i].y += i * inc * resolution[0].y; + } - resolution[i].x = IMAGE_SIZE_XYZ.x; // Initialization - resolution[i].z = IMAGE_SIZE_XYZ.z; + // Create one thread per GPU ------------------------------------------------------------------------------------------------------------------------ + thread_handle *pthread_handle = (thread_handle *)malloc(selectedNumberGPUs * sizeof(thread_handle)); - if ( IMAGE_SIZE_XYZ.y % num_workingPackages == 0) { // Volume is divisible + for (i = 0; i < num_workingPackages; i++) + { + // initialize control block + pthread_handle[i].deviceId = enableGPUs_ptr[(i % selectedNumberGPUs)]; // Hier DeviceID der GPU setzen + pthread_handle[i].deviceIndex = (i % selectedNumberGPUs); - resolution[i].y = IMAGE_SIZE_XYZ.y / num_workingPackages; - } - else { // if not divisible, + pthread_handle[i].aScan_ptr = aScan_ptr; + if (IMAGE_SIZE_XYZ.z > 1) + pthread_handle[i].output_ptr = &output_ptr[volumePtr[i]]; // Startpoint for Outputvolume. + // volumePtr[i] = (size_t)(resolution[0].x * resolution[0].y * volumeStartpoint[i].z); //Startpunkt der Speicherstellen fuer das Outputvolumen + else + // pthread_handle[i].output_ptr = &output_ptr[ i * resolution[0].x * resolution[0].y * resolution[0].z ]; // Startpoint for Outputvolume. [0] da nur der letzte eine andere Gr��e hat. + pthread_handle[i].output_ptr = + &output_ptr[(size_t)resolution[0].x * (size_t)i * (size_t)resolution[0].y]; // Startpoint for Outputvolume. [0] da nur der letzte eine andere Groesse hat. Z spielt hier keine Rolle - if (i != (num_workingPackages - 1)){ // increment each GPU slice by one - resolution[i].y = IMAGE_SIZE_XYZ.y / num_workingPackages + 1; - } - else { // except the last one, which get the remaining Layers - resolution[i].y = IMAGE_SIZE_XYZ.y % resolution[0].y; - } + pthread_handle[i].Duration_ptr = Duration_ptr; + pthread_handle[i].receiver_index_ptr = receiver_index_ptr; + pthread_handle[i].emitter_index_ptr = emitter_index_ptr; + pthread_handle[i].receiver_list_ptr = receiver_list_ptr; + pthread_handle[i].receiver_list_Size = receiver_list_Size; + pthread_handle[i].emitter_list_ptr = emitter_list_ptr; + pthread_handle[i].emitter_list_Size = emitter_list_Size; + pthread_handle[i].speed_vec_ptr = speed_vec_ptr; + pthread_handle[i].SOSGrid_XYZ = SOSGrid_XYZ; + pthread_handle[i].sosOffset = sosOffset; + pthread_handle[i].SOS_RESOLUTION = SOS_RESOLUTION; + pthread_handle[i].att_vec_ptr = att_vec_ptr; - } - } + pthread_handle[i].aScanCount = aScanCount; + pthread_handle[i].aScanLength = aScanLength, pthread_handle[i].inc = IMAGE_RESOLUTION; + pthread_handle[i].res = resolution[i]; + pthread_handle[i].sampleRate = sampleRate; + pthread_handle[i].volposition = position[i]; // regionOfInterestOffset - #ifdef debug_OutputMultiGpu - printf("WP[%i] on deviceId[%i]=deviceIndex[%i]\n", i, enableGPUs_ptr[(i % selectedNumberGPUs)], (i % selectedNumberGPUs)); - printf(" - resolution[%i].x = %i\n", i, resolution[i].x ); - printf(" - resolution[%i].y = %i\n", i, resolution[i].y ); - printf(" - resolution[%i].z = %i\n", i, resolution[i].z ); - #endif + pthread_handle[i].num_threads = num_workingPackages; - position[i].x = volposition.x; // Startposition - position[i].y = volposition.y; - position[i].z = volposition.z; + pthread_handle[i].fixedBlockDimensions = fixedBlockDimensions; // + pthread_handle[i].debugMode = debugMode; + pthread_handle[i].debugModeParameter = debugModeParameter; + pthread_handle[i].SOSMode_3DVolume = SOSMode_3DVolume; + pthread_handle[i].ATTMode_3DVolume = ATTMode_3DVolume; - if ( IMAGE_SIZE_XYZ.z > 1 ) position[i].z += i*inc*resolution[0].z; // Calculate Startpositions for the workload-pieces - else position[i].y += i*inc*resolution[0].y; + pthread_handle[i].SAFT_MODE = SAFT_MODE; + pthread_handle[i].SAFT_VARIANT = SAFT_VARIANT; + pthread_handle[i].SAFT_VARIANT_Size = SAFT_VARIANT_Size; - #ifdef debug_OutputMultiGpu - printf(" - position[%i].x = %f\n", i, position[i].x ); - printf(" - position[%i].y = %f\n", i, position[i].y ); - printf(" - position[%i].z = %f\n", i, position[i].z ); - #endif + Abort_ptr[i] = 0; // Initialisieren mit kein Fehler + pthread_handle[i].Abort_ptr = &Abort_ptr[i]; + } - } - - // Create one thread per GPU ------------------------------------------------------------------------------------------------------------------------ - thread_handle *pthread_handle = (thread_handle*)malloc(selectedNumberGPUs * sizeof(thread_handle)); - - #ifdef debug_OutputMultiGpu - printf("-> pthread_handles for %i workingPackages created!\n", num_workingPackages); - printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - #endif - - - - for ( i = 0; i < num_workingPackages; i++ ) { - - //printf("num_workingPackages = %i !\n", num_workingPackages); - //printf("num_devices = %i\n", num_devices); - //printf("(i %% num_devices) = %i !\n", (i % num_devices)); - //printf("deviceId = enableGPUs_ptr[%i] = %i !\n", (i % num_devices), enableGPUs_ptr[(i % num_devices)]); - //printf("deviceIndex = (i %% num_devices) = %i !\n", (i % num_devices)); - - // initialize control block - pthread_handle[i].deviceId = enableGPUs_ptr[(i % selectedNumberGPUs)]; // Hier DeviceID der GPU setzen - pthread_handle[i].deviceIndex = (i % selectedNumberGPUs); - - pthread_handle[i].aScan_ptr = aScan_ptr; - if ( IMAGE_SIZE_XYZ.z > 1 ) - pthread_handle[i].output_ptr = &output_ptr[volumePtr[i]]; // Startpoint for Outputvolume. - //volumePtr[i] = (size_t)(resolution[0].x * resolution[0].y * volumeStartpoint[i].z); //Startpunkt der Speicherstellen fuer das Outputvolumen - else - //pthread_handle[i].output_ptr = &output_ptr[ i * resolution[0].x * resolution[0].y * resolution[0].z ]; // Startpoint for Outputvolume. [0] da nur der letzte eine andere Gr��e hat. - pthread_handle[i].output_ptr = &output_ptr[ (size_t)resolution[0].x * (size_t)i * (size_t)resolution[0].y]; // Startpoint for Outputvolume. [0] da nur der letzte eine andere Groesse hat. Z spielt hier keine Rolle - - - pthread_handle[i].Duration_ptr = Duration_ptr; - pthread_handle[i].receiver_index_ptr = receiver_index_ptr; - pthread_handle[i].emitter_index_ptr = emitter_index_ptr; - pthread_handle[i].receiver_list_ptr = receiver_list_ptr; - pthread_handle[i].receiver_list_Size = receiver_list_Size; - pthread_handle[i].emitter_list_ptr = emitter_list_ptr; - pthread_handle[i].emitter_list_Size = emitter_list_Size; - pthread_handle[i].speed_vec_ptr = speed_vec_ptr; - pthread_handle[i].SOSGrid_XYZ = SOSGrid_XYZ; - pthread_handle[i].sosOffset = sosOffset; - pthread_handle[i].SOS_RESOLUTION = SOS_RESOLUTION; - pthread_handle[i].att_vec_ptr = att_vec_ptr; - - pthread_handle[i].aScanCount = aScanCount; - pthread_handle[i].aScanLength = aScanLength, - pthread_handle[i].inc = IMAGE_RESOLUTION; - pthread_handle[i].res = resolution[i]; - pthread_handle[i].sampleRate = sampleRate; - pthread_handle[i].volposition = position[i]; //regionOfInterestOffset - - pthread_handle[i].num_threads = num_workingPackages; - - pthread_handle[i].fixedBlockDimensions = fixedBlockDimensions; // - pthread_handle[i].debugMode = debugMode; - pthread_handle[i].debugModeParameter = debugModeParameter; - pthread_handle[i].SOSMode_3DVolume = SOSMode_3DVolume; - pthread_handle[i].ATTMode_3DVolume = ATTMode_3DVolume; - - pthread_handle[i].SAFT_MODE = SAFT_MODE; - pthread_handle[i].SAFT_VARIANT = SAFT_VARIANT; - pthread_handle[i].SAFT_VARIANT_Size = SAFT_VARIANT_Size; - - Abort_ptr[i] = 0; // Initialisieren mit kein Fehler - pthread_handle[i].Abort_ptr = &Abort_ptr[i]; - - } - - auto startAllThreads = std::chrono::steady_clock::now(); - double diff_time = 0.0; - - #ifdef debug_OutputMultiGpu - printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - printf("pthread_handles for %i devices initialized!\n", selectedNumberGPUs)-1; - - #endif - - // Zeitmessung ueber alle Threads - //gettimeofday(&startAllThreads, NULL); - - #ifdef debug_OutputMultiGpu - printf("-> pthread_handles for %i workingPackages initialized!\n", num_workingPackages); - printf("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - #endif - - std::vector> futures; - futures.resize(selectedNumberGPUs); + auto startAllThreads = std::chrono::steady_clock::now(); + double diff_time = 0.0; + std::vector> futures; + futures.resize(selectedNumberGPUs); SPDLOG_INFO("Start GPU execute!"); - - for ( j = 0; j < num_devices_factor; j++ ) { + for (j = 0; j < num_devices_factor; j++) + { + for (k = 0; k < selectedNumberGPUs; k++) + { + // new async threads + futures[k] = std::async(std::forward>(thread_function), + std::forward((void *)&pthread_handle[(j * selectedNumberGPUs + k)])); // forward used for perfect forwarding + } - for ( k = 0; k < selectedNumberGPUs; k++ ) { - //for ( i = 0; i < num_devices; i++ ) { + // Synchronization and termination ------------------------------------------------------------------------------------------------------------------- + for (k = 0; k < selectedNumberGPUs; k++) + { + // new async threads + futures[k].wait(); // advantage: async are packaged tasks after c++ with os handling, and consistency handling (if destructor is called, it executes task) + } + } - #ifdef debug_OutputMultiGpu - printf("]~~~~> create thread (%i) [WP (%i) on Device (%i)] and execute thread_function!\n", (j*selectedNumberGPUs+k), (j*selectedNumberGPUs+k), enableGPUs_ptr[k]); - #endif - //create thread and start execution by calling thread_function - //=========================================================================================== - //pthread_create(&pthread_handle[(j*selectedNumberGPUs+k)].pthread, NULL, &thread_function, &pthread_handle[(j*selectedNumberGPUs+k)] ); - //=========================================================================================== - - //new async threads - futures[k] = std::async(std::forward>(thread_function), std::forward((void*)&pthread_handle[(j * selectedNumberGPUs + k)])); //forward used for perfect forwarding - - } - - //Synchronization and termination ------------------------------------------------------------------------------------------------------------------- - for ( k = 0; k < selectedNumberGPUs; k++ ) { - - //wait for threads to finish processing - //pthread_join(pthread_handle[(j*selectedNumberGPUs+k)].pthread, NULL); - #ifdef debug_OutputMultiGpu - printf("<~~~~[ joined thread (%i) [WP (%i) on Device (%i)]!\n", (j*selectedNumberGPUs+k), (j*selectedNumberGPUs+k), enableGPUs_ptr[k]); - #endif - //new async threads - futures[k].wait(); //advantage: async are packaged tasks after c++ with os handling, and consistency handling (if destructor is called, it executes task) - } - } - - //gettimeofday(&stopAllThreads, NULL); + // gettimeofday(&stopAllThreads, NULL); SPDLOG_INFO("GPU execute finish!"); - auto stopAllThreads = std::chrono::steady_clock::now(); - diff_time = std::chrono::duration_cast(stopAllThreads - startAllThreads).count(); // total duration in µs + auto stopAllThreads = std::chrono::steady_clock::now(); + diff_time = std::chrono::duration_cast(stopAllThreads - startAllThreads).count(); // total duration in µs - #ifdef debug_OutputPerformance - printf("\n# NUM_VOXEL = %i * %i * %i = %i\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y * IMAGE_SIZE_XYZ.z ); - printf( "# aScanCount = %i\n", aScanCount); - //printf("volposition.x = %f, .y=%f, .z=%f\n", volposition.x, volposition.y, volposition.z ); + Duration_ptr[0] = diff_time; // Return total duration in µs - - double performance_all = (((double)aScanCount * (double)IMAGE_SIZE_XYZ.x * (double)IMAGE_SIZE_XYZ.y * (double)IMAGE_SIZE_XYZ.z) / diff_time )/ 1000.0; - printf("# Duration of main Processing (all GPUs): %.3f us\n", diff_time); - printf("{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}\n"); - printf("{~~~~~~~~~} Performance: %.6lf A-Scan * GVoxel/s {~~~~~~~~~}\n", performance_all); - printf("{~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~}\n"); - #endif - - Duration_ptr[0] = diff_time; // Return total duration in µs - - - // Speicher wieder freigeben - free (position); - free (resolution); - free (volumeStartpoint); - free (volumePtr); - //free (Abort_ptr); - free (pthread_handle); - - #ifdef debug_OutputFunctions - printf( "<== multithreaded_processing - End\n"); - #endif + // Speicher wieder freigeben + free(position); + free(resolution); + free(volumeStartpoint); + free(volumePtr); + // free (Abort_ptr); + free(pthread_handle); } - - - /** preintegrateAscans Determine maximal SampleWidth, matching to the resolution to be used for reconstruction, and integrate A-scan over an window of this SampleWidth */ -void preintegrateAscans( - float *aScan_ptr, ///< AScan-Daten - float *AscansOut_ptr, ///< AScan-OutputDaten fuer Testrueckgabe - float *speed_vec_ptr, ///< SoS Daten im Blockmode - int aScanCount, ///< Anzahl der AScans die im Blockmode verarbeitet werden sollen - int aScanLength, ///< Laenge der AscanDaten (normal 3000) - float IMAGE_RESOLUTION, ///< Aufloesung des Bildbereichs - float sampleRate, ///< Samplerate fuer AScans - float debugMode, ///< Ausgabe im Debugmode -> Verschiedene Werte können ausgegeben werden - float debugModeParameter ///< Parameter der mit fuer Debugmode uebermittelt werden kann - ) +void preintegrateAscans(float *aScan_ptr, ///< AScan-Daten + float *AscansOut_ptr, ///< AScan-OutputDaten fuer Testrueckgabe + float *speed_vec_ptr, ///< SoS Daten im Blockmode + int aScanCount, ///< Anzahl der AScans die im Blockmode verarbeitet werden sollen + int aScanLength, ///< Laenge der AscanDaten (normal 3000) + float IMAGE_RESOLUTION, ///< Aufloesung des Bildbereichs + float sampleRate, ///< Samplerate fuer AScans + float debugMode, ///< Ausgabe im Debugmode -> Verschiedene Werte können ausgegeben werden + float debugModeParameter ///< Parameter der mit fuer Debugmode uebermittelt werden kann +) { - #ifdef debug_OutputFunctions - printf( "==> preintegrateAscans - Start\n"); - #endif + float windowWidth = 0.0f; + float windowWidthHalf = 0.0f; - // Zweiten AScan-Buffer erzeugen mit Länge eines AScans (z.B. 3000) + // maximale Schrittweite ueber einen Voxel = sqr(3)*2*IMAGE_RESOLUTION*fs/c // sqr(3)*2 = 3.464101615 + // width = ( ceil( 1.7*(( resz / speedz)/ (timeintz/INTERP_RATIO)) )); % Breite berechnen + windowWidth = (float)3.464101615 * IMAGE_RESOLUTION / sampleRate / speed_vec_ptr[0]; + windowWidthHalf = (float)1.732050808 * IMAGE_RESOLUTION / sampleRate / speed_vec_ptr[0]; // halbe Fenster Breite +#pragma omp parallel for num_threads(32) + for (int j = 0; j < aScanCount; j++) + { // über alle A-scans gehen. -// //Debuging: Test: Alle Daten uebertragen von Input-Ascans auf AscansOut_ptr[0..aScanLength] ueber AscanBuffer -// for (int i = 0; i i (%3i) \n",i); -// //AscansOut_ptr[i] = *(aScan_ptr+i); -// AscanBuffer[i] = aScan_ptr[i]; -// AscansOut_ptr[i] = AscanBuffer[i]; -// } + float *AscanBuffer = (float *)malloc(aScanLength * sizeof(float)); + int i_start, i_end = 0; + float nSample = 0.0f; + float windowWidthHalf_minus1 = 0.0f; + float windowSum = 0.0f; + if ((int)ceil(windowWidth) % 2 == 1) + { // Uneven / Ungerade + // // Bei ungeraden Breiten kann symmetrisch sampl = widthHalf_minus1 = floor((ceil(width)-1)/2) genutzt werden - #ifdef preAscanIntegrationVersion1Michael // direkt übernommene Version von Michael Zapf + windowWidthHalf_minus1 = floor((ceil(windowWidth) - 1) / 2); + for (int i = 0; i < aScanLength; i++) + { // über gesamte Breite des A-scans gehen. + i_start = i - (int)windowWidthHalf_minus1; + i_end = i + (int)windowWidthHalf_minus1; - int windowWidth = 0; + // Grenzen einhalten + if (i_start < 0) + i_start = 0; + if (i_end > aScanLength - 1) + i_end = aScanLength - 1; - /////////////////////// Integration über Anzahl "sampl" voxel /////////////////////// - unsigned int i,j,sampl = 0; - double i_buffer=0.0; - /////xsum ueber diagonale der voxellänge - sampl = (unsigned int)(ceil((float)1.7*(( IMAGE_RESOLUTION / speed_vec_ptr[0])/ (sampleRate)) /2)); //halbe Breite bestimmen - windowWidth = (int)2*IMAGE_RESOLUTION/sampleRate/speed_vec_ptr[0]; - #ifdef debug_preAscanIntegration - printf( "~~~~~~~~~~~~~~~~~ V1 Michael ~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - printf( " aScanCount = %5i\n",aScanCount); - printf( " aScanLength = %5i\n",aScanLength); - printf( " IMAGE_RESOLUTION = %12.10f\n",IMAGE_RESOLUTION); - printf( " speed_vec_ptr[%3i] = %12.10f\n",i,speed_vec_ptr[i]); - printf( " sampleRate = %12.10f\n\n",sampleRate); + // Anzahl Sample bestimmen + nSample = i_end - i_start + 1; // +1 da erstes Element auch dazugehört. - printf( " => windowWidth = %5i (Ganze Breite) \n",windowWidth); - printf( " => sampl = %5i (Halbe Breite) \n",sampl); - printf( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - #endif + windowSum = 0; + for (int k = i_start; k <= i_end; k++) + { + windowSum += aScan_ptr[j * aScanLength + k]; + } - printf( " => windowWidth = %5i (Ganze Breite) \n",windowWidth); - printf( " => sampl = %5i (Halbe Breite) \n",sampl); + // geteilt durch nur die genutzten Samples + // AscanBuffer[i] = windowSum/ceil(nSample); + // Michael teilt durch die gesamte Breite an Samples,auch wenn sie an dem + // Rand nicht vorhanden/bzw = 0 sind. --> Abflachung am Rand + AscanBuffer[i] = windowSum / ceil(windowWidth); + } + } + else if ((int)ceil(windowWidth) % 2 == 0) + { // Even / Gerade + // Bei geraden Breiten symmetrisch mit den beiden äußeren Samplewerten zu je 1/2 gewichten. - // Zugriff über - // sec_buffer, das mit INTERP_RATIO gestreckt ist - // bufferz ist der vorher schon angelegte Buffer der gleichen Größe wie sec_buffer - // sampl = ist die Breite des Additionsfensters für die Addition + windowWidthHalf_minus1 = floor((ceil(windowWidth) - 1) / 2); + for (int i = 0; i < aScanLength; i++) + { // über gesamte Breite des A-scans gehen. + i_start = i - (int)windowWidthHalf_minus1; + i_end = i + (int)windowWidthHalf_minus1; + // Grenzen einhalten + if (i_start < 0) + i_start = 0; + if (i_end > aScanLength - 1) + i_end = aScanLength - 1; - if ((debugMode == 0.0) && (debugModeParameter == 2.0)){ - printf( "~~~~~~~~~~~~~~~~~~~~~ !!!Use Abs before Preintegrate Ascans!!! ~~~~~~~~~~~~~~~~~~~~~\n"); - for (int j = 0; j Ascan (j=%3i) ==> j*aScanLength+0 = %i \n\n",j, j*aScanLength); - #endif - i_buffer = 0; - for (i=0;i Abflachung am Rand + windowSum = windowSum / ceil(windowWidth); - for (i=0;i= 0) + { + windowSum = windowSum + aScan_ptr[j * aScanLength + (i_start - 1)] / (2 * ceil(windowWidth)); // Linken Nachbarn zu 1/2 mit dazunehmen + nSample = nSample + 0.5; + } + if (i_end < aScanLength - 1) + { + windowSum = windowSum + aScan_ptr[j * aScanLength + (i_end + 1)] / (2 * ceil(windowWidth)); // Rechten Nachbarn zu 1/2 mit dazunehmen + nSample = nSample + 0.5; + } - for (i=sampl;i<(aScanLength)-sampl;i++) - { if (i+sampl=0){ - i_buffer = i_buffer - aScan_ptr[j*aScanLength+i-sampl]/(2*sampl); - //AscanBuffer[i] = i_buffer / (sampl-(aScanLength)-i); // Michaels alter Code - AscanBuffer[i] = i_buffer ; // passt so besser zu Anfangsstrategie - } - } - - //Alle Daten uebertragen von AscanBuffer auf AscansOut_ptr[0..aScanLength] - for (int i = 0; i windowWidth = %5.2f (Ganze Breite) \n",windowWidth); - printf( " => windowWidthHalf = %5.2f (Halbe Breite) \n",windowWidthHalf); - printf( "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); - - printf ("\n 2. Testvariante mit Start und Endpkt-berechnung --> Neue Implementierung mit Zetrum in der Mitte \n#######################################################################################\n"); - #endif - - //printf( " => windowWidth = %5i (Ganze Breite) \n",(int)ceil(windowWidth)); - //printf( " => windowWidthHalf = %5.2f (Halbe Breite) \n",windowWidthHalf); - #pragma omp parallel for num_threads(32) - for (int j = 0; j Ungerade ==> widthHalf_minus1 = %i \n",(int)windowWidthHalf_minus1); - #endif - - for (int i = 0; i < aScanLength; i++){ // über gesamte Breite des A-scans gehen. - i_start = i - (int)windowWidthHalf_minus1; - i_end = i + (int)windowWidthHalf_minus1; - - // Grenzen einhalten - if (i_start < 0) i_start=0; - if (i_end > aScanLength-1) i_end =aScanLength-1; - - // Anzahl Sample bestimmen - nSample = i_end-i_start+1; // +1 da erstes Element auch dazugehört. - - windowSum = 0; - for (int k = i_start; k <= i_end; k++){ - #ifdef debug_preAscanIntegration - if ((i >= DebugSammleMin) && (i<=DebugSammleMax)){ - printf("Index %3i: windowSum (%3i:%3i) += aScan_ptr(k=%i) %07.5f \n", i, i_start, i_end, k, windowSum); - } - #endif - windowSum += aScan_ptr[j*aScanLength+k]; - } - - // geteilt durch nur die genutzten Samples - //AscanBuffer[i] = windowSum/ceil(nSample); - //Michael teilt durch die gesamte Breite an Samples,auch wenn sie an dem - //Rand nicht vorhanden/bzw = 0 sind. --> Abflachung am Rand - AscanBuffer[i] = windowSum/ceil(windowWidth); - - #ifdef debug_preAscanIntegration - //if ((i >= 1720) && (i<=1730)){ - if ((i >= DebugSammleMin) && (i<=DebugSammleMax)){ - printf("Index %3i: Summe über %5i Samples (%3i:%3i) = %07.5f \n", i, (int)nSample, i_start, i_end, AscanBuffer[i]); - } - #endif - } - - - } - else if ((int)ceil(windowWidth)%2 == 0){ // Even / Gerade - - // Bei geraden Breiten symmetrisch mit den beiden äußeren Samplewerten zu je 1/2 gewichten. - - windowWidthHalf_minus1 = floor(( ceil(windowWidth)-1)/2); - #ifdef debug_preAscanIntegration - printf( " ==> Gerade ==> widthHalf_minus1 = %i \n", (int)windowWidthHalf_minus1); - #endif - - for (int i = 0; i < aScanLength; i++){ // über gesamte Breite des A-scans gehen. - i_start = i - (int)windowWidthHalf_minus1; - i_end = i + (int)windowWidthHalf_minus1; - - // Grenzen einhalten - if (i_start < 0) i_start=0; - if (i_end > aScanLength-1) i_end =aScanLength-1; - - // Anzahl Sample bestimmen - nSample = i_end-i_start+1; // +1 da erstes Element auch dazugehört. - - windowSum = 0; - for (int k = i_start; k <= i_end; k++){ - #ifdef debug_preAscanIntegration - //if ((i >= 1720) && (i<=1730)){ - if ((i >= DebugSammleMin) && (i<=DebugSammleMax)){ - printf("Index %3i: windowSum (%3i:%3i) += aScan_ptr(k=%i) %07.5f \n", i, i_start, i_end, k, windowSum); - } - #endif - windowSum += aScan_ptr[j*aScanLength+k]; - } - - // geteilt wird durch nur die genutzten Samples - //AscanBuffer[i] = windowSum/ceil(nSample); - //Michael teilt durch die gesamte Breite an Samples,auch wenn sie an dem - //Rand nicht vorhanden/bzw = 0 sind.--> Abflachung am Rand - windowSum = windowSum/ceil(windowWidth); - - // Halbe Samplewerte an Grenzen miteinberechen aber absolute Grenzen einhalten - if (i_start >= 0){ - windowSum = windowSum + aScan_ptr[j*aScanLength+(i_start-1)]/(2*ceil(windowWidth)); // Linken Nachbarn zu 1/2 mit dazunehmen - nSample = nSample + 0.5; - } - if (i_end < aScanLength-1){ - windowSum = windowSum + aScan_ptr[j*aScanLength+(i_end+1)]/(2*ceil(windowWidth)); // Rechten Nachbarn zu 1/2 mit dazunehmen - nSample = nSample + 0.5; - } - - AscanBuffer[i] = windowSum; - #ifdef debug_preAscanIntegration - //if ((i >= 1720) && (i<=1730)){ - if ((i >= DebugSammleMin) && (i<=DebugSammleMax)){ - printf("Index %3i: Summe über %5.1f Samples (%3i:%3i) = %07.5f \n", i, nSample, i_start, i_end, AscanBuffer[i]); - } - #endif - - } - - } - - // Transfer Data from Buffer to Memory regions - for (int i = 0; i < aScanLength; i++){ - //printf( " i (%4i) = %6.3f ",i,AscanBuffer[i]); - //AscansOut_ptr[i] = *(aScan_ptr+i); - //AscanBuffer[i] = aScan_ptr[i]; - aScan_ptr[j*aScanLength+i] = AscanBuffer[i]; // Write in A-scans Memory - AscansOut_ptr[j*aScanLength+i] = AscanBuffer[i]; // Also write back for Matlab - } - free(AscanBuffer); - } - - #ifdef debug_preAscanIntegration - printf ("#######################################################################################\n\n"); - #endif - - - - #endif - - #ifdef debug_preAscanIntegration - printf( " \n\n\n\n "); - #ifdef debug_OutputVariables - // Ausgabe einzelner AScan-Samples der �bergabewerte - //float *aScan_ptr; - //aScan_ptr = (float*)mxGetPr(AScan); - //int aScanSampleCountPerReceiver=3000; - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",0 , *(aScan_ptr+0), *(aScan_ptr+1), *(aScan_ptr+2)); - printf( " -> AScan (%3i) [1726:1729] = [%f %f %f %f]\n",0 , *(aScan_ptr+1726),*(aScan_ptr+1726+1), *(aScan_ptr+1726+2), *(aScan_ptr+1726+3)); - // Ale drei Moeglichkeiten koennen angewandt werden - //*(AscansOut_ptr+1726) = *(aScan_ptr+1726); - //*(AscansOut_ptr+1726+1) = 0; - //AscansOut_ptr[1726+2] = (float)*(aScan_ptr+1726); - printf( " -> AScan (%3i) [1726:1729] = [%f %f %f %f]\n",0 , *(aScan_ptr+1726),*(aScan_ptr+1726+1), *(aScan_ptr+1726+2), *(aScan_ptr+1726+3)); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",0 , *(aScan_ptr+2997), *(aScan_ptr+2998), *(aScan_ptr+2999)); - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",1 , aScan_ptr[3000], aScan_ptr[3001], aScan_ptr[3002]); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",1 , aScan_ptr[5997], aScan_ptr[5998], aScan_ptr[5999]); - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",156 , aScan_ptr[0+(156*aScanSampleCountPerReceiver)], aScan_ptr[1+(156*aScanSampleCountPerReceiver)], aScan_ptr[2+(156*aScanSampleCountPerReceiver)]); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",156 , aScan_ptr[2997+(156*aScanSampleCountPerReceiver)], aScan_ptr[2998+(156*aScanSampleCountPerReceiver)], aScan_ptr[2999+(156*aScanSampleCountPerReceiver)]); - #endif - #endif - - // free(AscanBuffer); - -#ifdef debug_OutputFunctions - printf( "<== preintegrateAscans - End\n"); -#endif + // Transfer Data from Buffer to Memory regions + for (int i = 0; i < aScanLength; i++) + { + // printf( " i (%4i) = %6.3f ",i,AscanBuffer[i]); + // AscansOut_ptr[i] = *(aScan_ptr+i); + // AscanBuffer[i] = aScan_ptr[i]; + aScan_ptr[j * aScanLength + i] = AscanBuffer[i]; // Write in A-scans Memory + AscansOut_ptr[j * aScanLength + i] = AscanBuffer[i]; // Also write back for Matlab + } + free(AscanBuffer); + } } -const size_t* GetDimensions(const Matrix_t& matrix) +const size_t *GetDimensions(const Matrix_t &matrix) { return matrix.Dims; } +const void *GetPr(const Matrix_t &matrix) { return matrix.Data; } + +size_t GetNumberOfDimensions(const Matrix_t &matrix) { return matrix.NumberOfDims; } + +size_t GetNumberOfElements(const Matrix_t &matrix) { return matrix.DataSize; } + +Matrix_t SAFT_TOFI(std::vector ¶ms) { - return matrix.Dims; -} -const void* GetPr(const Matrix_t& matrix){ - return matrix.Data; -} - -size_t GetNumberOfDimensions(const Matrix_t& matrix){ - return matrix.NumberOfDims; -} - -size_t GetNumberOfElements(const Matrix_t& matrix){ - return matrix.DataSize; -} - -Matrix_t SAFT_TOFI(std::vector& params){ - #ifdef debug_OutputFunctions - printf( "==> mexFunction - Start\n"); - #endif - - #ifdef debug_OutputFormat_German - setlocale(LC_NUMERIC, "de_DE"); // German Format , instead . for numbers - #endif auto console_sink = std::make_shared(); console_sink->set_level(spdlog::level::info); console_sink->set_pattern(fmt::format("[%Y-%m-%d %T .%f][{}] [%^%l%$] %v", "SAFT")); @@ -882,1090 +515,644 @@ Matrix_t SAFT_TOFI(std::vector& params){ logger->flush_on(spdlog::level::info); SPDLOG_INFO("Start SAFT!"); - size_t AScan_Nx, AScan_Mx, - pix_vect_Nx, pix_vect_Mx, - receiver_index_Nx, receiver_index_Mx, - emitter_index_Nx, emitter_index_Mx, - receiver_list_Nx, receiver_list_Mx, - emitter_list_Nx, emitter_list_Mx, + size_t AScan_Nx, AScan_Mx, pix_vect_Nx, pix_vect_Mx, receiver_index_Nx, receiver_index_Mx, emitter_index_Nx, emitter_index_Mx, receiver_list_Nx, receiver_list_Mx, emitter_list_Nx, emitter_list_Mx, - SAFT_mode_Nx, SAFT_mode_Mx, - SAFT_variant_Nx, SAFT_variant_Mx, + SAFT_mode_Nx, SAFT_mode_Mx, SAFT_variant_Nx, SAFT_variant_Mx, - speed_Nx, speed_Mx, - SOSGrid_Xx, SOSGrid_Yx, SOSGrid_Zx, - sos_startPoint_Nx, sos_startPoint_Mx, - sos_res_Nx, sos_res_Mx, + speed_Nx, speed_Mx, SOSGrid_Xx, SOSGrid_Yx, SOSGrid_Zx, sos_startPoint_Nx, sos_startPoint_Mx, sos_res_Nx, sos_res_Mx, - attVolume_Nx,attVolume_Mx, - ATTGrid_Xx, ATTGrid_Yx, ATTGrid_Zx, + attVolume_Nx, attVolume_Mx, ATTGrid_Xx, ATTGrid_Yx, ATTGrid_Zx, - res_Nx, res_Mx, - timeint_Nx, timeint_Mx, - IMAGE_XYZ_Nx, IMAGE_XYZ_Mx, - IMAGE_SUM_Xx, IMAGE_SUM_Yx, IMAGE_SUM_Zx, - BlockDim_XYZ_Nx, BlockDim_XYZ_Mx, + res_Nx, res_Mx, timeint_Nx, timeint_Mx, IMAGE_XYZ_Nx, IMAGE_XYZ_Mx, IMAGE_SUM_Xx, IMAGE_SUM_Yx, IMAGE_SUM_Zx, BlockDim_XYZ_Nx, BlockDim_XYZ_Mx, - GPUs_Nx, GPUs_Mx, - dbgMode_Nx,dbgMode_Mx - ; + GPUs_Nx, GPUs_Mx, dbgMode_Nx, dbgMode_Mx; - int aScanCount; - int aScanLength; - float *aScan_ptr; - int3 IMAGE_SIZE_XYZ; - int3 BlockDim_XYZ; - bool SOSMode_3DVolume; // Mode of SOS use: Grid (1) or Block (0) --> SOSGrid_XYZ, SOS_RESOLUTION, sosOffset not neccessary - bool ATTMode_3DVolume; // Mode of SOS use: Grid (1) or Block (0) --> ATTGrid_XYZ not neccessary - int SAFT_MODE; - int *SAFT_VARIANT; // Variances of SAFT - int SAFT_VARIANT_Size; // Size of Varainces - int3 SOSGrid_XYZ; // Size of SOSGrid - int3 ATTGrid_XYZ; // Size of ATTGrid - float3 regionOfInterestOffset; // Startpoint - float3 sosOffset; // Startpoint SoS - float IMAGE_RESOLUTION; // Aufloesung - float SOS_RESOLUTION; // Aufloesung - float sampleRate; // Samplerate für AScans - int selectedNumberGPUs; // Anzahl der genutzten GPUs durch uebergebene Groesse der Ausgewaehlten GPUs - float debugMode; - float debugModeParameter; + int aScanCount; + int aScanLength; + float *aScan_ptr; + int3 IMAGE_SIZE_XYZ; + int3 BlockDim_XYZ; + bool SOSMode_3DVolume; // Mode of SOS use: Grid (1) or Block (0) --> SOSGrid_XYZ, SOS_RESOLUTION, sosOffset not neccessary + bool ATTMode_3DVolume; // Mode of SOS use: Grid (1) or Block (0) --> ATTGrid_XYZ not neccessary + int SAFT_MODE; + int *SAFT_VARIANT; // Variances of SAFT + int SAFT_VARIANT_Size; // Size of Varainces + int3 SOSGrid_XYZ; // Size of SOSGrid + int3 ATTGrid_XYZ; // Size of ATTGrid + float3 regionOfInterestOffset; // Startpoint + float3 sosOffset; // Startpoint SoS + float IMAGE_RESOLUTION; // Aufloesung + float SOS_RESOLUTION; // Aufloesung + float sampleRate; // Samplerate für AScans + int selectedNumberGPUs; // Anzahl der genutzten GPUs durch uebergebene Groesse der Ausgewaehlten GPUs + float debugMode; + float debugModeParameter; - // check number of arguments... // Anzahl der Argumente überprüfen - #ifdef debug_OutputParameter - printf( "Number of Arguments: Input(nrhs) = %i Output(nlhs) = %i\n", nrhs, nlhs); - #endif - - if (params.size() != 19) + if (params.size() != 19) { - printf( " \n"); - printf( " Inputparameter \n"); - printf( " In[n] Meaning [Row N x Col M] Type\n"); - printf( " =============================================================================\n"); - printf( " 1-prhs[0] AScan-Data [3000?xnAscans] single\n"); - printf( " 2-prhs[1] IMAGE_STARTPOINT_S [1x3] single\n"); + printf(" \n"); + printf(" Inputparameter \n"); + printf(" In[n] Meaning [Row N x Col M] Type\n"); + printf(" =============================================================================\n"); + printf(" 1-prhs[0] AScan-Data [3000?xnAscans] single\n"); + printf(" 2-prhs[1] IMAGE_STARTPOINT_S [1x3] single\n"); - printf( " 3-prhs[2] receiver_index [1xnAscans] uint16\n"); - printf( " 4-prhs[3] emitter_index [1xnAscans] uint16\n"); - printf( " 5-prhs[4] receiver_list [3xnReceiver] single\n"); - printf( " 6-prhs[5] emitter_list [3xnEmitter] single\n"); + printf(" 3-prhs[2] receiver_index [1xnAscans] uint16\n"); + printf(" 4-prhs[3] emitter_index [1xnAscans] uint16\n"); + printf(" 5-prhs[4] receiver_list [3xnReceiver] single\n"); + printf(" 6-prhs[5] emitter_list [3xnEmitter] single\n"); - printf( " 7-prhs[6] SAFT_mode [1x1] uint32\n"); - printf( " 8-prhs[7] SAFT_variant [1x6] uint32\n"); + printf(" 7-prhs[6] SAFT_mode [1x1] uint32\n"); + printf(" 8-prhs[7] SAFT_variant [1x6] uint32\n"); - printf( " Standard: [1 1 1 1 0 0] \n"); - printf( " -> Ascan Preintegration\n"); - printf( " -> Ascan Interpolation\n"); - printf( " -> Preprocessing SOS&ATT 3D Volume Interpolation\n"); - printf( " -> Reconstruction SOS&ATT 3D Volume Interpolation\n"); - printf( " -> not yet\n"); - printf( " -> not yet\n"); + printf(" Standard: [1 1 1 1 0 0] \n"); + printf(" -> Ascan Preintegration\n"); + printf(" -> Ascan Interpolation\n"); + printf(" -> Preprocessing SOS&ATT 3D Volume Interpolation\n"); + printf(" -> Reconstruction SOS&ATT 3D Volume Interpolation\n"); + printf(" -> not yet\n"); + printf(" -> not yet\n"); - printf( " 9-prhs[8] SOSVolume [SOS_XxYxZ] single in m/s\n"); - printf( " 10-prhs[9] SOS_STARTPOINT_S [1x3] single\n"); - printf( " 11-prhs[10] SOS_RESOLUTION_S [1x1] single\n"); + printf(" 9-prhs[8] SOSVolume [SOS_XxYxZ] single in m/s\n"); + printf(" 10-prhs[9] SOS_STARTPOINT_S [1x3] single\n"); + printf(" 11-prhs[10] SOS_RESOLUTION_S [1x1] single\n"); - printf( " 12-prhs[11] ATTVolume [ATT_XxYxZ] single in dB/cm\n"); + printf(" 12-prhs[11] ATTVolume [ATT_XxYxZ] single in dB/cm\n"); - printf( " 13-prhs[12] IMAGE_RESOLUTION_S [1x1] single\n"); - printf( " 14-prhs[13] TimeInterval_S [1x1] single\n"); - printf( " 15-prhs[14] IMAGE_XYZ [1x3] uint32\n"); - printf( " 16-prhs[15] IMAGE_SUM [Output_XxYxZ] double\n"); + printf(" 13-prhs[12] IMAGE_RESOLUTION_S [1x1] single\n"); + printf(" 14-prhs[13] TimeInterval_S [1x1] single\n"); + printf(" 15-prhs[14] IMAGE_XYZ [1x3] uint32\n"); + printf(" 16-prhs[15] IMAGE_SUM [Output_XxYxZ] double\n"); - printf( " 17-prhs[16] BlockDim_XYZ (GPU) [1x3] uint32\n"); - printf( " 18-prhs[17] GPUs (DeviceNr GPU) [1xn] uint32\n"); + printf(" 17-prhs[16] BlockDim_XYZ (GPU) [1x3] uint32\n"); + printf(" 18-prhs[17] GPUs (DeviceNr GPU) [1xn] uint32\n"); - printf( " 19-prhs[18] dbgMode,dbgModeParam [1x2] single\n"); + printf(" 19-prhs[18] dbgMode,dbgModeParam [1x2] single\n"); - printf( " ==============================================================================\n"); - printf( "\n"); - printf( " Outputparameter \n"); - printf( " Out[n] Meaning \n"); - printf( " ================================================================================================= \n"); - printf( " plhs[0] = Output_Voxels = mxCreateNumericArray ( [IMAGE_XYZ] , mxDOUBLE_CLASS, mxREAL); \n"); - printf( " plhs[1] = Duration = mxCreateDoubleMatrix ( [nGPUs+1, 1] , mxREAL); \n"); - printf( " plhs[2] = Output_Ascans = mxCreateNumericMatrix( [3000?,nAscans], mxSINGLE_CLASS, mxREAL); \n"); - printf( " ================================================================================================= \n"); - printf("Wrong number of input arguments. Should be 19."); + printf(" ==============================================================================\n"); + printf("\n"); + printf(" Outputparameter \n"); + printf(" Out[n] Meaning \n"); + printf(" ================================================================================================= \n"); + printf(" plhs[0] = Output_Voxels = mxCreateNumericArray ( [IMAGE_XYZ] , mxDOUBLE_CLASS, mxREAL); \n"); + printf(" plhs[1] = Duration = mxCreateDoubleMatrix ( [nGPUs+1, 1] , mxREAL); \n"); + printf(" plhs[2] = Output_Ascans = mxCreateNumericMatrix( [3000?,nAscans], mxSINGLE_CLASS, mxREAL); \n"); + printf(" ================================================================================================= \n"); + printf("Wrong number of input arguments. Should be 19."); } // assign input arguments... // Bestimme die Eingangswerte - const Matrix_t&AScan = params [0]; // AScan-Data - const Matrix_t&pix_vect = params [1]; // Image Startpoint (IMAGE_STARTPOINT_S) + const Matrix_t &AScan = params[0]; // AScan-Data + const Matrix_t &pix_vect = params[1]; // Image Startpoint (IMAGE_STARTPOINT_S) + + const Matrix_t &receiver_index = params[2]; // Index Data for Receiver-Position Data + const Matrix_t &emitter_index = params[3]; // Index Data for Emitter-Position Data + const Matrix_t &receiver_list = params[4]; // Assignment Index to Receiver-Position Data + const Matrix_t &emitter_list = params[5]; // Assignment Index to Emitter-Position Data + + const Matrix_t &SAFT_mode = params[6]; // SOS?, ATT? + const Matrix_t &SAFT_variant = params[7]; // Differnt Mode-Parameter for Reconstruction - const Matrix_t&receiver_index = params [2]; // Index Data for Receiver-Position Data - const Matrix_t&emitter_index = params [3]; // Index Data for Emitter-Position Data - const Matrix_t&receiver_list = params [4]; // Assignment Index to Receiver-Position Data - const Matrix_t&emitter_list = params [5]; // Assignment Index to Emitter-Position Data - - const Matrix_t&SAFT_mode = params [6]; // SOS?, ATT? - const Matrix_t&SAFT_variant = params [7]; // Differnt Mode-Parameter for Reconstruction - - const Matrix_t&speed = params [8]; // Speed of Sound Data (Single, SoS-Grid) - const Matrix_t&sos_startPoint = params [9]; // Startpoint of Speed of Sound Grid - const Matrix_t&sos_res = params [10]; // SoS Grid Resolution - - const Matrix_t&attVolume = params [11]; // Attenuation Data (Single, SoS-Grid) - - const Matrix_t&res = params [12]; // Output Volume Resolution - const Matrix_t&timeint = params [13]; // 1/Sample-Rate - const Matrix_t&IMAGE_XYZ = params [14]; // Output Volume Size XYZ - const Matrix_t&IMAGE_SUM = params [15]; // Volume from previous Call - - const Matrix_t&BlockDim = params [16]; // Block Dimension to use for GPU - const Matrix_t&GPUs = params [17]; // Welche GPUs sollen genutzt werden? - - const Matrix_t&dbgMode = params [18]; // DebugMode and DebugMode-Parameter - - - // check data types and assign variables... // Ueberpruefe die Datentypen und lege Variablen fest - - //================================================================================================================ Check Dimensions of input parameters - #ifdef debug_OutputParameter - printf( "\n"); - printf( "In[n] Meaning [Row N x Col M] \n"); - printf( "=================================================================================================\n"); - #endif - - - //====================================================================== 1.Input Parameter - Check AScan - AScan_Nx = GetDimensions(AScan)[0]; // Reihen N ermitteln - AScan_Mx = GetDimensions(AScan)[1]; // Spalten M ermitteln - - aScanCount = AScan_Mx; - aScanLength = AScan_Nx; - - - #ifdef debug_OutputParameter - printf( "prhs[0] Ascan-Data [%ix%i]\n", AScan_Nx , AScan_Mx); - #endif - - //printf( "mxGetNumberOfDimensions(AScan)=%i\n", mxGetNumberOfDimensions(AScan)); - if ((aScanCount > 65535)) // new 2019: increasing the limit of the A-Scan block size. however this is limited by the datatype of unsigned short which is used for a pointer. - { - printf( " -> AScanBlock size = %i\n", aScanCount); - printf( "AScanBlock size might be too large (=> 2^16)!!!"); - } - if ((aScanCount > 1)) - #ifdef debug_OutputParameter - printf( " -> Blockmode with [%i x %i]\n", AScan_Nx, aScanCount); - #endif - // if(!(mxIsSingle(AScan))) - // printf("AScans must be Single"); - - aScan_ptr = (float*)GetPr(AScan); - - #ifdef debug_OutputVariables - // Ausgabe einzelner AScan-Samples der übergabewerte - //int aScanSampleCountPerReceiver=3000; - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",0 , *(aScan_ptr+0), *(aScan_ptr+1), *(aScan_ptr+2)); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",0 , *(aScan_ptr+2997), *(aScan_ptr+2998), *(aScan_ptr+2999)); - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",1 , aScan_ptr[3000], aScan_ptr[3001], aScan_ptr[3002]); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",1 , aScan_ptr[5997], aScan_ptr[5998], aScan_ptr[5999]); - //printf( " -> AScan (%3i) [ 0: 2] = [%f %f %f]\n",156 , aScan_ptr[0+(156*aScanSampleCountPerReceiver)], aScan_ptr[1+(156*aScanSampleCountPerReceiver)], aScan_ptr[2+(156*aScanSampleCountPerReceiver)]); - //printf( " -> AScan (%3i) [2997:2999] = [%f %f %f]\n",156 , aScan_ptr[2997+(156*aScanSampleCountPerReceiver)], aScan_ptr[2998+(156*aScanSampleCountPerReceiver)], aScan_ptr[2999+(156*aScanSampleCountPerReceiver)]); - #endif - - //====================================================================== 2.Input Parameter - Check IMAGE_STARTPOINT_S / pix_vect - pix_vect_Nx = GetDimensions(pix_vect)[0]; // Reihen N ermitteln - pix_vect_Mx = GetDimensions(pix_vect)[1]; // Spalten M ermitteln - - regionOfInterestOffset.x = *((float*)GetPr(pix_vect)); - regionOfInterestOffset.y = *((float*)GetPr(pix_vect)+1); - regionOfInterestOffset.z = *((float*)GetPr(pix_vect)+2); - - #ifdef debug_OutputParameter - printf( "prhs[1] IMAGE_STARTPOINT_S [%ix%i] = [%f x %f x %f]\n", pix_vect_Nx , pix_vect_Mx, regionOfInterestOffset.x, regionOfInterestOffset.y, regionOfInterestOffset.z); - #endif - - if (!(pix_vect_Nx == 1)||!(pix_vect_Mx == 3)) - printf(" -> Dimension of IMAGE_STARTPOINT_S must be [1 x 3]"); - if ((pix_vect_Nx > 1)) - printf( " -> No Blockmode [%i x 3] allowed for IMAGE_STARTPOINT_S\n", pix_vect_Nx); - // if(!(mxIsSingle(pix_vect))) - // printf(" -> IMAGE_STARTPOINT_S must be Single"); - - //====================================================================== 3.Input Parameter - Check Receiver Index - receiver_index_Nx = GetDimensions(receiver_index)[0]; // Reihen N ermitteln - receiver_index_Mx = GetDimensions(receiver_index)[1]; // Spalten M ermitteln - - #ifdef debug_OutputParameter - printf( "prhs[2] receiver_index [%ix%i]\n", receiver_index_Nx , receiver_index_Mx); - #endif - - if (!(receiver_index_Nx == 1)) - printf(" -> Dimension of receiver_index must be [1 x M]"); - if (!(receiver_index_Mx == aScanCount)){ - printf (" -> aScanCount(%i)!= M(%i)\n", aScanCount, receiver_index_Mx); - printf(" -> Dimension of receiver_index has different size as Ascan-Data\n"); - } - // if (!(receiver_index_Mx == 1)) - #ifdef debug_OutputParameter - printf( " -> Blockmode with [1 x %i]\n", receiver_index_Mx); - #endif - // if(!(mxIsUint16(receiver_index))) - // printf(" -> receiver_index must be Uint16"); - - - - // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten - unsigned short *receiver_index_ptr; - receiver_index_ptr = (unsigned short*)GetPr(receiver_index); - #ifdef debug_OutputVariables - - if ((receiver_index_Mx > 1)) - { - printf( " -> receiver_index: %i = [%i]\n",1 , *(receiver_index_ptr+0)); - printf( " -> receiver_index: %i = [%i]\n",2 , *(receiver_index_ptr+1)); - printf( " -> receiver_index: %i = [%i]\n",3 , *(receiver_index_ptr+2)); - } - else - printf( " -> receiver_index: %i = [%i]\n",1 , *(receiver_index_ptr+0)); - #endif - - //====================================================================== 4.Input Parameter - Check Emitter Index - emitter_index_Nx = GetDimensions(emitter_index)[0]; // Reihen N ermitteln - emitter_index_Mx = GetDimensions(emitter_index)[1]; // Spalten M ermitteln - - #ifdef debug_OutputParameter - printf( "prhs[3] emitter_index [%ix%i]\n", emitter_index_Nx , emitter_index_Mx); - #endif - - if (!(emitter_index_Nx == 1)) - printf(" -> Dimension of emitter_index must be [1 x M]"); - if (!(emitter_index_Mx == aScanCount)){ - printf (" -> aScanCount(%i)!= M(%i)\n", aScanCount, emitter_index_Mx); - printf(" -> Dimension of emitter_index has different size as Ascan-Data\n"); - } - // if (!(emitter_index_Mx == 1)) - #ifdef debug_OutputParameter - printf( " -> Blockmode with [1 x %i]\n", emitter_index_Mx); - #endif - // if(!(mxIsUint16(emitter_index))) - // printf(" -> emitter_index must be Uint16"); - - // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten - unsigned short * emitter_index_ptr = (unsigned short*)GetPr(emitter_index); - - #ifdef debug_OutputVariables - - if ((receiver_index_Mx > 1)) - { - printf( " -> emitter_index: %i = [%i]\n",1 , *(emitter_index_ptr+0)); - printf( " -> emitter_index: %i = [%i]\n",2 , *(emitter_index_ptr+1)); - printf( " -> emitter_index: %i = [%i]\n",3 , *(emitter_index_ptr+2)); - } - else - printf( " -> emitter_index: %i = [%i]\n",1 , *(emitter_index_ptr+0)); - #endif - - - - //====================================================================== 5.Input Parameter - Check receiver_list - receiver_list_Nx = GetDimensions(receiver_list)[0]; // Reihen N ermitteln - receiver_list_Mx = GetDimensions(receiver_list)[1]; // Spalten M ermitteln - - #ifdef debug_OutputParameter - printf( "prhs[4] receiver_list [%ix%i]\n", receiver_list_Nx , receiver_list_Mx); - #endif - - if (!(receiver_list_Nx == 3)) - printf(" -> Dimension of receiver_list must be [3 x M]"); - // if(!(mxIsSingle(receiver_list))) - // printf(" -> receiver_list must be Single"); - - // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten - float *receiver_list_ptr; - receiver_list_ptr = (float*)GetPr(receiver_list); - - #ifdef debug_OutputVariables - - if ((receiver_list_Mx > 1)) - { - printf( " -> receiver_list-Position: %i = [%f %f %f]\n",1 , *(receiver_list_ptr+0), *(receiver_list_ptr+1), *(receiver_list_ptr+2)); - printf( " -> receiver_list-Position: %i = [%f %f %f]\n",2 , *(receiver_list_ptr+3), *(receiver_list_ptr+4), *(receiver_list_ptr+5)); - } - else - printf( " -> receiver_list-Position: %i = [%f %f %f]\n",1 , *(receiver_list_ptr+0), *(receiver_list_ptr+1), *(receiver_list_ptr+2)); - - #endif - - - - //====================================================================== 6.Input Parameter - Check emitter_list - emitter_list_Nx = GetDimensions(emitter_list)[0]; // Reihen N ermitteln - emitter_list_Mx = GetDimensions(emitter_list)[1]; // Spalten M ermitteln // emitter_list gibt die maximale Anzahl an Emittern die in diesem Block vorkommen können wieder! - - #ifdef debug_OutputParameter - printf( "prhs[5] emitter_list [%ix%i]\n", emitter_list_Nx , emitter_list_Mx); - #endif - - if (!(emitter_list_Nx == 3)) - printf(" -> Dimension of emitter_list must be [3 x M]"); - // if(!(mxIsSingle(receiver_list))) - // printf(" -> emitter_list must be Single"); - - // Ausgabe einzelner Geometriedaten der übergabewerte mit verschiedenen Varianten - float *emitter_list_ptr; - emitter_list_ptr = (float*)GetPr(emitter_list); - - #ifdef debug_OutputVariables - - - if ((emitter_list_Mx > 1)) - { - printf( " -> emitter_list-Position: %i = [%f %f %f]\n",1 , *(emitter_list_ptr+0), *(emitter_list_ptr+1), *(emitter_list_ptr+2)); - printf( " -> emitter_list-Position: %i = [%f %f %f]\n",2 , *(emitter_list_ptr+3), *(emitter_list_ptr+4), *(emitter_list_ptr+5)); - } - else - printf( " -> emitter_list-Position: %i = [%f %f %f]\n",1 , *(emitter_list_ptr+0), *(emitter_list_ptr+1), *(emitter_list_ptr+2)); - - #endif - - - - //====================================================================== 7.Input Parameter - Check SAFT_mode - SAFT_mode_Nx = GetDimensions(SAFT_mode)[0]; // Reihen N ermitteln - SAFT_mode_Mx = GetDimensions(SAFT_mode)[1]; // Spalten M ermitteln - - SAFT_MODE = *((int*)GetPr(SAFT_mode)); - - - #ifdef debug_OutputParameter - printf( "prhs[6] SAFT_MODE [%ix%i] = [%i]\n", SAFT_mode_Nx , SAFT_mode_Mx, SAFT_MODE); - #endif - - if (!(SAFT_mode_Nx == 1)) - printf(" -> Dimension of SAFT_MODE must be [1 x 1]"); - // if(!(mxIsUint32(SAFT_mode))) - // printf(" -> SAFT_MODE must be Uint32"); - - #ifdef debug_OutputParameter - printf ( "\e[7;37m ======================================================================================== \e[0m\n"); - printf ( "\e[7;37m Used SAFT (GPU-Version):\e[0m"); - #endif - switch (SAFT_MODE) - { - case 0: - SOSMode_3DVolume = false; ATTMode_3DVolume = false; - //printf ( "\e[7;37m Standard SAFT without correction (-SOS -ATT) (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - printf (" -> AscanIndexVersion only make sense with SOS or SOS and ATT Volume => exit"); - break; - case 1: - SOSMode_3DVolume = true; ATTMode_3DVolume = false; - //printf ( "\e[7;37m + Speed of sound correction - Attenuation correction (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - break; - case 2: - SOSMode_3DVolume = true; ATTMode_3DVolume = true; - //printf ( "\e[7;37m + Speed of sound correction + Attenuation correction (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - break; - case 3: - //SOSMode_3DVolume = true; ATTMode_3DVolume = true; - //printf ( "\e[7;37m SAFT_MODE = 3 \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - printf(" -> not implemented => exit"); - break; - case 4: - //SOSMode_3DVolume = false; ATTMode_3DVolume = false; - //printf ( "\e[7;37m SAFT_MODE = 4 \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - printf(" -> not implemented => exit"); - break; - default: SOSMode_3DVolume = false; ATTMode_3DVolume = false; - //printf ( " -> SAFT_MODE %i is out of range [0..3] => use Standard SAFT\n", SAFT_MODE); - //printf ( "\e[7;37m Standard SAFT without correction (-SOS -ATT) (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); - break; - } - - #ifdef debug_OutputParameter - printf ( "\n\e[7;37m ======================================================================================== \e[0m\n"); - #else - //printf ( "\n"); - #endif + const Matrix_t &speed = params[8]; // Speed of Sound Data (Single, SoS-Grid) + const Matrix_t &sos_startPoint = params[9]; // Startpoint of Speed of Sound Grid + const Matrix_t &sos_res = params[10]; // SoS Grid Resolution + const Matrix_t &attVolume = params[11]; // Attenuation Data (Single, SoS-Grid) - //====================================================================== 8.Input Parameter - Check SAFT_variant - SAFT_variant_Nx = GetDimensions(SAFT_variant)[0]; // Reihen N ermitteln - SAFT_variant_Mx = GetDimensions(SAFT_variant)[1]; // Spalten M ermitteln + const Matrix_t &res = params[12]; // Output Volume Resolution + const Matrix_t &timeint = params[13]; // 1/Sample-Rate + const Matrix_t &IMAGE_XYZ = params[14]; // Output Volume Size XYZ + const Matrix_t &IMAGE_SUM = params[15]; // Volume from previous Call - SAFT_VARIANT = (int*)GetPr(SAFT_variant); - SAFT_VARIANT_Size = SAFT_variant_Mx; - - #ifdef debug_OutputParameter - //printf( "prhs[7] SAFT_VARIANT [%ix%i] = [%i %i %i %i %i %i]\n", SAFT_variant_Nx , SAFT_variant_Mx, SAFT_VARIANT[0], SAFT_VARIANT[1], SAFT_VARIANT[2], SAFT_VARIANT[3], SAFT_VARIANT[4], SAFT_VARIANT[5]); - printf(" -> Ascan Preintegration = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_AscanPreintegration]); - printf(" -> Ascan Interpolation = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_AscanInterpolation]); - printf(" -> Preprocessing SOS&ATT 3D Volume Interpolation = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing]); - printf(" -> Reconstruction SOS&ATT 3D Volume Interpolation = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]); - printf(" -> Standard deviation (STD) not yet = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_CalcStandardDeviation]); - printf(" -> Sum up over boarder Indices not yet = [%i]\n", SAFT_VARIANT[SAFT_VARIANT_SumUpOverBoarderIndices]); - #endif - - // if(!(mxIsUint32(SAFT_variant))) - // printf(" -> SAFT_VARIANT must be Uint32"); - if (!(SAFT_variant_Nx == 1)||!(SAFT_variant_Mx == 6)) - printf(" -> Dimension of SAFT_VARIANT must be [1 x 6]"); - - - - - - - //====================================================================== 9.Input Parameter - Check for SOS volume - speed_Nx = GetDimensions(speed)[0]; // Reihen N ermitteln - speed_Mx = GetDimensions(speed)[1]; // Spalten M ermitteln - float Sos = *((float*)GetPr(speed)); - - float *speed_vec_ptr; - speed_vec_ptr = (float*)GetPr(speed); // Pointer für SoSDaten ermitteln - - // if(!(mxIsSingle(speed))) - // printf(" -> SOSVolume must be Single"); - - - if (SOSMode_3DVolume == true) // SOS correction need 3D Volume - { - if (GetNumberOfDimensions(speed) == 3){ - #ifdef debug_OutputParameter - printf( "prhs[8] SOSVolume [%ix%ix%i]\n", mxGetDimensions(speed)[0] , mxGetDimensions(speed)[1], mxGetDimensions(speed)[2]); - printf( " -> use SoS-Mode with SoS-Correction per Path with SOS 3D Volume\n"); - #endif - - if (!( (speed_Nx > 1)&&(speed_Mx > 1) )){ - printf( " -> SOSGrid_XYZ.x and SOSGrid_XYZ.y must be > 1 for SOS Correction with 3D Volume!!!"); - } - } - else if (GetNumberOfDimensions(speed) == 2){ - printf ( "prhs[8] SOSVolume [%ix%i]\n",(int)GetDimensions(speed)[0] , (int)GetDimensions(speed)[1]); - printf( " -> SOSVolume is not a 3D Volume as expected!"); - } - - SOSGrid_Xx = GetDimensions(speed)[0]; // SOSGrid_X ermitteln - SOSGrid_Yx = GetDimensions(speed)[1]; // SOSGrid_Y ermitteln - SOSGrid_Zx = GetDimensions(speed)[2]; // SOSGrid_Z ermitteln - - SOSGrid_XYZ.x = SOSGrid_Xx; - SOSGrid_XYZ.y = SOSGrid_Yx; - SOSGrid_XYZ.z = SOSGrid_Zx; - - if ((SOSGrid_XYZ.x > 128)||(SOSGrid_XYZ.y > 128)|| (SOSGrid_XYZ.z > 128)){ - printf ( " -> SOSGrid_XYZ [%i x %i x %i]\n", (int)SOSGrid_Xx, (int)SOSGrid_Yx, (int)SOSGrid_Zx); - printf ( " Warning -> SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z > 128!!! --> can be problematic due to memory requirement\n"); - } - - #ifdef debug_OutputVariables - printf(" -> SoS-Data[1-3] = [%f %f %f]\n",speed_vec_ptr[0], speed_vec_ptr[1], speed_vec_ptr[2]); - //int speedOfSoundFieldIndex = (currentVoxel.z * SOSGrid_XYZ.y + currentVoxel.y) * SOSGrid_XYZ.x + currentVoxel.x; - int speedOfSoundFieldIndex = (0 * SOSGrid_XYZ.y + 1) * SOSGrid_XYZ.x + 9; - printf(" -> SoS-Data[%i-%i] = [%f %f %f]\n",speedOfSoundFieldIndex, (speedOfSoundFieldIndex+2), speed_vec_ptr[speedOfSoundFieldIndex + 0], speed_vec_ptr[speedOfSoundFieldIndex + 1], speed_vec_ptr[speedOfSoundFieldIndex + 2]); - speedOfSoundFieldIndex = (4 * SOSGrid_XYZ.y + 37) * SOSGrid_XYZ.x + 9; - printf(" -> SoS-Data[%i-%i] = [%f %f %f]\n",speedOfSoundFieldIndex, (speedOfSoundFieldIndex+2), speed_vec_ptr[speedOfSoundFieldIndex + 0], speed_vec_ptr[speedOfSoundFieldIndex + 1], speed_vec_ptr[speedOfSoundFieldIndex + 2]); - - int SOSGrid_END = (SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z); - printf(" -> SoS-Data[%i-%i] = [%f %f %f]\n",(SOSGrid_END-3), (SOSGrid_END-1), speed_vec_ptr[SOSGrid_END-3], speed_vec_ptr[SOSGrid_END-2], speed_vec_ptr[SOSGrid_END-1]); - #endif - - } - - - - - //====================================================================== 10.Input Parameter - Check SoS Startpoint - sos_startPoint_Nx = GetDimensions(sos_startPoint)[0]; // Reihen N ermitteln - sos_startPoint_Mx = GetDimensions(sos_startPoint)[1]; // Spalten M ermitteln - - sosOffset.x = *((float*)GetPr(sos_startPoint)); - sosOffset.y = *((float*)GetPr(sos_startPoint)+1); - sosOffset.z = *((float*)GetPr(sos_startPoint)+2); - - #ifdef debug_OutputParameter - printf( "prhs[9] SOS_STARTPOINT_S [%ix%i] = [%f x %f x %f]\n", sos_startPoint_Nx , sos_startPoint_Mx, sosOffset.x, sosOffset.y, sosOffset.z); - #endif - - - if (!(sos_startPoint_Nx == 1)||!(sos_startPoint_Mx == 3)) - printf(" -> Dimension of SOS_STARTPOINT_S must be [1 x 3]"); - if ((sos_startPoint_Nx > 1)) - printf( " -> No Blockmode [%i x 3] allowed for SOS_STARTPOINT_S\n", sos_startPoint_Nx); - // if(!(mxIsSingle(sos_startPoint))) - // printf(" -> SOS_STARTPOINT_S must be Single"); - - //====================================================================== 11.Input Parameter - Check SoS_RESOLUTION / sos_res - if (SOSMode_3DVolume == true){ - sos_res_Nx = GetDimensions(sos_res)[0]; // Reihen N ermitteln - sos_res_Mx = GetDimensions(sos_res)[1]; // Spalten M ermitteln - - SOS_RESOLUTION = *((float*)GetPr(sos_res)); - - #ifdef debug_OutputParameter - printf( "prhs[10] SOS_RESOLUTION_S [%ix%i] = [%f]\n", sos_res_Nx , sos_res_Mx, SOS_RESOLUTION); - #endif - - - if (!(sos_res_Nx == 1)) - printf(" -> Dimension of SOS_RESOLUTION_S must be [1 x 1]"); - if ((sos_res_Mx > 1)) - printf( " -> No Blockmode allowed for SOS_RESOLUTION_S! [1 x %i]\n", sos_res_Mx); - // if(!(mxIsSingle(sos_res))) - // printf(" -> SOS_RESOLUTION_S must be Single"); - } - - //====================================================================== 12.Input Parameter - Check for ATTVolume / Attenuation-Data - - attVolume_Nx = GetDimensions(attVolume)[0]; // Reihen N ermitteln - attVolume_Mx = GetDimensions(attVolume)[1]; // Spalten M ermitteln - - float *att_vec_ptr; - att_vec_ptr = (float*)GetPr(attVolume); // Pointer für ATT-Daten ermitteln - - // if(!(mxIsSingle(attVolume))) - // printf(" -> attVolume must be Single"); - - - if (GetNumberOfDimensions(attVolume) == 3){ - #ifdef debug_OutputParameter - printf( "prhs[11] attVolume [%ix%ix%i]\n", mxGetDimensions(attVolume)[0] , mxGetDimensions(attVolume)[1], mxGetDimensions(attVolume)[2]); - printf( " -> use ATT-Mode with ATT-Correction per Path with ATT 3D Volume\n"); - #endif - - if (!( (attVolume_Nx > 1)&&(attVolume_Mx > 1) )){ - printf( " -> ATTGrid_XYZ.x and ATTGrid_XYZ.y must be > 1 for ATT Correction with 3D Volume!!!"); - } - } - else if (GetNumberOfDimensions(attVolume) == 2){ - #ifdef debug_OutputParameter - printf ( "prhs[11] attVolume [%ix%i]\n", mxGetDimensions(attVolume)[0] , mxGetDimensions(attVolume)[1]); - printf( " -> attVolume is not a 3D Volume as expected!"); - #endif - } - - if ((SOSMode_3DVolume == true)&&(ATTMode_3DVolume == true)){ // 3D Volume muss bei SOS und ATT angegeben sein damit ATT Korrektur durchgefuehrt werden kann - ATTGrid_Xx = GetDimensions(attVolume)[0]; // ATTGrid_X ermitteln - ATTGrid_Yx = GetDimensions(attVolume)[1]; // ATTGrid_Y ermitteln - ATTGrid_Zx = GetDimensions(attVolume)[2]; // ATTGrid_Z ermitteln - - ATTGrid_XYZ.x = ATTGrid_Xx; - ATTGrid_XYZ.y = ATTGrid_Yx; - ATTGrid_XYZ.z = ATTGrid_Zx; - - if ((ATTGrid_XYZ.x > 128) || (ATTGrid_XYZ.y > 128) || (ATTGrid_XYZ.z > 128)){ - printf ( " -> ATTGrid_XYZ [%i x %i x %i]\n", ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z); - printf ( " Warning -> ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z > 128!!! --> can be problematic due to memory requirement\n"); - } - - if ((ATTGrid_XYZ.x != SOSGrid_XYZ.x) || (ATTGrid_XYZ.y != SOSGrid_XYZ.y) || (ATTGrid_XYZ.z != SOSGrid_XYZ.z)){ // Restriction: Volume parameter of ATT & SOS must be the same - printf( " -> ATTGrid[%i %i %i] != SOSGrid[%i %i %i]!\n", ATTGrid_XYZ.x , ATTGrid_XYZ.y, ATTGrid_XYZ.z, SOSGrid_XYZ.x , SOSGrid_XYZ.y, SOSGrid_XYZ.z); - printf(" -> ATTGrid must have the same size as SOSGrid \n"); - } - - #ifdef debug_OutputVariables - printf(" -> ATT-Data[1-3] = [%f %f %f]\n", att_vec_ptr[0], att_vec_ptr[1], att_vec_ptr[2]); - int ATTFieldIndex = (14 * ATTGrid_XYZ.y + 30 ) * ATTGrid_XYZ.x + 16; - printf(" -> ATT-Data[%i-%i] = [%f %f %f]\n", ATTFieldIndex, (ATTFieldIndex+2), att_vec_ptr[ATTFieldIndex + 0], att_vec_ptr[ATTFieldIndex + 1], att_vec_ptr[ATTFieldIndex + 2]); - ATTFieldIndex = (15 * ATTGrid_XYZ.y + 32 ) * ATTGrid_XYZ.x + 32; - printf(" -> ATT-Data[%i-%i] = [%f %f %f]\n", ATTFieldIndex, (ATTFieldIndex+2), att_vec_ptr[ATTFieldIndex + 0], att_vec_ptr[ATTFieldIndex + 1], att_vec_ptr[ATTFieldIndex + 2]); - - int ATTGrid_END = (ATTGrid_XYZ.x * ATTGrid_XYZ.y * ATTGrid_XYZ.z); - printf(" -> ATT-Data[%i-%i] = [%f %f %f]\n", (ATTGrid_END-3), (ATTGrid_END-1), att_vec_ptr[ATTGrid_END-3], att_vec_ptr[ATTGrid_END-2], att_vec_ptr[ATTGrid_END-1]); - #endif - - } - else{ - //#ifdef debug_OutputVariables - printf( " -> ATTMode_3DVolume == false => skip ATTGrid\n"); - //#endif - - ATTGrid_Xx = 0; // ATTGrid_X ermitteln - ATTGrid_Yx = 0; // ATTGrid_Y ermitteln - ATTGrid_Zx = 0; // ATTGrid_Z ermitteln - - ATTGrid_XYZ.x = ATTGrid_Xx; - ATTGrid_XYZ.y = ATTGrid_Yx; - ATTGrid_XYZ.z = ATTGrid_Zx; - } - - - - - - - //====================================================================== 13.Input Parameter - Check IMAGE_RESOLUTION_S / res - res_Nx = GetDimensions(res)[0]; // Reihen N ermitteln - res_Mx = GetDimensions(res)[1]; // Spalten M ermitteln - - IMAGE_RESOLUTION = *((float*)GetPr(res)); - - #ifdef debug_OutputParameter - printf( "prhs[12] IMAGE_RESOLUTION_S [%ix%i] = [%f]\n", res_Nx , res_Mx, IMAGE_RESOLUTION); - #endif - - if (!(res_Nx == 1)) - printf(" -> Dimension of IMAGE_RESOLUTION must be [1 x 1]"); - if ((res_Mx > 1)) - printf( " -> No Blockmode allowed for IMAGE_RESOLUTION! [1 x %i]\n", res_Mx); - // if(!(mxIsSingle(res))) - // printf(" -> IMAGE_RESOLUTION must be Single"); - - if (SOSMode_3DVolume == true){ - if(IMAGE_RESOLUTION > SOS_RESOLUTION){ - printf( " -> IMAGE_RESOLUTION (%f) > SOS_RESOLUTION (%f)\n", IMAGE_RESOLUTION, SOS_RESOLUTION); - printf(" -> IMAGE_RESOLUTION must not > SOS_RESOLUTION !!!"); - } - } - - - //====================================================================== 14.Input Parameter - Check TimeInterval_S / Timeint - timeint_Nx = GetDimensions(timeint)[0]; // Reihen N ermitteln - timeint_Mx = GetDimensions(timeint)[1]; // Spalten M ermitteln - - sampleRate = *((float*)GetPr(timeint)); - - #ifdef debug_OutputParameter - printf( "prhs[13] TimeInterval_S [%ix%i] = [%e]\n", timeint_Nx , timeint_Mx, sampleRate); - #endif - - if (!(timeint_Nx == 1)) - printf(" -> Dimension of TimeInterval_S must be [1 x 1]"); - if ((timeint_Mx > 1)) - printf( " -> No Blockmode allowed for TimeInterval_S! [1 x %i]\n", timeint_Mx); - // if(!(mxIsSingle(timeint))) - // printf(" -> TimeInterval_S must be Single"); - - //====================================================================== 15.Input Parameter - Check IMAGE_XYZ_UI32 / IMAGE_XYZ - IMAGE_XYZ_Nx = GetDimensions(IMAGE_XYZ)[0]; // Reihen N ermitteln - IMAGE_XYZ_Mx = GetDimensions(IMAGE_XYZ)[1]; // Spalten M ermitteln - - IMAGE_SIZE_XYZ.x = *((int*)GetPr(IMAGE_XYZ)); - IMAGE_SIZE_XYZ.y = *((int*)GetPr(IMAGE_XYZ)+1); - IMAGE_SIZE_XYZ.z = *((int*)GetPr(IMAGE_XYZ)+2); - - #ifdef debug_OutputParameter - printf( "prhs[14] IMAGE_XYZ [%ix%i] = [%ix%ix%i]\n", IMAGE_XYZ_Nx , IMAGE_XYZ_Mx,IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z); - #endif - - if (!(IMAGE_XYZ_Nx == 1)||!(IMAGE_XYZ_Mx == 3)) - printf(" -> Dimension of IMAGE_XYZ must be [1 x 3]"); - if ((IMAGE_XYZ_Nx > 1)) - printf( " -> No Blockmode allowed for IMAGE_XYZ! [%i x 3]\n", IMAGE_XYZ_Nx); - // if(!(mxIsUint32(IMAGE_XYZ))) - // printf(" -> IMAGE_XYZ must be UINT32"); - - if ((IMAGE_SIZE_XYZ.x > 8192)||(IMAGE_SIZE_XYZ.y > 8192)) // Aufteilung in BlockDim 512,1,1 passt für 5632x5632. Es würde etwas weiter gehen aber dann muss Y kleiner sein. - printf(" -> IMAGE_XYZ must not > [8192 x 8192 x N]!!!"); - - //====================================================================== 16.Input Parameter - Check Env / IMAGE_SUM - IMAGE_SUM_Xx = GetDimensions(IMAGE_SUM)[0]; // Spalten M ermitteln X - IMAGE_SUM_Yx = GetDimensions(IMAGE_SUM)[1]; // Reihen N ermitteln Y - - if (GetNumberOfDimensions(IMAGE_SUM) > 2) - IMAGE_SUM_Zx = GetDimensions(IMAGE_SUM)[2]; // Z-Schichten ermitteln Z - else if (GetNumberOfDimensions(IMAGE_SUM) == 2) - IMAGE_SUM_Zx = 1; // Z-Schichten = 1 - else - { - printf( " -> mxGetNumberOfDimensions of IMAGE_SUM = %i\n", (int)GetNumberOfDimensions(IMAGE_SUM)); - printf(" -> Dimension of IMAGE_SUM must be 3: [X x Y x Z]"); - } - - #ifdef debug_OutputParameter - printf( "prhs[15] IMAGE_SUM [%ix%ix%i]\n", IMAGE_SUM_Xx , IMAGE_SUM_Yx, IMAGE_SUM_Zx); - #endif - - // if(!(mxIsDouble(IMAGE_SUM))) - // printf(" -> IMAGE_SUM must be Double"); - // if(!(mxGetNumberOfElements(IMAGE_SUM) == ((size_t)IMAGE_SIZE_XYZ.x * (size_t)IMAGE_SIZE_XYZ.y * (size_t)IMAGE_SIZE_XYZ.z))) - // { - // printf( " -> IMAGE_SUM and the Number of Voxels don't match: %lld = [%lldx%lldx%lld]\n",(int)GetNumberOfElements(IMAGE_SUM), IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z); - // printf(" -> Make sure that they have the same size"); - // } - - uint64_t IMAGE_SUM_Count = GetNumberOfElements(IMAGE_SUM); - float *IMAGE_SUM_vec_ptr = (float*)GetPr(IMAGE_SUM); - - #ifdef debug_OutputVariables - printf( " -> IMAGE_SUM: %i = [%f %f %f]\n",0 , IMAGE_SUM_vec_ptr[0], IMAGE_SUM_vec_ptr[1], IMAGE_SUM_vec_ptr[2]); - printf( " -> IMAGE_SUM: %i = [%f %f %f]\n",1 , IMAGE_SUM_vec_ptr[3], IMAGE_SUM_vec_ptr[4], IMAGE_SUM_vec_ptr[5]); - printf( " -> IMAGE_SUM: %i = [%f %f %f]\n",2 , IMAGE_SUM_vec_ptr[6], IMAGE_SUM_vec_ptr[7], IMAGE_SUM_vec_ptr[8]); - printf( " -> IMAGE_SUM: %lld = [%f %f %f]\n",(IMAGE_SUM_Count-3) , IMAGE_SUM_vec_ptr[IMAGE_SUM_Count - 3], IMAGE_SUM_vec_ptr[IMAGE_SUM_Count - 2], IMAGE_SUM_vec_ptr[IMAGE_SUM_Count -1]); - #endif - - //====================================================================== 17.Input Parameter - Check BlockDimension for GPU - - BlockDim_XYZ_Nx = GetDimensions(BlockDim)[0]; // Reihen N ermitteln - BlockDim_XYZ_Mx = GetDimensions(BlockDim)[1]; // Spalten M ermitteln - - BlockDim_XYZ.x = *((int*)GetPr(BlockDim)); - BlockDim_XYZ.y = *((int*)GetPr(BlockDim)+1); - BlockDim_XYZ.z = *((int*)GetPr(BlockDim)+2); - - #ifdef debug_OutputParameter - printf( "prhs[16] BlockDim_XYZ (GPU) [%ix%i] = [%ix%ix%i]\n", BlockDim_XYZ_Nx , BlockDim_XYZ_Mx, BlockDim_XYZ.x, BlockDim_XYZ.y, BlockDim_XYZ.z); - #endif - - if (!(BlockDim_XYZ_Nx == 1)||!(BlockDim_XYZ_Mx == 3)) - printf(" -> Dimension of BlockDim_XYZ must be [1 x 3]"); - if ((BlockDim_XYZ_Nx > 1)) - printf( " -> No Blockmode! [%i x 3]\n", (int)BlockDim_XYZ_Nx); - // if(!(mxIsUint32(BlockDim))) - // printf(" -> BlockDim_XYZ must be UINT32"); - - if ((BlockDim_XYZ.x * BlockDim_XYZ.y * BlockDim_XYZ.z) > 1024){ // BlockSize limited to 1024. Perhaps newer GPUs will support more Threads per Block - printf( " -> BlockDim_XYZ.x * BlockDim_XYZ.y * BlockDim_XYZ.z must not > 1024!!!"); - - // Here Adaption for BlockSize can be done. - BlockDim_XYZ.x = 1024; // If Blockdimensions are not specified than standard Blockdimensions will be used. - BlockDim_XYZ.x = 1; - BlockDim_XYZ.x = 1; - - printf( " -> Standard Size for BlockDim_XYZ is used [%ix%ix%i]\n", BlockDim_XYZ.x, BlockDim_XYZ.y, BlockDim_XYZ.z); - } - - - //====================================================================== 18.Input Parameter - Check GPUs - - int *enableGPUs_ptr; - - GPUs_Nx = GetDimensions(GPUs)[0]; // Reihen N ermitteln - GPUs_Mx = GetDimensions(GPUs)[1]; // Spalten M ermitteln - - #ifdef debug_OutputParameter - printf( "prhs[17] GPUs [%ix%i] \n", GPUs_Nx , GPUs_Mx); - #endif - - enableGPUs_ptr = (int*)GetPr(GPUs); - selectedNumberGPUs = GPUs_Mx; - - - //Determine Number of GPU-Devices and check if there are so many available - int num_devices = 0; - //printf( " -> cudaGetDeviceCount: %i\n", num_devices); - CUDA_CHECK(cudaGetDeviceCount(&num_devices)); - - #ifdef debug_OutputInfo - printf( " -> GPU-devices available: %i\n", num_devices); - #endif - - - if (selectedNumberGPUs <= num_devices) - { - #ifdef debug_OutputParameter - printf( " (selectedNumberGPUs(%i) <= num_devices(%i)) -> OK -> selectedNumberGPUs = %i!\n", selectedNumberGPUs, num_devices, selectedNumberGPUs); - #endif - } - else - { - printf( " !!! !!! selectedNumberGPUs(%i) > num_devices(%i) !!!! !!!! -> selectedNumberGPUs = num_devices = %i!\n", selectedNumberGPUs, num_devices, num_devices); - selectedNumberGPUs = num_devices; // Reduce number of selected to number of GPUs in PC system! - } - - - // Check passed GPU-ID-Numbers and amount of GPUs - #ifdef debug_OutputParameter - printf( "prhs[17] GPUs [%ix%i] = [", GPUs_Nx , selectedNumberGPUs); - #endif - int gpuNr = 0; - int gpuNrCheck = 0; - for (gpuNr = 0; gpuNr < selectedNumberGPUs; ++gpuNr){ - #ifdef debug_OutputParameter - printf( " %i ", enableGPUs_ptr[gpuNr]); - #endif - - if (enableGPUs_ptr[gpuNr] > (num_devices-1)){ // Check if more GPUs are selected then available in System - printf( "\n enableGPUs_ptr[gpuNr=%i] = %i !!!\n", gpuNr, enableGPUs_ptr[gpuNr]); - printf(" -> selected number of GPU > available Devices is not allowed!"); - } - - for (gpuNrCheck = 0; gpuNrCheck < gpuNr; ++gpuNrCheck){ - #ifdef debug_OutputParameter - printf( "\n enableGPUs_ptr[gpuNrCheck %i] = %i, enableGPUs_ptr[gpuNr %i] = %i\n",gpuNrCheck, enableGPUs_ptr[gpuNrCheck], gpuNr, enableGPUs_ptr[gpuNr]); - #endif - if (enableGPUs_ptr[gpuNrCheck] == enableGPUs_ptr[gpuNr]) - printf(" -> GPU Device can only be used once!!!"); - } - - } - #ifdef debug_OutputParameter - printf( "]; Number of GPUs to use = %i\n", selectedNumberGPUs); - #endif - - if (!(GPUs_Nx == 1)||!(GPUs_Mx < 10)) - printf(" -> Dimension of GPUs must be [1 x <10]"); - if ((pix_vect_Nx > 1)) - printf( " -> No Blockmode [%i x n] allowed for GPUs\n", GPUs_Nx); - // if(!(mxIsUint32(GPUs))) - // printf(" -> GPUs must be UINT32"); - - - - //====================================================================== 19.Input Parameter - debugMode, debugModeParameter - dbgMode_Nx = GetDimensions(dbgMode)[0]; // Reihen N ermitteln - dbgMode_Mx = GetDimensions(dbgMode)[1]; // Spalten M ermitteln - - debugMode = *((float*)GetPr(dbgMode) ); - debugModeParameter = *((float*)GetPr(dbgMode)+1); - - #ifdef debug_OutputParameter - printf( "prhs[18] debugMode [%ix%i] = [%f %f]\n", dbgMode_Nx , dbgMode_Mx, debugMode, debugModeParameter); - #endif - - if ((dbgMode_Nx != 1)||(dbgMode_Mx != 2)) - printf(" -> Dimension of debugMode must be [1 x 2]\n"); - // if(!(mxIsSingle(dbgMode))) - // printf(" -> debugMode must be single"); - if(debugMode != 0.0) - printf (" -> debugMode = [%f], debugModeParameter = [%f]\n", debugMode, debugModeParameter); - - - - - - - #ifdef debug_OutputParameter - printf( "=================================================================================================\n"); // End of Inputparameter - #endif - - // // Enable colored Output // http://linuxgazette.net/issue65/padala.html - // printf ("\e[1;34mThis is a blue text.\e[0m"); // farbige Ausgabe mit - // printf ("\e[1;34m %-6s \e[m", "This is text"); // Farbige Text-Ausgabe - // printf ("\n"); - - - - - //================================================================================================================ Create Output Memory / Parameter - #ifdef debug_OutputParameter - printf( "\n"); - printf( "Create Outputparameter\n"); - printf( "Out[n] Meaning \n"); - printf( "=================================================================================================\n"); - #endif - - // ~~~~ Create 3D-Matrix for the Output-Values - // Output-Dimension is {IMAGE_XYZ_X, IMAGE_XYZ_Y, IMAGE_XYZ_Z} - const int dims[]={IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z}; - int ndim = 3; - - #ifdef debug_OutputParameter - printf( "plhs[0] = Output_Voxels = mxCreateNumericArray( ndim(%i), dims{%i %i %i}, mxDOUBLE_CLASS, mxREAL);\n", ndim, dims[0], dims[1], dims[2]); - #endif - - Matrix_t Output_Voxels; - Output_Voxels.NumberOfDims = ndim; - Output_Voxels.Dims[0] = dims[0]; - Output_Voxels.Dims[1] = dims[1]; - Output_Voxels.Dims[2] = dims[2]; - Output_Voxels.Data = new float[dims[0]*dims[1]*(dims[2]?dims[2]:1)]; - - double *Output_Voxels_ptr = new double[dims[0]*dims[1]*(dims[2]?dims[2]:1)];; - - - // ~~~~ Create Pointer to return value from Duration of Kernel - // Erstelle Array mit folgender Formatierung - // 0: Total Durationtime all GPUs - // 1: Total Durationtime GPU 1 - // 2: Total Durationtime GPU 2 - // n: Total Durationtime GPU n - - int - m = (1 + selectedNumberGPUs), - n = 1; - - #ifdef debug_OutputParameter - printf( "plhs[1] = Duration = mxCreateDoubleMatrix( m(%i), n(%i), mxREAL);\n", m, n); - #endif - - double *Duration_ptr = new double[m*n]; - - // ~~~~ Create Pointer to return Error/Abortvalue of each multithread - //int *Abort_ptr = (int*) malloc(num_workingPackages * sizeof(int)); - int *Abort_ptr = (int*) malloc(selectedNumberGPUs*sizeof(int)); - - - // ~~~~ Erstelle Array fuer Testrueckgabe der integrierten Ascans - //AScan_Nx = mxGetDimensions(AScan)[0]; // Reihen N ermitteln - //AScan_Mx = mxGetDimensions(AScan)[1]; // Spalten M ermitteln - //int - m = aScanCount; // 1 - n = AScan_Nx; //z.B. 3000 - #ifdef debug_OutputParameter - printf( "plhs[2] = Ascans = mxCreateNumericMatrix( m(%i), n(%i), mxREAL);\n", m, n); - #endif - - float *AscansOut_ptr = new float[m*n]; - - - #ifdef debug_OutputParameter - printf( "=================================================================================================\n\n"); // End of Outputparameter - #endif + const Matrix_t &BlockDim = params[16]; // Block Dimension to use for GPU + const Matrix_t &GPUs = params[17]; // Welche GPUs sollen genutzt werden? + + const Matrix_t &dbgMode = params[18]; // DebugMode and DebugMode-Parameter + + //====================================================================== 1.Input Parameter - Check AScan + AScan_Nx = GetDimensions(AScan)[0]; // Reihen N ermitteln + AScan_Mx = GetDimensions(AScan)[1]; // Spalten M ermitteln + + aScanCount = AScan_Mx; + aScanLength = AScan_Nx; + + // printf( "mxGetNumberOfDimensions(AScan)=%i\n", mxGetNumberOfDimensions(AScan)); + if ((aScanCount > 65535)) // new 2019: increasing the limit of the A-Scan block size. however this is limited by the datatype of unsigned short which is used for a pointer. + { + printf(" -> AScanBlock size = %i\n", aScanCount); + printf("AScanBlock size might be too large (=> 2^16)!!!"); + } + + aScan_ptr = (float *)GetPr(AScan); + + //====================================================================== 2.Input Parameter - Check IMAGE_STARTPOINT_S / pix_vect + pix_vect_Nx = GetDimensions(pix_vect)[0]; // Reihen N ermitteln + pix_vect_Mx = GetDimensions(pix_vect)[1]; // Spalten M ermitteln + + regionOfInterestOffset.x = *((float *)GetPr(pix_vect)); + regionOfInterestOffset.y = *((float *)GetPr(pix_vect) + 1); + regionOfInterestOffset.z = *((float *)GetPr(pix_vect) + 2); + + if (!(pix_vect_Nx == 1) || !(pix_vect_Mx == 3)) + printf(" -> Dimension of IMAGE_STARTPOINT_S must be [1 x 3]"); + if ((pix_vect_Nx > 1)) + printf(" -> No Blockmode [%i x 3] allowed for IMAGE_STARTPOINT_S\n", pix_vect_Nx); + // if(!(mxIsSingle(pix_vect))) + // printf(" -> IMAGE_STARTPOINT_S must be Single"); + + //====================================================================== 3.Input Parameter - Check Receiver Index + receiver_index_Nx = GetDimensions(receiver_index)[0]; // Reihen N ermitteln + receiver_index_Mx = GetDimensions(receiver_index)[1]; // Spalten M ermitteln + + if (!(receiver_index_Nx == 1)) + printf(" -> Dimension of receiver_index must be [1 x M]"); + if (!(receiver_index_Mx == aScanCount)) + { + printf(" -> aScanCount(%i)!= M(%i)\n", aScanCount, receiver_index_Mx); + printf(" -> Dimension of receiver_index has different size as Ascan-Data\n"); + } + + // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten + unsigned short *receiver_index_ptr; + receiver_index_ptr = (unsigned short *)GetPr(receiver_index); + + //====================================================================== 4.Input Parameter - Check Emitter Index + emitter_index_Nx = GetDimensions(emitter_index)[0]; // Reihen N ermitteln + emitter_index_Mx = GetDimensions(emitter_index)[1]; // Spalten M ermitteln + + if (!(emitter_index_Nx == 1)) + printf(" -> Dimension of emitter_index must be [1 x M]"); + if (!(emitter_index_Mx == aScanCount)) + { + printf(" -> aScanCount(%i)!= M(%i)\n", aScanCount, emitter_index_Mx); + printf(" -> Dimension of emitter_index has different size as Ascan-Data\n"); + } + + // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten + unsigned short *emitter_index_ptr = (unsigned short *)GetPr(emitter_index); + + //====================================================================== 5.Input Parameter - Check receiver_list + receiver_list_Nx = GetDimensions(receiver_list)[0]; // Reihen N ermitteln + receiver_list_Mx = GetDimensions(receiver_list)[1]; // Spalten M ermitteln + + if (!(receiver_list_Nx == 3)) + printf(" -> Dimension of receiver_list must be [3 x M]"); + + // Ausgabe einzelner Geometriedaten der Uebergabewerte mit verschiedenen Varianten + float *receiver_list_ptr; + receiver_list_ptr = (float *)GetPr(receiver_list); + + //====================================================================== 6.Input Parameter - Check emitter_list + emitter_list_Nx = GetDimensions(emitter_list)[0]; // Reihen N ermitteln + emitter_list_Mx = GetDimensions(emitter_list)[1]; // Spalten M ermitteln // emitter_list gibt die maximale Anzahl an Emittern die in diesem Block vorkommen können wieder! + + if (!(emitter_list_Nx == 3)) + printf(" -> Dimension of emitter_list must be [3 x M]"); + + // Ausgabe einzelner Geometriedaten der übergabewerte mit verschiedenen Varianten + float *emitter_list_ptr; + emitter_list_ptr = (float *)GetPr(emitter_list); + + //====================================================================== 7.Input Parameter - Check SAFT_mode + SAFT_mode_Nx = GetDimensions(SAFT_mode)[0]; // Reihen N ermitteln + SAFT_mode_Mx = GetDimensions(SAFT_mode)[1]; // Spalten M ermitteln + + SAFT_MODE = *((int *)GetPr(SAFT_mode)); + + if (!(SAFT_mode_Nx == 1)) + printf(" -> Dimension of SAFT_MODE must be [1 x 1]"); + + switch (SAFT_MODE) + { + case 0: + SOSMode_3DVolume = false; + ATTMode_3DVolume = false; + // printf ( "\e[7;37m Standard SAFT without correction (-SOS -ATT) (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + printf(" -> AscanIndexVersion only make sense with SOS or SOS and ATT Volume => exit"); + break; + case 1: + SOSMode_3DVolume = true; + ATTMode_3DVolume = false; + // printf ( "\e[7;37m + Speed of sound correction - Attenuation correction (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + break; + case 2: + SOSMode_3DVolume = true; + ATTMode_3DVolume = true; + // printf ( "\e[7;37m + Speed of sound correction + Attenuation correction (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + break; + case 3: + // SOSMode_3DVolume = true; ATTMode_3DVolume = true; + // printf ( "\e[7;37m SAFT_MODE = 3 \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + printf(" -> not implemented => exit"); + break; + case 4: + // SOSMode_3DVolume = false; ATTMode_3DVolume = false; + // printf ( "\e[7;37m SAFT_MODE = 4 \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + printf(" -> not implemented => exit"); + break; + default: + SOSMode_3DVolume = false; + ATTMode_3DVolume = false; + // printf ( " -> SAFT_MODE %i is out of range [0..3] => use Standard SAFT\n", SAFT_MODE); + // printf ( "\e[7;37m Standard SAFT without correction (-SOS -ATT) (%i,%i) \e[0m", SOSMode_3DVolume, ATTMode_3DVolume); + break; + } + + //====================================================================== 8.Input Parameter - Check SAFT_variant + SAFT_variant_Nx = GetDimensions(SAFT_variant)[0]; // Reihen N ermitteln + SAFT_variant_Mx = GetDimensions(SAFT_variant)[1]; // Spalten M ermitteln + + SAFT_VARIANT = (int *)GetPr(SAFT_variant); + SAFT_VARIANT_Size = SAFT_variant_Mx; + + // if(!(mxIsUint32(SAFT_variant))) + // printf(" -> SAFT_VARIANT must be Uint32"); + if (!(SAFT_variant_Nx == 1) || !(SAFT_variant_Mx == 6)) + printf(" -> Dimension of SAFT_VARIANT must be [1 x 6]"); + + //====================================================================== 9.Input Parameter - Check for SOS volume + speed_Nx = GetDimensions(speed)[0]; // Reihen N ermitteln + speed_Mx = GetDimensions(speed)[1]; // Spalten M ermitteln + float Sos = *((float *)GetPr(speed)); + + float *speed_vec_ptr; + speed_vec_ptr = (float *)GetPr(speed); // Pointer für SoSDaten ermitteln + + // if(!(mxIsSingle(speed))) + // printf(" -> SOSVolume must be Single"); + + if (SOSMode_3DVolume == true) // SOS correction need 3D Volume + { + if (GetNumberOfDimensions(speed) == 3) + { + if (!((speed_Nx > 1) && (speed_Mx > 1))) + { + printf(" -> SOSGrid_XYZ.x and SOSGrid_XYZ.y must be > 1 for SOS Correction with 3D Volume!!!"); + } + } + else if (GetNumberOfDimensions(speed) == 2) + { + printf("prhs[8] SOSVolume [%ix%i]\n", (int)GetDimensions(speed)[0], (int)GetDimensions(speed)[1]); + printf(" -> SOSVolume is not a 3D Volume as expected!"); + } + + SOSGrid_Xx = GetDimensions(speed)[0]; // SOSGrid_X ermitteln + SOSGrid_Yx = GetDimensions(speed)[1]; // SOSGrid_Y ermitteln + SOSGrid_Zx = GetDimensions(speed)[2]; // SOSGrid_Z ermitteln + + SOSGrid_XYZ.x = SOSGrid_Xx; + SOSGrid_XYZ.y = SOSGrid_Yx; + SOSGrid_XYZ.z = SOSGrid_Zx; + + if ((SOSGrid_XYZ.x > 128) || (SOSGrid_XYZ.y > 128) || (SOSGrid_XYZ.z > 128)) + { + printf(" -> SOSGrid_XYZ [%i x %i x %i]\n", (int)SOSGrid_Xx, (int)SOSGrid_Yx, (int)SOSGrid_Zx); + printf(" Warning -> SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z > 128!!! --> can be problematic due to memory requirement\n"); + } + } + + //====================================================================== 10.Input Parameter - Check SoS Startpoint + sos_startPoint_Nx = GetDimensions(sos_startPoint)[0]; // Reihen N ermitteln + sos_startPoint_Mx = GetDimensions(sos_startPoint)[1]; // Spalten M ermitteln + + sosOffset.x = *((float *)GetPr(sos_startPoint)); + sosOffset.y = *((float *)GetPr(sos_startPoint) + 1); + sosOffset.z = *((float *)GetPr(sos_startPoint) + 2); + + if (!(sos_startPoint_Nx == 1) || !(sos_startPoint_Mx == 3)) + printf(" -> Dimension of SOS_STARTPOINT_S must be [1 x 3]"); + if ((sos_startPoint_Nx > 1)) + printf(" -> No Blockmode [%i x 3] allowed for SOS_STARTPOINT_S\n", sos_startPoint_Nx); + // if(!(mxIsSingle(sos_startPoint))) + // printf(" -> SOS_STARTPOINT_S must be Single"); + + //====================================================================== 11.Input Parameter - Check SoS_RESOLUTION / sos_res + if (SOSMode_3DVolume == true) + { + sos_res_Nx = GetDimensions(sos_res)[0]; // Reihen N ermitteln + sos_res_Mx = GetDimensions(sos_res)[1]; // Spalten M ermitteln + + SOS_RESOLUTION = *((float *)GetPr(sos_res)); + + if (!(sos_res_Nx == 1)) + printf(" -> Dimension of SOS_RESOLUTION_S must be [1 x 1]"); + if ((sos_res_Mx > 1)) + printf(" -> No Blockmode allowed for SOS_RESOLUTION_S! [1 x %i]\n", sos_res_Mx); + // if(!(mxIsSingle(sos_res))) + // printf(" -> SOS_RESOLUTION_S must be Single"); + } + + //====================================================================== 12.Input Parameter - Check for ATTVolume / Attenuation-Data + + attVolume_Nx = GetDimensions(attVolume)[0]; // Reihen N ermitteln + attVolume_Mx = GetDimensions(attVolume)[1]; // Spalten M ermitteln + + float *att_vec_ptr; + att_vec_ptr = (float *)GetPr(attVolume); // Pointer für ATT-Daten ermitteln + + if (GetNumberOfDimensions(attVolume) == 3) + { + if (!((attVolume_Nx > 1) && (attVolume_Mx > 1))) + { + printf(" -> ATTGrid_XYZ.x and ATTGrid_XYZ.y must be > 1 for ATT Correction with 3D Volume!!!"); + } + } + else if (GetNumberOfDimensions(attVolume) == 2) + { + } + + if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true)) + { // 3D Volume muss bei SOS und ATT angegeben sein damit ATT Korrektur durchgefuehrt werden kann + ATTGrid_Xx = GetDimensions(attVolume)[0]; // ATTGrid_X ermitteln + ATTGrid_Yx = GetDimensions(attVolume)[1]; // ATTGrid_Y ermitteln + ATTGrid_Zx = GetDimensions(attVolume)[2]; // ATTGrid_Z ermitteln + + ATTGrid_XYZ.x = ATTGrid_Xx; + ATTGrid_XYZ.y = ATTGrid_Yx; + ATTGrid_XYZ.z = ATTGrid_Zx; + + if ((ATTGrid_XYZ.x > 128) || (ATTGrid_XYZ.y > 128) || (ATTGrid_XYZ.z > 128)) + { + printf(" -> ATTGrid_XYZ [%i x %i x %i]\n", ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z); + printf(" Warning -> ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z > 128!!! --> can be problematic due to memory requirement\n"); + } + + if ((ATTGrid_XYZ.x != SOSGrid_XYZ.x) || (ATTGrid_XYZ.y != SOSGrid_XYZ.y) || (ATTGrid_XYZ.z != SOSGrid_XYZ.z)) + { // Restriction: Volume parameter of ATT & SOS must be the same + printf(" -> ATTGrid[%i %i %i] != SOSGrid[%i %i %i]!\n", ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z, SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); + printf(" -> ATTGrid must have the same size as SOSGrid \n"); + } + } + else + { + printf(" -> ATTMode_3DVolume == false => skip ATTGrid\n"); + + ATTGrid_Xx = 0; // ATTGrid_X ermitteln + ATTGrid_Yx = 0; // ATTGrid_Y ermitteln + ATTGrid_Zx = 0; // ATTGrid_Z ermitteln + + ATTGrid_XYZ.x = ATTGrid_Xx; + ATTGrid_XYZ.y = ATTGrid_Yx; + ATTGrid_XYZ.z = ATTGrid_Zx; + } + + //====================================================================== 13.Input Parameter - Check IMAGE_RESOLUTION_S / res + res_Nx = GetDimensions(res)[0]; // Reihen N ermitteln + res_Mx = GetDimensions(res)[1]; // Spalten M ermitteln + + IMAGE_RESOLUTION = *((float *)GetPr(res)); + + if (!(res_Nx == 1)) + printf(" -> Dimension of IMAGE_RESOLUTION must be [1 x 1]"); + if ((res_Mx > 1)) + printf(" -> No Blockmode allowed for IMAGE_RESOLUTION! [1 x %i]\n", res_Mx); + // if(!(mxIsSingle(res))) + // printf(" -> IMAGE_RESOLUTION must be Single"); + + if (SOSMode_3DVolume == true) + { + if (IMAGE_RESOLUTION > SOS_RESOLUTION) + { + printf(" -> IMAGE_RESOLUTION (%f) > SOS_RESOLUTION (%f)\n", IMAGE_RESOLUTION, SOS_RESOLUTION); + printf(" -> IMAGE_RESOLUTION must not > SOS_RESOLUTION !!!"); + } + } + + //====================================================================== 14.Input Parameter - Check TimeInterval_S / Timeint + timeint_Nx = GetDimensions(timeint)[0]; // Reihen N ermitteln + timeint_Mx = GetDimensions(timeint)[1]; // Spalten M ermitteln + + sampleRate = *((float *)GetPr(timeint)); + + if (!(timeint_Nx == 1)) + printf(" -> Dimension of TimeInterval_S must be [1 x 1]"); + if ((timeint_Mx > 1)) + printf(" -> No Blockmode allowed for TimeInterval_S! [1 x %i]\n", timeint_Mx); + //====================================================================== 15.Input Parameter - Check IMAGE_XYZ_UI32 / IMAGE_XYZ + IMAGE_XYZ_Nx = GetDimensions(IMAGE_XYZ)[0]; // Reihen N ermitteln + IMAGE_XYZ_Mx = GetDimensions(IMAGE_XYZ)[1]; // Spalten M ermitteln + + IMAGE_SIZE_XYZ.x = *((int *)GetPr(IMAGE_XYZ)); + IMAGE_SIZE_XYZ.y = *((int *)GetPr(IMAGE_XYZ) + 1); + IMAGE_SIZE_XYZ.z = *((int *)GetPr(IMAGE_XYZ) + 2); + + if (!(IMAGE_XYZ_Nx == 1) || !(IMAGE_XYZ_Mx == 3)) + printf(" -> Dimension of IMAGE_XYZ must be [1 x 3]"); + if ((IMAGE_XYZ_Nx > 1)) + printf(" -> No Blockmode allowed for IMAGE_XYZ! [%i x 3]\n", IMAGE_XYZ_Nx); + // if(!(mxIsUint32(IMAGE_XYZ))) + // printf(" -> IMAGE_XYZ must be UINT32"); + + if ((IMAGE_SIZE_XYZ.x > 8192) || (IMAGE_SIZE_XYZ.y > 8192)) // Aufteilung in BlockDim 512,1,1 passt für 5632x5632. Es würde etwas weiter gehen aber dann muss Y kleiner sein. + printf(" -> IMAGE_XYZ must not > [8192 x 8192 x N]!!!"); + + //====================================================================== 16.Input Parameter - Check Env / IMAGE_SUM + IMAGE_SUM_Xx = GetDimensions(IMAGE_SUM)[0]; // Spalten M ermitteln X + IMAGE_SUM_Yx = GetDimensions(IMAGE_SUM)[1]; // Reihen N ermitteln Y + + if (GetNumberOfDimensions(IMAGE_SUM) > 2) + IMAGE_SUM_Zx = GetDimensions(IMAGE_SUM)[2]; // Z-Schichten ermitteln Z + else if (GetNumberOfDimensions(IMAGE_SUM) == 2) + IMAGE_SUM_Zx = 1; // Z-Schichten = 1 + else + { + printf(" -> mxGetNumberOfDimensions of IMAGE_SUM = %i\n", (int)GetNumberOfDimensions(IMAGE_SUM)); + printf(" -> Dimension of IMAGE_SUM must be 3: [X x Y x Z]"); + } + + uint64_t IMAGE_SUM_Count = GetNumberOfElements(IMAGE_SUM); + float *IMAGE_SUM_vec_ptr = (float *)GetPr(IMAGE_SUM); + + //====================================================================== 17.Input Parameter - Check BlockDimension for GPU + + BlockDim_XYZ_Nx = GetDimensions(BlockDim)[0]; // Reihen N ermitteln + BlockDim_XYZ_Mx = GetDimensions(BlockDim)[1]; // Spalten M ermitteln + + BlockDim_XYZ.x = *((int *)GetPr(BlockDim)); + BlockDim_XYZ.y = *((int *)GetPr(BlockDim) + 1); + BlockDim_XYZ.z = *((int *)GetPr(BlockDim) + 2); + + if (!(BlockDim_XYZ_Nx == 1) || !(BlockDim_XYZ_Mx == 3)) + printf(" -> Dimension of BlockDim_XYZ must be [1 x 3]"); + if ((BlockDim_XYZ_Nx > 1)) + printf(" -> No Blockmode! [%i x 3]\n", (int)BlockDim_XYZ_Nx); + // if(!(mxIsUint32(BlockDim))) + // printf(" -> BlockDim_XYZ must be UINT32"); + + if ((BlockDim_XYZ.x * BlockDim_XYZ.y * BlockDim_XYZ.z) > 1024) + { // BlockSize limited to 1024. Perhaps newer GPUs will support more Threads per Block + printf(" -> BlockDim_XYZ.x * BlockDim_XYZ.y * BlockDim_XYZ.z must not > 1024!!!"); + + // Here Adaption for BlockSize can be done. + BlockDim_XYZ.x = 1024; // If Blockdimensions are not specified than standard Blockdimensions will be used. + BlockDim_XYZ.x = 1; + BlockDim_XYZ.x = 1; + + printf(" -> Standard Size for BlockDim_XYZ is used [%ix%ix%i]\n", BlockDim_XYZ.x, BlockDim_XYZ.y, BlockDim_XYZ.z); + } + + //====================================================================== 18.Input Parameter - Check GPUs + + int *enableGPUs_ptr; + + GPUs_Nx = GetDimensions(GPUs)[0]; // Reihen N ermitteln + GPUs_Mx = GetDimensions(GPUs)[1]; // Spalten M ermitteln + enableGPUs_ptr = (int *)GetPr(GPUs); + selectedNumberGPUs = GPUs_Mx; + + // Determine Number of GPU-Devices and check if there are so many available + int num_devices = 0; + // printf( " -> cudaGetDeviceCount: %i\n", num_devices); + CUDA_CHECK(cudaGetDeviceCount(&num_devices)); + + if (selectedNumberGPUs <= num_devices) + { + } + else + { + printf(" !!! !!! selectedNumberGPUs(%i) > num_devices(%i) !!!! !!!! -> selectedNumberGPUs = num_devices = %i!\n", selectedNumberGPUs, num_devices, num_devices); + selectedNumberGPUs = num_devices; // Reduce number of selected to number of GPUs in PC system! + } + + // Check passed GPU-ID-Numbers and amount of GPUs + int gpuNr = 0; + int gpuNrCheck = 0; + for (gpuNr = 0; gpuNr < selectedNumberGPUs; ++gpuNr) + { + if (enableGPUs_ptr[gpuNr] > (num_devices - 1)) + { // Check if more GPUs are selected then available in System + printf("\n enableGPUs_ptr[gpuNr=%i] = %i !!!\n", gpuNr, enableGPUs_ptr[gpuNr]); + printf(" -> selected number of GPU > available Devices is not allowed!"); + } + + for (gpuNrCheck = 0; gpuNrCheck < gpuNr; ++gpuNrCheck) + { + if (enableGPUs_ptr[gpuNrCheck] == enableGPUs_ptr[gpuNr]) + printf(" -> GPU Device can only be used once!!!"); + } + } + + if (!(GPUs_Nx == 1) || !(GPUs_Mx < 10)) + printf(" -> Dimension of GPUs must be [1 x <10]"); + if ((pix_vect_Nx > 1)) + printf(" -> No Blockmode [%i x n] allowed for GPUs\n", GPUs_Nx); + + //====================================================================== 19.Input Parameter - debugMode, debugModeParameter + dbgMode_Nx = GetDimensions(dbgMode)[0]; // Reihen N ermitteln + dbgMode_Mx = GetDimensions(dbgMode)[1]; // Spalten M ermitteln + + debugMode = *((float *)GetPr(dbgMode)); + debugModeParameter = *((float *)GetPr(dbgMode) + 1); + + if ((dbgMode_Nx != 1) || (dbgMode_Mx != 2)) + printf(" -> Dimension of debugMode must be [1 x 2]\n"); + if (debugMode != 0.0) + printf(" -> debugMode = [%f], debugModeParameter = [%f]\n", debugMode, debugModeParameter); + + // ~~~~ Create 3D-Matrix for the Output-Values + // Output-Dimension is {IMAGE_XYZ_X, IMAGE_XYZ_Y, IMAGE_XYZ_Z} + const int dims[] = {IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z}; + int ndim = 3; + + Matrix_t Output_Voxels; + Output_Voxels.NumberOfDims = ndim; + Output_Voxels.Dims[0] = dims[0]; + Output_Voxels.Dims[1] = dims[1]; + Output_Voxels.Dims[2] = dims[2]; + Output_Voxels.Data = new float[dims[0] * dims[1] * (dims[2] ? dims[2] : 1)]; + + double *Output_Voxels_ptr = new double[dims[0] * dims[1] * (dims[2] ? dims[2] : 1)]; + ; + + // ~~~~ Create Pointer to return value from Duration of Kernel + // Erstelle Array mit folgender Formatierung + // 0: Total Durationtime all GPUs + // 1: Total Durationtime GPU 1 + // 2: Total Durationtime GPU 2 + // n: Total Durationtime GPU n + + int m = (1 + selectedNumberGPUs), n = 1; + + double *Duration_ptr = new double[m * n]; + + // ~~~~ Create Pointer to return Error/Abortvalue of each multithread + // int *Abort_ptr = (int*) malloc(num_workingPackages * sizeof(int)); + int *Abort_ptr = (int *)malloc(selectedNumberGPUs * sizeof(int)); + m = aScanCount; // 1 + n = AScan_Nx; // z.B. 3000 + + float *AscansOut_ptr = new float[m * n]; SPDLOG_INFO("preintegrateAscans!"); - //================================================================================================================ Preintegrate Ascans - #ifdef preAscanIntegrationToMatchSamplerateToResolution // Preintegrate Ascans for matching of Samplerate and Resolution + // 按照当前参数必走这个分支 + //================================================================================================================ Preintegrate Ascans + if (SAFT_VARIANT[SAFT_VARIANT_AscanPreintegration] == 1) + { + // printf( "(SAFT_VARIANT[0] == 1) => perform preintegrateAscans\n\n"); + speed_vec_ptr = (float *)GetPr(speed); + // printf( " speed_vec_ptr[%3i] = %12.10f\n",0,speed_vec_ptr[0]); + if (speed_vec_ptr[0] == 0) + { + printf("First value in SOS Volume = 0 --> preintegrateAscans can't be performed!!! --> Exit"); + } - #ifdef debug_OutputHostStepsPerformance - double diff_time; + //================================================================================================================ + preintegrateAscans(aScan_ptr, AscansOut_ptr, speed_vec_ptr, aScanCount, aScanLength, IMAGE_RESOLUTION, sampleRate, debugMode, debugModeParameter); + //================================================================================================================ + } + else + { + // printf( "(SAFT_VARIANT[0] == 0) => skip preintegrateAscans\n\n"); + // Daten trotzdem in Outputspeicher fuer Matlab transferieren + for (int j = 0; j < aScanCount; j++) + { // ueber alle A-scans gehen. + for (int i = 0; i < aScanLength; i++) + { + // printf( " i (%4i) = %6.3f ",i, aScan_ptr[j*aScanLength+i]); + AscansOut_ptr[j * aScanLength + i] = aScan_ptr[j * aScanLength + i]; // nach Matlab zurueckgeben + // printf( " i (%4i) = %6.3f \n",i, AscansOut_ptr[j*aScanLength+i]); + } + } + } - struct timeval startPreintegrateAscans, stopPreintegrateAscans; - gettimeofday(&startPreintegrateAscans, NULL); - #endif - if (SAFT_VARIANT[SAFT_VARIANT_AscanPreintegration] == 1){ - //printf( "(SAFT_VARIANT[0] == 1) => perform preintegrateAscans\n\n"); + //================================================================================================================ Start Reconstruction + //================================================================================================================ - speed_vec_ptr = (float*)GetPr(speed); - // printf( " speed_vec_ptr[%3i] = %12.10f\n",0,speed_vec_ptr[0]); - if (speed_vec_ptr[0] == 0){ - printf("First value in SOS Volume = 0 --> preintegrateAscans can't be performed!!! --> Exit"); - } - - //================================================================================================================ - preintegrateAscans(aScan_ptr, AscansOut_ptr, speed_vec_ptr, aScanCount, aScanLength, IMAGE_RESOLUTION, sampleRate, debugMode, debugModeParameter); - //================================================================================================================ - } - else{ - - //printf( "(SAFT_VARIANT[0] == 0) => skip preintegrateAscans\n\n"); - //Daten trotzdem in Outputspeicher fuer Matlab transferieren - for (int j = 0; j 0) + { + printf("!!!!!!!!!!!!!!!!!!! Aborted Thread for GPU[%i] = %i\n", i, Abort_ptr[i]); + AbortedThreads = true; + } + } + free(Abort_ptr); + if (AbortedThreads) + printf(" Aborted Thread occurred -> see output history"); - multithreaded_processing( aScan_ptr, - Output_Voxels_ptr, - receiver_index_ptr, - emitter_index_ptr, - receiver_list_ptr, - receiver_list_Mx, - emitter_list_ptr, - emitter_list_Mx, - speed_vec_ptr, - SOSGrid_XYZ, - sosOffset, - SOS_RESOLUTION, - att_vec_ptr, - AScan_Mx, - AScan_Nx, - regionOfInterestOffset, - IMAGE_SIZE_XYZ, - IMAGE_RESOLUTION, - sampleRate, - BlockDim_XYZ, - Duration_ptr, - selectedNumberGPUs, - enableGPUs_ptr, - debugMode, - debugModeParameter, - SOSMode_3DVolume, - ATTMode_3DVolume, - SAFT_MODE, - SAFT_VARIANT, - SAFT_VARIANT_Size, - Abort_ptr - ); + //================================================================================================================ + //================================================================================================================ - #ifdef debug_OutputHostStepsPerformance - gettimeofday(&stopMultithreadProcessing, NULL); - diff_time = (double)((stopMultithreadProcessing.tv_sec * 1000000.0 + stopMultithreadProcessing.tv_usec) - (startMultithreadProcessing.tv_sec * 1000000.0 + startMultithreadProcessing.tv_usec)); - printf ("########################################################################\n"); - printf ("### HOST ### MultithreadProcessing = %8.0 us = %8.2f GVA/s\n", diff_time, double(IMAGE_SUM_Count)*double(aScanCount)/(diff_time*1000)); - #endif - - // Check if errors occurred - bool AbortedThreads = false; - for (int i=0; i < selectedNumberGPUs; i++ ){ - if (Abort_ptr[i] > 0) - { - printf( "!!!!!!!!!!!!!!!!!!! Aborted Thread for GPU[%i] = %i\n", i, Abort_ptr[i]); - AbortedThreads = true; - } - } - free(Abort_ptr); - if (AbortedThreads) - printf(" Aborted Thread occurred -> see output history"); - - //================================================================================================================ - //================================================================================================================ - - - //================================================================================================================ Build Sum of IMAGE_SUM and current reconstructed Volume - - // Daten des uebergebenen Outputvolumens zum rekonstruierten Volumen addieren - #ifdef debug_OutputInfo - printf( "Build Sum of reconstructed Volume and given IMAGE_SUM \n"); - #endif - - #ifdef debug_OutputHostStepsPerformance - struct timeval startSumIMAGE_SUM, stopSumIMAGE_SUM; - gettimeofday(&startSumIMAGE_SUM, NULL); - #endif + //================================================================================================================ Build Sum of IMAGE_SUM and current reconstructed Volume SPDLOG_INFO("multithreaded_processing finish!"); - float* outData = (float*)Output_Voxels.Data; - for(uint64_t i=0; i < IMAGE_SUM_Count; i++) - { - outData[i]= Output_Voxels_ptr[i] + IMAGE_SUM_vec_ptr[i]; - } + float *outData = (float *)Output_Voxels.Data; + for (uint64_t i = 0; i < IMAGE_SUM_Count; i++) + { + outData[i] = Output_Voxels_ptr[i] + IMAGE_SUM_vec_ptr[i]; + } - #ifdef debug_OutputHostStepsPerformance - gettimeofday(&stopSumIMAGE_SUM, NULL); - diff_time = (double)((stopSumIMAGE_SUM.tv_sec * 1000000.0 + stopSumIMAGE_SUM.tv_usec) - (startSumIMAGE_SUM.tv_sec * 1000000.0 + startSumIMAGE_SUM.tv_usec)); - printf ("### HOST ### Sum up (IMAGE_SUM + reconstr. Volume) = %8.0f µs = %8.2f MVoxel/s\n", diff_time, double(IMAGE_SUM_Count)/diff_time); - printf ("########################################################################\n"); - #endif - - - - - //================================================================================================================ Show returned Output-Values - // - #ifdef debug_OutputVariables - // Testoutput Value of duration - printf( "Duration_ptr[0] = %f\n", Duration_ptr[0]); - - // Testoutput Sum of IMAGE_SUM and reconstructed Volume - printf( "Output_Voxels: %i = [%f %f %f]\n",0 , Output_Voxels_ptr[0], Output_Voxels_ptr[1], Output_Voxels_ptr[2]); - printf( "Output_Voxels: %i = [%f %f %f]\n",1 , Output_Voxels_ptr[3], Output_Voxels_ptr[4], Output_Voxels_ptr[5]); - printf( "Output_Voxels: %i = [%f %f %f]\n",2 , Output_Voxels_ptr[6], Output_Voxels_ptr[7], Output_Voxels_ptr[8]); - printf( "Output_Voxels: %i = [%f %f %f]\n",(IMAGE_SUM_Count-3) , Output_Voxels_ptr[IMAGE_SUM_Count - 3], Output_Voxels_ptr[IMAGE_SUM_Count - 2], Output_Voxels_ptr[IMAGE_SUM_Count -1]); - #endif - - #ifdef debug_OutputFunctions - printf( "<== mexFunction - End\n"); - #endif - delete [] AscansOut_ptr; - delete [] Duration_ptr; - delete [] Output_Voxels_ptr; + delete[] AscansOut_ptr; + delete[] Duration_ptr; + delete[] Output_Voxels_ptr; SPDLOG_INFO("SAFT finish!"); return Output_Voxels; } - - - - - - - diff --git a/SAFT_TOFI/src/kernel/rayTracing.cuh b/SAFT_TOFI/src/kernel/rayTracing.cuh index bc77275..2330239 100644 --- a/SAFT_TOFI/src/kernel/rayTracing.cuh +++ b/SAFT_TOFI/src/kernel/rayTracing.cuh @@ -137,7 +137,7 @@ __device__ __forceinline__ void performRayTracedSpeedAdditionTexture(float &voxe pathPoint[slowDim1] = voxel2f[slowDim1]; pathPoint[slowDim2] = voxel2f[slowDim2]; } - else // voxel2f < voxel1f Endpukt < Startpkt -> Steigung negativ + else // voxel2f < voxel1f End point < Start point -> Slope negative { fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f) - floor(voxel1f[greatestDistanceDim] + 0.5f) + 1; pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim]; diff --git a/SAFT_TOFI/src/processAScans.cpp b/SAFT_TOFI/src/processAScans.cpp index 7c368b1..9973dfc 100644 --- a/SAFT_TOFI/src/processAScans.cpp +++ b/SAFT_TOFI/src/processAScans.cpp @@ -1,21 +1,20 @@ #include #include -#include +#include #include +#include #include #include -#include -#include +#include #include "saft.hpp" extern float3 *constEmitterPtr; extern float3 *constReceiverPtr; -extern unsigned short* constLookUpGeometryMemoryListEmitterPtr; -extern unsigned short* constLookUpGeometryMemoryListReceiverPtr; - +extern unsigned short *constLookUpGeometryMemoryListEmitterPtr; +extern unsigned short *constLookUpGeometryMemoryListReceiverPtr; /** Process the A-scans to perform the actual reconstruction using the complete cuboid SAFT kernel. @@ -25,1888 +24,760 @@ extern unsigned short* constLookUpGeometryMemoryListReceiverPtr; */ void SAFTHandler::performCoreReconstruction() { - - #ifdef debug_OutputFunctions - printf( "==> SAFTHandler::performCoreReconstruction - Start\n"); - #endif - - #ifdef debug_OutputStepsPerformance - struct timeval startPerformCoreReconstruction, stopPerformCoreReconstruction; - gettimeofday(&startPerformCoreReconstruction, NULL); - #endif - // Mitlaufender Zeiger fuer Speicherbereich der Outputdaten uebergeben. - double * currentHostOutputAdress = output; // Offset des OutputSpeichers am Anfang = 0 - CUDA_CHECK(cudaGetLastError()); - - // Copy A-Scans to GPU ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - #ifdef debug_OutputStepsPerformance - struct timeval startCopyAscans, stopCopyAscans; - gettimeofday(&startCopyAscans, NULL); - #endif - - #ifdef debug_OutputMemory - printf( "CUDA:cudaMemcpyToArray(deviceAScansCuArray[0], 0, 0, aScanSamples, aScanBatchSize(%i), cudaMemcpyHostToDevice); aScanSamples -> deviceAScansCuArray[0]\n", aScanBatchSize); - #endif - CUDA_CHECK(cudaMemcpyToArray(deviceAScansCuArray[0], 0, 0, aScanSamples, aScanBatch_Bytes, cudaMemcpyHostToDevice)); - //CUDA_CHECK(cudaGetLastError()); - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopCopyAscans, NULL); - diff_time = (double)((stopCopyAscans.tv_sec * 1000000.0 + stopCopyAscans.tv_usec) - (startCopyAscans.tv_sec * 1000000.0 + startCopyAscans.tv_usec)); - transferRate = aScanBatch_Bytes / diff_time; // Byte/µs = MB/s - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Copy Ascans (%i Bytes @ %.2f MB/s) = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, aScanBatch_Bytes, transferRate, diff_time); - #endif - - // performCoreReconstruction - Bereite Lauf durch alle Z-Layer vor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - #ifdef debug_OutputInfo - printf( "Start Loop over Z-Layer in SAFTHandler::performCoreReconstruction\n"); - #endif - - int zOffset = 0; // Offset der aktuellen Z-Layer - int zSoSOffset = 0; // Offset der aktuellen SoSZ-Layer - int zSoSOffset_old = -1; // Offset der letzen SoSZ-Layer - - int ZLayerStart = 0; // Erste Z-Layer - int ZLayerEnd = (IMAGE_SIZE_XYZ.z-1); // letzte Z-Layer von Endpoint - int OutputVolume_nextZ; // aktuelle Z-Layer mit dem die nächste SoS-Z-Layer beginnt - - int SosZLayerStart = 0; // Erste SoSZ-Layer - int SosZLayerEnd = 0; // Letzte SoSZ-Layer - int SosZLayer = 0; // aktuelle SoSZ-Layer - - SosZLayerStart = (int)((regionOfInterestOffset.z - sosOffset.z + IMAGE_RESOLUTION * (ZLayerStart)) * SOS_RESOLUTION_FACTOR ); // SoSZ-Layer von Startpoint - SosZLayerEnd = (int)((regionOfInterestOffset.z - sosOffset.z + IMAGE_RESOLUTION * (ZLayerEnd)) * SOS_RESOLUTION_FACTOR ); // SoSZ-Layer von Endpoint - - zSoSOffset = SosZLayerStart; // Start des Offsets für Schleifendurchlauf mit SoS-Z-Layers - - #ifdef debug_OutputSOSStepsParameter - printf( "ZLayerStart = %i \n", ZLayerStart); - printf( "ZLayerEnd = IMAGE_SIZE_XYZ.z(%i)-1 = %i\n", IMAGE_SIZE_XYZ.z, ZLayerEnd); - printf( "SosZLayerStart = (int)( (regionOfInterestOffset.z(%3.3f) - sosOffset.z(%3.3f) + IMAGE_RESOLUTION * (ZLayerStart(%3i))) * SOS_RESOLUTION_FACTOR(%f) ) = %i \n", regionOfInterestOffset.z, sosOffset.z, ZLayerStart, SOS_RESOLUTION_FACTOR, SosZLayerStart); - printf( "SosZLayerEnd = (int)( (regionOfInterestOffset.z(%3.3f) - sosOffset.z(%3.3f) + IMAGE_RESOLUTION * (ZLayerEnd (%3i))) * SOS_RESOLUTION_FACTOR(%f) ) = %i \n", regionOfInterestOffset.z, sosOffset.z, ZLayerEnd, SOS_RESOLUTION_FACTOR, SosZLayerEnd); - #endif - - - - // Anzahl der Z-Layer im ersten SoS-ZLayer bestimmen wenn die erste Z-Layer inmitten der SoS-ZLayer startet - //OutputVolume_nextZ = ceil((SosZLayerStart + partialSoSZLayerCount) * 0.2587704 / (IMAGE_RESOLUTION * (float)SOSGrid_XYZ.z)); // OutputVolume_Z berechnen mit dem die nächste Z-Layer im SoSGrid anfangen wuerde. - //OutputVolume_nextZ = ceil((SosZLayerStart + partialSoSZLayerCount) * SOS_RESOLUTION / IMAGE_RESOLUTION); // OutputVolume_Z berechnen mit dem die nächste Z-Layer im SoSGrid anfangen wuerde. - //OutputVolume_nextZ = ceil((SosZLayerStart + maxFeasibleSosZLayerCount) * SOS_RESOLUTION / IMAGE_RESOLUTION); // OutputVolume_Z berechnen mit dem die nächste Z-Layer im SoSGrid anfangen wuerde. - //OutputVolume_nextZ = ceil((SosZLayerStart + maxFeasibleSosZLayerCount) * SOS_RESOLUTION / IMAGE_RESOLUTION); // OutputVolume_Z berechnen mit dem die nächste Z-Layer im SoSGrid anfangen wuerde. - - - // Search Z-Layer of next SOS-Z-layer - // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt - - int ROI_Z = zOffset; // Aktuelle Image_Z_Layer - float VoxelIncrement_Z = IMAGE_RESOLUTION/SOS_RESOLUTION; - float SosVoxelStartPosition_Z = (regionOfInterestOffset.z - sosOffset.z ) / SOS_RESOLUTION; - float FindNextSos_Z; - - switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) - { - case 0: // mit Textur, nicht interpoliert - #ifdef debug_OutputSOSStepsParameter - printf(" Finde Start des naechsten SOSZ-Layers : zSoSOffset(%3i) --> zSoSOffset(next)(%3i) --> Grenze bei (%f)\n", zSoSOffset, (zSoSOffset+1), (zSoSOffset+0.5f)); - printf(" ======================================\n"); - printf(" - VoxelIncrement_Z = %3.12f [m]\n", VoxelIncrement_Z ); - //printf(" - SosVoxelStartPosition_Z = %3.12f [m]\n", SosVoxelStartPosition_Z ); - printf(" - ROI_Z = zOffset = %i [voxel beginn 0]\n", ROI_Z ); - printf(" - OutputVolume_nextZ (Start) = %i [voxel beginn 0]\n", OutputVolume_nextZ ); - printf(" ======================================\n"); - #endif - - do { - ROI_Z = ROI_Z + 1; // Die Voxel einzelnd durchgehen und schauen, wann Grenze ueberschritten wird - FindNextSos_Z = (SosVoxelStartPosition_Z + (VoxelIncrement_Z * (float)ROI_Z)); //Hier Addition der SOSVoxel im SoS-Grid durchfuehren - #ifdef debug_OutputSOSStepsParameter - printf(" - ROI_Z += 1 = %i [voxel beginn 0] ---> FindNextSos_Z = %3.12f \n", ROI_Z, FindNextSos_Z ); - // printf(" - FindNextSos_Z = ( SosVoxelStartPosition_Z(%3.12f) + (VoxelIncrement_Z(%3.12f) * ROI_Z(%i) ) = %3.12f ? >= ? (zSoSOffset(%i) + 1.5f)\n", SosVoxelStartPosition_Z, VoxelIncrement_Z, ROI_Z, FindNextSos_Z, zSoSOffset); - #endif - } while ( FindNextSos_Z < (float)(zSoSOffset + 0.5f) ); - - #ifdef debug_OutputSOSStepsParameter - printf(" -----> FindNextSos_Z mit ROI_Z = %i [voxel beginn 0]\n", ROI_Z ); - #endif - #ifdef SaftUseFastMath - OutputVolume_nextZ = ROI_Z + 1; - #else - OutputVolume_nextZ = ROI_Z; - #endif - #ifdef debug_OutputSOSStepsParameter - printf(" -----> OutputVolume_nextZ (Neu2) = ROI_Z = %i [voxel]\n", OutputVolume_nextZ ); - #endif - - break; - case 1: // mit Textur interpoliert - // Sprung bei ganzen Zahlen. - OutputVolume_nextZ = (int)(((zSoSOffset + 1) * SOS_RESOLUTION + sosOffset.z - regionOfInterestOffset.z) / IMAGE_RESOLUTION) +1; - #ifdef debug_OutputSOSStepsParameter - //printf( "OutputVolume_nextZ = (int)((sosOffset.z(%3.3f) - regionOfInterestOffset.z(%3.3f) + SOS_RESOLUTION(%3.3f) * (zSoSOffset(%3i) + 1)) * IMAGE_RESOLUTION_FACTOR(%3.3f) +0.5f +0.5f)) -1 = %i \n", sosOffset.z, regionOfInterestOffset.z, SOS_RESOLUTION, zSoSOffset, IMAGE_RESOLUTION_FACTOR, OutputVolume_nextZ); - printf( "OutputVolume_nextZ = (int)( ( ( (zSoSOffset(%3i) + 1) * SOS_RESOLUTION(%3.3f) + sosOffset.z(%3.3f) - regionOfInterestOffset.z(%3.3f) ) / IMAGE_RESOLUTION(%3.3f) ) +1 = %i \n", zSoSOffset, SOS_RESOLUTION, sosOffset.z, regionOfInterestOffset.z, IMAGE_RESOLUTION, OutputVolume_nextZ); - #endif - break; - } - - currentZLayerCount = maxFeasibleZLayerCount; // Mit maximal moeglicher Anzahl an Z-Layer rechnen und in Schleife abhaengig von Randbedingungen auf aktuelle maximale Anzahl verringern - - #ifdef debug_OutputSOSStepsParameter - printf( "currentZLayerCount = maxFeasibleZLayerCount(%i) = %i \n", maxFeasibleZLayerCount, currentZLayerCount); - #endif - -#ifdef SaftUseAscanIndexInterpolation_PartWise - for (int AscanBatch_i=0; i < TableAscanIndexAllocationCount; AscanBatch_i++){ - - //#ifdef debug_OutputLookUpGeometryMemoryList - printf( "AscanBatch_i (%i)\n", AscanBatch_i); - //#endif -#endif - - // performCoreReconstruction - While-Loop over all Z-Layers~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - while ((zOffset < IMAGE_SIZE_XYZ.z)) - { - #if defined(debug_OutputParameter) || defined(debug_OutputSOSStepsParameter) - Durchlauf++; - printf( "\n(%i)=>=>=>=>=>=>=>=>\n", Durchlauf); - printf( "ZLayerStart: %i\n", ZLayerStart); // ZLayerStart des Outputvolumens - printf( "ZLayerEnd: %i\n", ZLayerEnd); // ZLayerEnd des Outputvolumens - printf( "zOffset: %i\n", zOffset); // z-Offset des Outputvolumens [0 - IMAGE_SIZE_XYZ.z] - printf( "currentZLayer: %i\n\n", ZLayerStart + zOffset); // Aktuelle Z-Layer im gesamten Outputvolumen - printf( "zSoSOffset: %i\n", zSoSOffset); // z-SoSOffset des genutzen SoSGrids [0 - SOSGrid_XYZ.z] - //printf( "maxFeasibleSosZLayerCount: %i\n", maxFeasibleSosZLayerCount); // maximal mögliche Anzahl an SoS Z-Layern - printf( "OutputVolume_nextZ: %i\n", OutputVolume_nextZ); // aktuelle Z-Layer mit dem die nächste SoS-Z-Layer beginnt - printf( "currentZLayerCount: %i ", currentZLayerCount); // Anzahl an Z-Layern in aktueller SOS-Z-Layer - //printf( "\nCheck currentZLayerCount -> "); - #endif - - // Anzahl der in diesem Schritt zu berechnenden Z-Layer ermitteln - // 1. Test: ist Ende des Bildes schon erreicht? -> nur restliche nehmen - // 2. Test: gibt es eine neue SOS-ZSchicht die beginnt? - - // 1. Test: currentZLayerCount noch mal ueberpruefen. Wenn Ende erreicht ist -> nur noch die restlichen nehmen - if((zOffset + currentZLayerCount) <= IMAGE_SIZE_XYZ.z) // Z-Layer in n*partialOutputZLayerCount eingeteilt // Wenn noch nicht am Ende des Bildes angekommen - { - #ifdef debug_OutputSOSStepsParameter - printf( "Not at the End => currentZLayerCount = %i \n", currentZLayerCount); - #endif - } - else{ - currentZLayerCount = IMAGE_SIZE_XYZ.z - zOffset; // Wenn das letzte Z-Layer nicht mehr genau reinpasst - // nur noch die restlichen, noch nicht bearbeiteten Z-Layer berechnen - #ifdef debug_OutputSOSStepsParameter - printf( "At the End -> Last Z-Layers\n => currentZLayerCount = IMAGE_SIZE_XYZ.z(%i) - zOffset(%i) = %i\n", IMAGE_SIZE_XYZ.z, zOffset, currentZLayerCount); - #endif - } - - //2. Test: Wenn Grenze zur naechsten SoS-Layer ueberschreiten wuerde, dann nur die restlichen in dieser SoS-Layer berechnen - if ((ZLayerStart + zOffset + currentZLayerCount) > OutputVolume_nextZ) // Wenn Grenze zur naechsten SoS-Layer ueberschreiten wuerde, dann nur die restlichen in dieser SoS-Layer berechnen - { - currentZLayerCount = OutputVolume_nextZ - (ZLayerStart + zOffset); // Wenn das letzte Z-Layer nicht mehr genau reinpasst - // nur noch die restlichen, noch nicht bearbeiteten Z-Layer rechnen - #ifdef debug_OutputSOSStepsParameter - printf( "limit to the next SoS-Z-Layer border\n => currentZLayerCount = OutputVolume_nextZ(%i) - (ZLayerStart(%i) + zOffset(%i)) = %i \n", OutputVolume_nextZ, ZLayerStart, zOffset, currentZLayerCount); - #endif - } - - #if defined(debug_OutputParameter) || defined(debug_OutputSOSStepsParameter) - printf( " check --> %i\n", currentZLayerCount); // Anzahl an Z-Layern in aktueller SOS-Z-Layer - #endif - - std::size_t currentSpeedOfSoundZLayer = zSoSOffset; // Offset per Hand fuer erste SoSZ-Layer! - std::size_t currentOutputZLayerVoxelCount = currentZLayerCount * zLayerVoxelCount; // Anzahl Voxel fuer aktuelle ZLayer/Output - std::size_t currentOutputZLayerSize = currentOutputZLayerVoxelCount * sizeof(double); // Speichergroesse fuer aktuelle ZLayer/Outputgroesse in Byte - - #ifdef debug_OutputSOSStepsParameter - printf( "currentOutputZLayerVoxelCount = currentZLayerCount(%i) * zLayerVoxelCount(%i) = %i\n", currentZLayerCount, zLayerVoxelCount, currentOutputZLayerVoxelCount); - printf( "currentOutputZLayerSize = currentOutputZLayerVoxelCount(%i) * sizeof(double)(%i) = %i\n\n", currentOutputZLayerVoxelCount, sizeof(double), currentOutputZLayerSize); - #endif - - -// //Initialize the output voxels of the current window in device memory to 0.0 -// #ifdef debug_OutputMemory -// printf( "CUDA:cudaMemset: deviceOutput mit 0er fuellen von der Groesse currentOutputZLayerSize:%i\n", currentOutputZLayerSize); -// #endif -// CUDA_CHECK(cudaMemset(deviceOutput, 0, currentOutputZLayerSize)); - - // WindowGridDimensions.z kann durch Einteilung in Z-Layer veraendert worden sein. Und auch der letzte Z-LayerBlock kann sich unterscheiden! - // => daher hier die Berechnung von windowGridDimensions.z noetig! x und y bleiben gleich - windowGridDimensions.z = ((currentZLayerCount + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z); //im Moment werden die aktuellen Z-Layer (currentZLayerCount) alle berechnet indem sie umsortiert werden - #ifdef debug_OutputVariables - printf( "actual windowGridDimensions x,y,z: %i %i %i\n", windowGridDimensions.x, windowGridDimensions.y, windowGridDimensions.z); - #endif - - int blockIndexOffset = zOffset/genericSAFTBlockDimensions.z; // Umrechnung von Z-Layer auf Block -> welche Grid-Bloecke wurden schon bearbeitet - - #if defined(debug_OutputStepsPrecalculation) - printf( "===================================================================\n"); - printf( "==================== Kernel-Call: New Z-Layers ====================\n"); - printf( "===================================================================\n"); - printf( "|| currentZLayerCount = %i\n", currentZLayerCount); - printf( "|| zOffset : %i\n", zOffset); - printf( "|| zSoSOffset: %i\n", zSoSOffset); - printf( "|| currentZLayer: %i\n", ZLayerStart + zOffset); // Aktuelle Z-Layer im gesamten Outputvolumen - printf( "|| ZLayerStart: %i\n", ZLayerStart); // ZLayerStart des Outputvolumens - printf( "|| ZLayerEnd: %i\n", ZLayerEnd); // Ende der Z-Layer im gesamten Outputvolumen - printf( "|| ===============================================================\n"); - //printf( "aScanIndex: %i\n ", aScanIndex); - //printf( "|| currentAScanWindow: %i\n", currentAScanWindow); - //printf( "|| ===============================================================\n"); - printf( "|| ROI/Bildbereich: [%i %i %i]\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z); - printf( "|| SOSGrid_XYZ: [%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); - printf( "|| blockIndexOffset: %i\n", blockIndexOffset); - //printf( "|| partialOutputVoxelCount: %i\n", partialOutputVoxelCount); - printf( "|| currentSpeedOfSoundZLayer: %i\n", currentSpeedOfSoundZLayer); // als Offset gedacht, welche die erste SoS-Z-Layer ist# - - //partialSpeedOfSoundVoxelCount = currentSpeedOfSoundZLayer * SOSGrid_XYZ.x * SOSGrid_XYZ.y; // hier unnoetig?!????? Erstmal nicht gebraucht!!!! Daher OK - - //printf( "|| partialSpeedOfSoundVoxelCount: %i\n", partialSpeedOfSoundVoxelCount); // kann durch currentSpeedOfSoundZLayer geteilt werden, um Anzahl SoS-Z-Layer zu bestimmen - printf( "|| windowGridDimensions x,y,z: %i %i %i\n", windowGridDimensions.x, windowGridDimensions.y, windowGridDimensions.z); - printf( "|| genericSAFTGridDimensions x,y,z: %i %i %i\n", genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z); - printf( "|| genericSAFTBlockDimensions x,y,z: %i %i %i\n", genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z); - printf( "|| currentHostOutputAdress: %u\n", currentHostOutputAdress); // Adresse der Outputdaten auf dem Host - printf( "|| currentOutputZLayerVoxelCount: %i\n", currentOutputZLayerVoxelCount); // Index der Outputdaten auf dem Host - printf( "==================================================================\n"); - #endif - - - - // ========================================================================= SoS-Pfade für Z-Layer vorberechnen - // Basepoints berechnen, wenn neue SoS-Layer benoetig wird: - // 1. Emitter Pfade - precalculateAverageSpeedOfSound - // 2. Receiver Pfade - precalculateAverageSpeedOfSound - - // 3. In Ascan-Schleife - // Immer für maximale Anzahl an Ascans - // 4. AscanIndexe berechnen - // 5. SAFT Rekonstruktuon - - if (zSoSOffset_old != zSoSOffset) // Neue SoS-Layer wird benoetig - { - zSoSOffset_old = zSoSOffset; - // SoS-Pfade für Z-Layer vorberechnen ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // Emitter ----------------------------------------------------------------------- Emitter - #if defined(debug_OutputInfo) || defined(debug_OutputSOSPaths) || defined(debug_OutputStepsPrecalculation) - printf( "Emitter - precalculateAverageSpeedOfSound - currentSpeedOfSoundZLayer(%i) maxFeasibleSosZLayerCount(%i) emitter_list_Size(%i)\n",currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, emitter_list_Size); - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - struct timeval startprecalculateAvgSos, stoprecalculateAvgSos; - gettimeofday(&startprecalculateAvgSos, NULL); - #endif - - #ifdef SaftUseConstantMemforGeometry // Pfad Emitter -> Voxel berechnen - precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 0, emitter_list_Size, deviceTableVoxelToEmitterPathCountFloat, deviceTableVoxelToEmitterPathSosSum); - #else - precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, deviceListEmitterGeometry, emitter_list_Size, deviceTableVoxelToEmitterPathCountFloat, deviceTableVoxelToEmitterPathSosSum); - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stoprecalculateAvgSos, NULL); - diff_time = (double)((stoprecalculateAvgSos.tv_sec * 1000000.0 + stoprecalculateAvgSos.tv_usec) - (startprecalculateAvgSos.tv_sec * 1000000.0 + startprecalculateAvgSos.tv_usec)); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Emitter precalcSOS = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - #endif - - - // Receiver ----------------------------------------------------------------------- Receiver - #if defined(debug_OutputInfo) || defined(debug_OutputSOSPaths) || defined(debug_OutputStepsPrecalculation) - printf( "Receiver - precalculateAverageSpeedOfSound - currentSpeedOfSoundZLayer(%i) maxFeasibleSosZLayerCount(%i) receiver_list_Size(%i)\n",currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, receiver_list_Size); - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&startprecalculateAvgSos, NULL); - #endif - - #ifdef SaftUseConstantMemforGeometry // Pfad Receiver -> Voxel berechnen - precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 1, receiver_list_Size, deviceTableVoxelToReceiverPathCountFloat, deviceTableVoxelToReceiverPathSosSum); - #else - precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, deviceListReceiverGeometry, receiver_list_Size, deviceTableVoxelToReceiverPathCountFloat, deviceTableVoxelToReceiverPathSosSum); - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stoprecalculateAvgSos, NULL); - diff_time = (double)((stoprecalculateAvgSos.tv_sec * 1000000.0 + stoprecalculateAvgSos.tv_usec) - (startprecalculateAvgSos.tv_sec * 1000000.0 + startprecalculateAvgSos.tv_usec)); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Receiver precalcSOS = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - #endif - -// printf( "=========================================================================================\n"); -// float *MemPtr = &deviceTableVoxelToReceiverPathSosSum[0]; -// printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [ 0: 2] = [%f %f %f]\n",0 , *(MemPtr+0), *(MemPtr+1), *(MemPtr+2)); -// //printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [2997:2999] = [%f %f %f]\n",0 , *(MemPtr+2997), *(MemPtr+2998), *(MemPtr+2999)); -// //printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [ 0: 2] = [%f %f %f]\n",1 , MemPtr[3000], MemPtr[3001], MemPtr[3002]); -// //printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [2997:2999] = [%f %f %f]\n",1 , MemPtr[5997], MemPtr[5998], MemPtr[5999]); -// //printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [ 0: 2] = [%f %f %f]\n",156 , MemPtr[0+(156*aScanSampleCountPerReceiver)], MemPtr[1+(156*aScanSampleCountPerReceiver)], MemPtr[2+(156*aScanSampleCountPerReceiver)]); -// //printf( " -> deviceTableVoxelToReceiverPathSosSum (%3i) [2997:2999] = [%f %f %f]\n",156 , MemPtr[2997+(156*aScanSampleCountPerReceiver)], MemPtr[2998+(156*aScanSampleCountPerReceiver)], MemPtr[2999+(156*aScanSampleCountPerReceiver)]); -// printf( "=========================================================================================\n"); - - } - - - - - // Go overall A-Scanblocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - // PrecalAndPerformSAFTAllAscans - Berechne Performanz fuer Durchlauf von allen Ascans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - struct timeval startPrecalAndPerformSAFTAllAscans, stopPrecalAndPerformSAFTAllAscans; - gettimeofday(&startPrecalAndPerformSAFTAllAscans, NULL); - #endif - - // aScanWindowSize auf maximal moegliche Anzahl setzen - std::size_t aScanWindowSize = (maxAscanIndexArraysInTexture*maxSupportedTexturesForAscanIndex); //Anzahl maximaler Ascans die auf einmal verarbeitet werden kann. - - int ascanIndexBatchOffset = 0; - while(ascanIndexBatchOffset < aScanCount){ // Alle Emitter oder Receiver in der Liste von Matlab durchgehen - - - // Wenn ueber die Anzahl der Ascans Ascans zur rekonstruktion verwendet werden sollen nur noch Restliche nehmen - if (aScanCount <= (ascanIndexBatchOffset+aScanWindowSize)){ - aScanWindowSize = aScanCount-ascanIndexBatchOffset; - } - - // Für Ascan-Index benoetigt man mehrere Texturen fuer jeweils 2 Z-Layer. 2*N < maxSurfaceTexture3DDimension(Fermi, Kepler: 2048) ==> (1024 Em/Rec - Kombinationen) - // maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist (2048) - // TableAscanIndexAllocationCount = Anzahl der Teiltabellen ==> auch Anzahl der benoetigten Durchlaeufe (aktuell 4 Texturen) - // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante) - // maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle (1024) - // maxSupportedTexturesForAscanIndex = 4 // Definiert die maximal unterstuetzen Texturen fuer AscanIndex - // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen - - - #if defined(debug_OutputStepsPrecalculation) -// //Actual launch of the SAFT kernel for the current z-layers - printf( "\n"); -// printf( " ==================================================================\n"); - printf( " ==================== Kernel-Call: performSAFT ====================\n"); -// printf( " ==================================================================\n"); - printf( " || ascanIndexBatchOffset: %i\n", ascanIndexBatchOffset); - printf( " || aScanWindowSize: %i\n", aScanWindowSize); - printf( " || aScanCount: %i\n", aScanCount); - printf( " || maxAscanIndexArraysInTexture: %i\n", maxAscanIndexArraysInTexture); - printf( " || maxSupportedTexturesForAscanIndex: %i\n", maxSupportedTexturesForAscanIndex); - printf( " || neededAscanBatchCount: %i\n\n", neededAscanBatchCount); - printf( " || ===============================================================\n"); - #endif - - #if defined(debug_OutputInfo) || defined(debug_OutputSOSPaths) || defined(debug_OutputStepsPrecalculation) - printf( "AscanIndex - precalculateAscanIndex_usePaths - currentSpeedOfSoundZLayer(%i) maxFeasibleSosZLayerCount(%i) \n",currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount); - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - struct timeval startprecalculateAscanIndex, stoprecalculateAscanIndex; - gettimeofday(&startprecalculateAscanIndex, NULL); - #endif - - // AscanIndex Emitter -> Voxel --> Receiver vorberechnen fuer AscansBatchSize - precalculateAscanIndex_usePaths(ascanIndexBatchOffset, aScanWindowSize, currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount); //, deviceTextureAscanIndexFloatCuArray); - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stoprecalculateAscanIndex, NULL); - diff_time = (double)((stoprecalculateAscanIndex.tv_sec * 1000000.0 + stoprecalculateAscanIndex.tv_usec) - (startprecalculateAscanIndex.tv_sec * 1000000.0 + startprecalculateAscanIndex.tv_usec)); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### precalculateAscanIndex_usePaths(Em->V->Rec) for Ascan(ascanIndexBatchOffset=%i, aScanWindowSize=%i) = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, ascanIndexBatchOffset, aScanWindowSize, diff_time); - #endif - - // performSAFT - Berechne Rekonstruktionsbild mit SAFT-Algo ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - struct timeval startPerformSAFT, stopPerformSAFT; - gettimeofday(&startPerformSAFT, NULL); - #endif - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - if (currentZLayerCount > 0) //bugprevention because windows and linux would stop execution for a bug - Issue 100 - performSAFT( ascanIndexBatchOffset, - aScanWindowSize, - IMAGE_SIZE_XYZ, - SOSGrid_XYZ, - blockIndexOffset, - static_cast(currentOutputZLayerVoxelCount), - static_cast(currentSpeedOfSoundZLayer), - static_cast(partialSpeedOfSoundVoxelCount), - static_cast(maxFeasibleSosZLayerCount), - static_cast(currentEmIndexUsedForAscanIndexCalculation), - windowGridDimensions, - genericSAFTGridDimensions, - genericSAFTBlockDimensions, - deviceSpeedOfSoundField, - deviceAScansCuArray[0]); - // , calculationStream); - - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopPerformSAFT, NULL); - diff_time = (double)((stopPerformSAFT.tv_sec * 1000000.0 + stopPerformSAFT.tv_usec) - (startPerformSAFT.tv_sec * 1000000.0 + startPerformSAFT.tv_usec)); - performRate = (aScanWindowSize * currentOutputZLayerVoxelCount)/(diff_time*1000); //AScans*Voxel/(µs*1000) = AScans*GV/s - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### performSAFT = %4.3f GVA/s = %8.0f µs \n", deviceProperties[deviceId].name, deviceId, deviceIndex, performRate, diff_time); - #endif - - - ascanIndexBatchOffset+=aScanWindowSize; - - - } // End-while Go over all Ascans with ascanIndex_i - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopPrecalAndPerformSAFTAllAscans, NULL); - diff_time = (double)((stopPrecalAndPerformSAFTAllAscans.tv_sec * 1000000.0 + stopPrecalAndPerformSAFTAllAscans.tv_usec) - (startPrecalAndPerformSAFTAllAscans.tv_sec * 1000000.0 + startPrecalAndPerformSAFTAllAscans.tv_usec)); - performRate = (aScanCount * currentOutputZLayerVoxelCount)/(diff_time*1000); //AScans*Voxel/(µs*1000) = AScans*GV/s - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### PrecalAndPerformSAFTAllAscans = %4.3f GVA/s = %8.0f µs \n", deviceProperties[deviceId].name, deviceId, deviceIndex, performRate, diff_time); - #endif - - - // Copy OutputVolume back - Kopiere das berechnete Volumen zurück auf den Host ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - #ifdef debug_OutputStepsPerformance - struct timeval startCopyOutputVolume, stopCopyOutputVolume; - gettimeofday(&startCopyOutputVolume, NULL); - #endif - - //Copy the calculation results from device memory to the host and adjust the output pointer - #ifdef debug_OutputMemory - printf( "CUDA:cudaMemcpy: deviceOutput -> currentHostOutputAdress mit Laenge : %i\n", currentOutputZLayerSize); - #endif - CUDA_CHECK(cudaMemcpy(currentHostOutputAdress, deviceOutput, currentOutputZLayerSize, cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaDeviceSynchronize()); - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopCopyOutputVolume, NULL); - diff_time = (double)((stopCopyOutputVolume.tv_sec * 1000000.0 + stopCopyOutputVolume.tv_usec) - (startCopyOutputVolume.tv_sec * 1000000.0 + startCopyOutputVolume.tv_usec)); - transferRate = currentOutputZLayerSize / diff_time; // Byte/µs = MB/s - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### zOffset(%4i) - Copyback Vol (%i Bytes @ %.2f MB/s) = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, zOffset, currentOutputZLayerSize, transferRate, diff_time); - #endif - - - #ifdef debug_OutputVariables - printf( "****currentHostOutputAdress(%u)", currentHostOutputAdress); - // Testoutput Sum of IMAGE_SUM and reconstructed Volume -// printf( "currentHostOutputAdress: %i = [%f %f %f]\n", 0, currentHostOutputAdress[0], currentHostOutputAdress[1], currentHostOutputAdress[2]); -// printf( "currentHostOutputAdress: %i = [%f %f %f]\n", 1, currentHostOutputAdress[3], currentHostOutputAdress[4], currentHostOutputAdress[5]); -// printf( "currentHostOutputAdress: %i = [%f %f %f]\n", 2, currentHostOutputAdress[6], currentHostOutputAdress[7], currentHostOutputAdress[8]); -// printf( "currentHostOutputAdress: %i = [%f %f %f]\n",(currentOutputZLayerVoxelCount-3) , currentHostOutputAdress[currentOutputZLayerVoxelCount - 3], currentHostOutputAdress[currentOutputZLayerVoxelCount - 2], currentHostOutputAdress[currentOutputZLayerVoxelCount -1]); - #endif - - currentHostOutputAdress = currentHostOutputAdress + currentOutputZLayerVoxelCount; - - #ifdef debug_OutputVariables - printf( "****currentHostOutputAdress += currentOutputZLayerVoxelCount(%u) = %u\n", currentOutputZLayerVoxelCount, currentHostOutputAdress); - #endif - - zOffset += currentZLayerCount; - - if ((zOffset + ZLayerStart) >= OutputVolume_nextZ) // Befinden wir uns nun in einer neuen SoS-Z-Layer? --> Funktioniert bisher nur mit 1ner SoS-Layer!!! - { - zSoSOffset += 1; // Da immer nur 1ne interpolierte SOS-Z-Layer mit 2 SOS-Z-Layer berechnet werden kann - //zSoSOffset += maxFeasibleSosZLayerCount; - } - - - - // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt - ROI_Z = zOffset; // Aktuelle Image_Z_Layer - - switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) - { - case 0: // mit Textur, nicht interpoliert - #ifdef debug_OutputSOSStepsParameter - printf(" Finde Start des naechsten SOSZ-Layers : zSoSOffset(%3i) --> zSoSOffset(next)(%3i) --> Grenze bei (%f)\n", zSoSOffset, (zSoSOffset+1), (zSoSOffset+0.5f)); - printf(" ======================================\n"); - printf(" - VoxelIncrement_Z = %3.12f [m]\n", VoxelIncrement_Z ); - //printf(" - SosVoxelStartPosition_Z = %3.12f [m]\n", SosVoxelStartPosition_Z ); - printf(" - ROI_Z = zOffset = %i [voxel]\n", ROI_Z ); - printf(" - OutputVolume_nextZ (Start) = %i [voxel]\n", OutputVolume_nextZ ); - printf(" ======================================\n"); - #endif - - do { - ROI_Z = ROI_Z + 1; // Die Voxel einzelnd durchgehen und schauen wann es GRenze ueberschreitet - // Alte Version Michael - FindNextSos_Z = (SosVoxelStartPosition_Z + (VoxelIncrement_Z * (float)ROI_Z)); //Hier Addition der SOSVoxel im SoS-Grid durchfuehren - #ifdef debug_OutputSOSStepsParameter - printf(" - ROI_Z += 1 = %i [voxel] ---> FindNextSos_Z = %3.12f \n", ROI_Z, FindNextSos_Z ); - // printf(" - FindNextSos_Z = ( SosVoxelStartPosition_Z(%3.12f) + (VoxelIncrement_Z(%3.12f) * ROI_Z(%i) ) = %3.12f ? >= ? (zSoSOffset(%i) + 1.5f)\n", SosVoxelStartPosition_Z, VoxelIncrement_Z, ROI_Z, FindNextSos_Z, zSoSOffset); - #endif - } while ( FindNextSos_Z < (float)(zSoSOffset + 0.5f) ); - - #ifdef debug_OutputSOSStepsParameter - printf(" -----> FindNextSos_Z mit ROI_Z = %i [voxel]\n", ROI_Z ); - #endif - #ifdef SaftUseFastMath - OutputVolume_nextZ = ROI_Z + 1; - #else - OutputVolume_nextZ = ROI_Z; - #endif - #ifdef debug_OutputSOSStepsParameter - printf(" -----> OutputVolume_nextZ (Neu2) = ROI_Z = %i [voxel]\n", OutputVolume_nextZ ); - #endif - - - break; - case 1: // mit Textur interpoliert - // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt - // (zSoSOffset + 1) = naechstes Ganze SOS Z-layer // zweites + 0.5f zum runden um 0.5 - // - // => Sprung bei ganzen Zahlen. Beginn mit z.B 3.0 - //OutputVolume_nextZ = (int)((sosOffset.z - regionOfInterestOffset.z + (zSoSOffset + 1) * SOS_RESOLUTION) * IMAGE_RESOLUTION_FACTOR +0.5f +0.5f) - 1; - OutputVolume_nextZ = (int)(((zSoSOffset + 1) * SOS_RESOLUTION + sosOffset.z - regionOfInterestOffset.z) / IMAGE_RESOLUTION) +1; - #ifdef debug_OutputSOSStepsParameter - //printf( "OutputVolume_nextZ = (int)((sosOffset.z(%3.3f) - regionOfInterestOffset.z(%3.3f) + SOS_RESOLUTION(%3.3f) * (zSoSOffset(%3i) + 1)) * IMAGE_RESOLUTION_FACTOR(%3.3f) +0.5f +0.5f)) -1 = %i \n", sosOffset.z, regionOfInterestOffset.z, SOS_RESOLUTION, zSoSOffset, IMAGE_RESOLUTION_FACTOR, OutputVolume_nextZ); - printf( "OutputVolume_nextZ = (int)( ( ( (zSoSOffset(%3i) + 1) * SOS_RESOLUTION(%3.3f) + sosOffset.z(%3.3f) - regionOfInterestOffset.z(%3.3f) ) / IMAGE_RESOLUTION(%3.3f) ) +1 = %i \n", zSoSOffset, SOS_RESOLUTION, sosOffset.z, regionOfInterestOffset.z, IMAGE_RESOLUTION, OutputVolume_nextZ); - #endif - break; - } - - // Neue Anzahl der Z-Layer im naechsten SoS-Grid-Z-Layer berechnen (currentZLayerCount) - currentZLayerCount = maxFeasibleZLayerCount; - - #ifdef debug_OutputParameter - printf( "OutputVolume_nextZ = (int)((sosOffset.z(%3.3f) - regionOfInterestOffset.z(%3.3f) + SOS_RESOLUTION(%3.3f) * (zSoSOffset(%3i) + 1 + 0.5f)) * IMAGE_RESOLUTION_FACTOR(%3.3f) -0.5f)) = %i \n", sosOffset.z, regionOfInterestOffset.z, SOS_RESOLUTION, zSoSOffset, IMAGE_RESOLUTION_FACTOR, OutputVolume_nextZ); - printf( "currentZLayerCount = maxFeasibleZLayerCount(%i) = %i \n", maxFeasibleZLayerCount, currentZLayerCount); - #endif - - }; //End While-Loop over all Z-Layer - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - - #ifdef SaftUseAscanIndexInterpolation_PartWise - aScanIndex += 1413; // ERster Test mit 1413 Receivern // ToDo die Anzahl der Receiver direkt an Kernel ubergeben als AscanWindowsSize. - printf( "|| aScanIndex: %i\n ", aScanIndex); - } - // Schleifenende - // Wenn alle benutzten Emitter durchgelaufen sind - - #endif - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopPerformCoreReconstruction, NULL); - diff_time = (double)((stopPerformCoreReconstruction.tv_sec * 1000000.0 + stopPerformCoreReconstruction.tv_usec) - (startPerformCoreReconstruction.tv_sec * 1000000.0 + startPerformCoreReconstruction.tv_usec)); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Function PerformCoreReconstruction = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - #endif - #ifdef debug_OutputFunctions - printf( "<== SAFTHandler::performCoreReconstruction - End\n"); - #endif + double *currentHostOutputAdress = output; // Offset des OutputSpeichers am Anfang = 0 + CUDA_CHECK(cudaGetLastError()); + + // Copy A-Scans to GPU ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + CUDA_CHECK(cudaMemcpyToArray(deviceAScansCuArray[0], 0, 0, aScanSamples, aScanBatch_Bytes, cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaGetLastError()); + + // performCoreReconstruction - Bereite Lauf durch alle Z-Layer vor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + int zOffset = 0; // Offset der aktuellen Z-Layer + int zSoSOffset = 0; // Offset der aktuellen SoSZ-Layer + int zSoSOffset_old = -1; // Offset der letzen SoSZ-Layer + + int ZLayerStart = 0; // Erste Z-Layer + int ZLayerEnd = (IMAGE_SIZE_XYZ.z - 1); // letzte Z-Layer von Endpoint + int OutputVolume_nextZ; // aktuelle Z-Layer mit dem die nächste SoS-Z-Layer beginnt + + int SosZLayerStart = 0; // Erste SoSZ-Layer + int SosZLayerEnd = 0; // Letzte SoSZ-Layer + int SosZLayer = 0; // aktuelle SoSZ-Layer + + SosZLayerStart = (int)((regionOfInterestOffset.z - sosOffset.z + IMAGE_RESOLUTION * (ZLayerStart)) * SOS_RESOLUTION_FACTOR); // SoSZ-Layer von Startpoint + SosZLayerEnd = (int)((regionOfInterestOffset.z - sosOffset.z + IMAGE_RESOLUTION * (ZLayerEnd)) * SOS_RESOLUTION_FACTOR); // SoSZ-Layer von Endpoint + + zSoSOffset = SosZLayerStart; // Start des Offsets für Schleifendurchlauf mit SoS-Z-Layers + // Search Z-Layer of next SOS-Z-layer + // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt + + int ROI_Z = zOffset; // Aktuelle Image_Z_Layer + float VoxelIncrement_Z = IMAGE_RESOLUTION / SOS_RESOLUTION; + float SosVoxelStartPosition_Z = (regionOfInterestOffset.z - sosOffset.z) / SOS_RESOLUTION; + float FindNextSos_Z; + + switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) + { + case 0: // mit Textur, nicht interpoliert + do + { + ROI_Z = ROI_Z + 1; // Die Voxel einzelnd durchgehen und schauen, wann Grenze ueberschritten wird + FindNextSos_Z = (SosVoxelStartPosition_Z + (VoxelIncrement_Z * (float)ROI_Z)); // Hier Addition der SOSVoxel im SoS-Grid durchfuehren + } while (FindNextSos_Z < (float)(zSoSOffset + 0.5f)); + OutputVolume_nextZ = ROI_Z; + break; + case 1: // mit Textur interpoliert + // Sprung bei ganzen Zahlen. + OutputVolume_nextZ = (int)(((zSoSOffset + 1) * SOS_RESOLUTION + sosOffset.z - regionOfInterestOffset.z) / IMAGE_RESOLUTION) + 1; + break; + } + + currentZLayerCount = maxFeasibleZLayerCount; // Mit maximal moeglicher Anzahl an Z-Layer rechnen und in Schleife abhaengig von Randbedingungen auf aktuelle maximale Anzahl verringern + + // performCoreReconstruction - While-Loop over all Z-Layers~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + while ((zOffset < IMAGE_SIZE_XYZ.z)) + { + // Anzahl der in diesem Schritt zu berechnenden Z-Layer ermitteln + // 1. Test: ist Ende des Bildes schon erreicht? -> nur restliche nehmen + // 2. Test: gibt es eine neue SOS-ZSchicht die beginnt? + + // 1. Test: currentZLayerCount noch mal ueberpruefen. Wenn Ende erreicht ist -> nur noch die restlichen nehmen + if ((zOffset + currentZLayerCount) <= IMAGE_SIZE_XYZ.z) // Z-Layer in n*partialOutputZLayerCount eingeteilt // Wenn noch nicht am Ende des Bildes angekommen + { + } + else + { + currentZLayerCount = IMAGE_SIZE_XYZ.z - zOffset; // Wenn das letzte Z-Layer nicht mehr genau reinpasst + // nur noch die restlichen, noch nicht bearbeiteten Z-Layer berechnen + } + + // 2. Test: Wenn Grenze zur naechsten SoS-Layer ueberschreiten wuerde, dann nur die restlichen in dieser SoS-Layer berechnen + if ((ZLayerStart + zOffset + currentZLayerCount) > OutputVolume_nextZ) // Wenn Grenze zur naechsten SoS-Layer ueberschreiten wuerde, dann nur die restlichen in dieser SoS-Layer berechnen + { + currentZLayerCount = OutputVolume_nextZ - (ZLayerStart + zOffset); // Wenn das letzte Z-Layer nicht mehr genau reinpasst + // nur noch die restlichen, noch nicht bearbeiteten Z-Layer rechnen + } + + std::size_t currentSpeedOfSoundZLayer = zSoSOffset; // Offset per Hand fuer erste SoSZ-Layer! + std::size_t currentOutputZLayerVoxelCount = currentZLayerCount * zLayerVoxelCount; // Anzahl Voxel fuer aktuelle ZLayer/Output + std::size_t currentOutputZLayerSize = currentOutputZLayerVoxelCount * sizeof(double); // Speichergroesse fuer aktuelle ZLayer/Outputgroesse in Byte + // //Initialize the output voxels of the current window in device memory to 0.0; + + // WindowGridDimensions.z kann durch Einteilung in Z-Layer veraendert worden sein. Und auch der letzte Z-LayerBlock kann sich unterscheiden! + // => daher hier die Berechnung von windowGridDimensions.z noetig! x und y bleiben gleich + windowGridDimensions.z = ((currentZLayerCount + genericSAFTBlockDimensions.z - 1) / + genericSAFTBlockDimensions.z); // im Moment werden die aktuellen Z-Layer (currentZLayerCount) alle berechnet indem sie umsortiert werden + + int blockIndexOffset = zOffset / genericSAFTBlockDimensions.z; // Umrechnung von Z-Layer auf Block -> welche Grid-Bloecke wurden schon bearbeitet + + // ========================================================================= SoS-Pfade für Z-Layer vorberechnen + // Basepoints berechnen, wenn neue SoS-Layer benoetig wird: + // 1. Emitter Pfade - precalculateAverageSpeedOfSound + // 2. Receiver Pfade - precalculateAverageSpeedOfSound + + // 3. In Ascan-Schleife + // Immer für maximale Anzahl an Ascans + // 4. AscanIndexe berechnen + // 5. SAFT Rekonstruktuon + + if (zSoSOffset_old != zSoSOffset) // Neue SoS-Layer wird benoetig + { + zSoSOffset_old = zSoSOffset; + // SoS-Pfade für Z-Layer vorberechnen ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // Emitter ----------------------------------------------------------------------- Emitter + precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 0, emitter_list_Size, deviceTableVoxelToEmitterPathCountFloat, deviceTableVoxelToEmitterPathSosSum); + + // Receiver ----------------------------------------------------------------------- Receiver + + precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 1, receiver_list_Size, deviceTableVoxelToReceiverPathCountFloat, + deviceTableVoxelToReceiverPathSosSum); + } + // Go overall A-Scanblocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // PrecalAndPerformSAFTAllAscans - Berechne Performanz fuer Durchlauf von allen Ascans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + // aScanWindowSize auf maximal moegliche Anzahl setzen + std::size_t aScanWindowSize = (maxAscanIndexArraysInTexture * maxSupportedTexturesForAscanIndex); // Anzahl maximaler Ascans die auf einmal verarbeitet werden kann. + + int ascanIndexBatchOffset = 0; + while (ascanIndexBatchOffset < aScanCount) + { // Alle Emitter oder Receiver in der Liste von Matlab durchgehen + // Wenn ueber die Anzahl der Ascans Ascans zur rekonstruktion verwendet werden sollen nur noch Restliche nehmen + if (aScanCount <= (ascanIndexBatchOffset + aScanWindowSize)) + { + aScanWindowSize = aScanCount - ascanIndexBatchOffset; + } + + // Für Ascan-Index benoetigt man mehrere Texturen fuer jeweils 2 Z-Layer. 2*N < maxSurfaceTexture3DDimension(Fermi, Kepler: 2048) ==> (1024 Em/Rec - Kombinationen) + // maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist (2048) + // TableAscanIndexAllocationCount = Anzahl der Teiltabellen ==> auch Anzahl der benoetigten Durchlaeufe (aktuell 4 Texturen) + // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante) + // maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle (1024) + // maxSupportedTexturesForAscanIndex = 4 // Definiert die maximal unterstuetzen Texturen fuer AscanIndex + // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen + + // AscanIndex Emitter -> Voxel --> Receiver vorberechnen fuer AscansBatchSize + precalculateAscanIndex_usePaths(ascanIndexBatchOffset, aScanWindowSize, currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount); //, deviceTextureAscanIndexFloatCuArray); + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + if (currentZLayerCount > 0) // bugprevention because windows and linux would stop execution for a bug - Issue 100 + performSAFT(ascanIndexBatchOffset, aScanWindowSize, IMAGE_SIZE_XYZ, SOSGrid_XYZ, blockIndexOffset, static_cast(currentOutputZLayerVoxelCount), + static_cast(currentSpeedOfSoundZLayer), static_cast(partialSpeedOfSoundVoxelCount), static_cast(maxFeasibleSosZLayerCount), + static_cast(currentEmIndexUsedForAscanIndexCalculation), windowGridDimensions, genericSAFTGridDimensions, genericSAFTBlockDimensions, deviceSpeedOfSoundField, + deviceAScansCuArray[0]); + // , calculationStream); + ascanIndexBatchOffset += aScanWindowSize; + + } // End-while Go over all Ascans with ascanIndex_i + + CUDA_CHECK(cudaMemcpy(currentHostOutputAdress, deviceOutput, currentOutputZLayerSize, cudaMemcpyDeviceToHost)); + currentHostOutputAdress = currentHostOutputAdress + currentOutputZLayerVoxelCount; + zOffset += currentZLayerCount; + + if ((zOffset + ZLayerStart) >= OutputVolume_nextZ) // Befinden wir uns nun in einer neuen SoS-Z-Layer? --> Funktioniert bisher nur mit 1ner SoS-Layer!!! + { + zSoSOffset += 1; // Da immer nur 1ne interpolierte SOS-Z-Layer mit 2 SOS-Z-Layer berechnet werden kann + // zSoSOffset += maxFeasibleSosZLayerCount; + } + + // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt + ROI_Z = zOffset; // Aktuelle Image_Z_Layer + + switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) + { + case 0: // mit Textur, nicht interpoliert + do + { + ROI_Z = ROI_Z + 1; // Die Voxel einzelnd durchgehen und schauen wann es GRenze ueberschreitet + // Alte Version Michael + FindNextSos_Z = (SosVoxelStartPosition_Z + (VoxelIncrement_Z * (float)ROI_Z)); // Hier Addition der SOSVoxel im SoS-Grid durchfuehren + } while (FindNextSos_Z < (float)(zSoSOffset + 0.5f)); + OutputVolume_nextZ = ROI_Z; + break; + case 1: // mit Textur interpoliert + // Z-Layer berechnen mit der die naechste SoS-Z-Layer anfaengt + // (zSoSOffset + 1) = naechstes Ganze SOS Z-layer // zweites + 0.5f zum runden um 0.5 + // + // => Sprung bei ganzen Zahlen. Beginn mit z.B 3.0 + // OutputVolume_nextZ = (int)((sosOffset.z - regionOfInterestOffset.z + (zSoSOffset + 1) * SOS_RESOLUTION) * IMAGE_RESOLUTION_FACTOR +0.5f +0.5f) - 1; + OutputVolume_nextZ = (int)(((zSoSOffset + 1) * SOS_RESOLUTION + sosOffset.z - regionOfInterestOffset.z) / IMAGE_RESOLUTION) + 1; + break; + } + + // Neue Anzahl der Z-Layer im naechsten SoS-Grid-Z-Layer berechnen (currentZLayerCount) + currentZLayerCount = maxFeasibleZLayerCount; + }; // End While-Loop over all Z-Layer + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ } - - /** Process A-scans for the image reconstruction. Initialise more of the surrounding dependencies of the image reconstruction process and call performCoreReconstruction. - Verarbeite A-Scans fuer die Bildrekonstruktion - Initialsiere weitre umgebenden Abhaengigkeiten des Bildrekonstruktion-Prozesses und rufe performCoreReconstruction auf. */ -void SAFTHandler::processAScans( - ullong & duration ///< This argument is written to. Duration of the core reconstruction. - ) +void SAFTHandler::processAScans(ullong &duration ///< This argument is written to. Duration of the core reconstruction. +) { - #ifdef debug_OutputFunctions - printf( "==> SAFTHandler::processAScans - Start\n"); - #endif + // memoryCheck(); // Freier Speicher am Anfang augeben - #ifdef debug_OutputStepsPerformance - struct timeval startMalloc, stopMalloc; - gettimeofday(&startMalloc, NULL); - #endif + //===================================================================================================== Memory allocation on and copy to GPU Device for - deviceSAFT_VARIANT - // memoryCheck(); // Freier Speicher am Anfang augeben + CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceSAFT_VARIANT), SAFT_VARIANT_Size * sizeof(int))); + // printf(" -> SAFT_VARIANT = [%i %i %i %i %i %i]\n", SAFT_VARIANT[0], SAFT_VARIANT[1], SAFT_VARIANT[2], SAFT_VARIANT[3], SAFT_VARIANT[4], SAFT_VARIANT[5]); + CUDA_CHECK(cudaMemcpy(deviceSAFT_VARIANT, SAFT_VARIANT, SAFT_VARIANT_Size * sizeof(int), cudaMemcpyHostToDevice)); - #ifdef debug_OutputMemory - //Memory allocation, stream initialisation - printf( "\nCUDA: Memory Allokation, stream Initialisation\n"); - #endif + //===================================================================================================== Memory allocation on GPU Device for - AScan-Data - //===================================================================================================== Memory allocation on and copy to GPU Device for - deviceSAFT_VARIANT - #ifdef debug_OutputMemory - printf( "CUDA: cudaMalloc(deviceSAFT_VARIANT) der Groesse (%i Bytes) \n", SAFT_VARIANT_Size*sizeof(int)); - #endif - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceSAFT_VARIANT), SAFT_VARIANT_Size*sizeof(int) )); + deviceAScansCuArray = new cudaArray *[aScanAllocationCount]; // Für Arbeiten mit Texturmemory + cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Beschreibung des RueckgabeFormats der Textur - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpy: SAFT_VARIANT -> deviceSAFT_VARIANT\n"); // Copy Data from Host to GPU for Index Data - #endif - //printf(" -> SAFT_VARIANT = [%i %i %i %i %i %i]\n", SAFT_VARIANT[0], SAFT_VARIANT[1], SAFT_VARIANT[2], SAFT_VARIANT[3], SAFT_VARIANT[4], SAFT_VARIANT[5]); - CUDA_CHECK(cudaMemcpy( deviceSAFT_VARIANT, SAFT_VARIANT, SAFT_VARIANT_Size*sizeof(int), cudaMemcpyHostToDevice)); - - // Check if copy works - // int *c [SAFT_VARIANT_Size]; //device copy of SAFT_VARIANT - // CUDA_CHECK(cudaMemcpy( c, deviceSAFT_VARIANT, SAFT_VARIANT_Size*sizeof(int), cudaMemcpyDeviceToHost)); - // printf(" -> c = [%i %i %i %i %i]\n", c[0], c[1], c[2], c[3], c[4], c[5]); - - //===================================================================================================== Memory allocation on GPU Device for - AScan-Data - - deviceAScansCuArray = new cudaArray *[aScanAllocationCount]; // Für Arbeiten mit Texturmemory - cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Beschreibung des RueckgabeFormats der Textur - - - for(std::size_t i = 0; i < aScanAllocationCount; i++) + for (std::size_t i = 0; i < aScanAllocationCount; i++) { - // CuArray fuer Texturmemory fuer AscanDaten auf GPU allozieren - #ifdef debug_OutputMemory - printf( "CUDA: cudaMallocArray (&deviceAScansCuArray[%i], &texChannelDesc, aScanLength, aScanCount(%i Elemente))\n", i, aScanCount); - #endif - CUDA_CHECK(cudaMallocArray(&deviceAScansCuArray[i], &texChannelDesc, aScanLength, aScanCount)); + // CuArray fuer Texturmemory fuer AscanDaten auf GPU allozieren + + CUDA_CHECK(cudaMallocArray(&deviceAScansCuArray[i], &texChannelDesc, aScanLength, aScanCount)); } - //memoryCheck(); + // memoryCheck(); //============================================================================= Memory allocation on GPU Device for - Emitter and Receiver Index and LookUpList-Data // ==== Index from Blockdata - int deviceEmitterIndex_blocksize = aScanCount * sizeof(unsigned short); // Size of EmitterIndex_block - int deviceReceiverIndex_blocksize = aScanCount * sizeof(unsigned short); // Size of ReceiverIndex_block - #ifdef debug_OutputVariables - printf( "deviceEmitterIndex_blocksize = aScanCount(=%i) * sizeof(unsigned short)(=%i) = %i\n", aScanCount, sizeof(unsigned short), deviceEmitterIndex_blocksize); - printf( "deviceReceiverIndex_blocksize = aScanCount(=%i) * sizeof(unsigned short)(=%i) = %i\n", aScanCount, sizeof(unsigned short), deviceReceiverIndex_blocksize); - #endif - - #ifdef debug_OutputMemory // Allocate Memory on GPU Device for Emitter and Receiver Geometry Data - printf( "CUDA: cudaMalloc (deviceEmitterIndex_block) der Groesse : %i Bytes\n", deviceEmitterIndex_blocksize); - printf( "CUDA: cudaMalloc (deviceReceiverIndex_block) der Groesse: %i Bytes\n", deviceReceiverIndex_blocksize); - #endif - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceEmitterIndex_block), deviceEmitterIndex_blocksize)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceReceiverIndex_block), deviceReceiverIndex_blocksize)); - - - #ifdef debug_OutputVariables - //Test Output from Receiver-Data - printf( "Index-Data:\n"); - //printf( "emitter_index (0-2) = [%d %d %d]\n", emitter_index[0], emitter_index[1], emitter_index[2]); - //printf( "receiver_index (0-2) = [%d %d %d]\n", receiver_index[0], receiver_index[1], receiver_index[2]); - #endif - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpy: emitter_index -> deviceEmitterIndex_block\n"); // Copy Data from Host to GPU for Index Data - printf( "CUDA: cudaMemcpy: receiver_index -> deviceReceiverIndex_block\n"); - #endif - CUDA_CHECK(cudaMemcpy(deviceEmitterIndex_block, emitter_index, deviceEmitterIndex_blocksize, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(deviceReceiverIndex_block, receiver_index, deviceReceiverIndex_blocksize, cudaMemcpyHostToDevice)); + int deviceEmitterIndex_blocksize = aScanCount * sizeof(unsigned short); // Size of EmitterIndex_block + int deviceReceiverIndex_blocksize = aScanCount * sizeof(unsigned short); // Size of ReceiverIndex_block + CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceEmitterIndex_block), deviceEmitterIndex_blocksize)); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceReceiverIndex_block), deviceReceiverIndex_blocksize)); + CUDA_CHECK(cudaMemcpy(deviceEmitterIndex_block, emitter_index, deviceEmitterIndex_blocksize, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(deviceReceiverIndex_block, receiver_index, deviceReceiverIndex_blocksize, cudaMemcpyHostToDevice)); //============================================================================= Memory allocation on GPU Device for - Geometry-Data within the Emitter and Receiver List - // ==== Coordinates / Geometry - Blockdata - int deviceListEmitterGeometrySize = emitter_list_Size * sizeof(float3); // Size of ListEmitterGeometry - int deviceListReceiverGeometrySize = receiver_list_Size * sizeof(float3); // Size of ListReceiverGeometry - - #ifndef SaftUseConstantMemforGeometry - #ifdef debug_OutputVariables - printf( "deviceListEmitterGeometrySize = emitter_list_Size(=%i) * sizeof(float3)(=12) = %i\n", emitter_list_Size, deviceListEmitterGeometrySize); - printf( "deviceListReceiverGeometrySize = receiver_list_Size(=%i) * sizeof(float3)(=12) = %i\n", receiver_list_Size, deviceListReceiverGeometrySize); - #endif - - #ifdef debug_OutputMemory // Allocate Memory on GPU Device for Emitter and Receiver Geometry Data - printf( "CUDA: cudaMalloc (deviceListEmitterGeometry) der Groesse: %i Bytes\n", deviceListEmitterGeometrySize); - printf( "CUDA: cudaMalloc (deviceListReceiverGeometry) der Groesse: %i Bytes\n", deviceListReceiverGeometrySize); - #endif - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceListEmitterGeometry), deviceListEmitterGeometrySize)); - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceListReceiverGeometry), deviceListReceiverGeometrySize)); - - #ifdef debug_OutputVariables - // //Test Output from Emitter-Data - // printf( "Emitter-List:\n"); - // for(int i = 0; i < emitter_list_Size; i++) - // { - // printf( "emitter_list (%i) = [%f %f %f]\n", i , emitter_list[i].x, emitter_list[i].y, emitter_list[i].z); - // } - // - // //Test Output from Receiver-Data - // printf( "Receiver-List:\n"); - // for(int i = 0; i < receiver_list_Size; i++) - // { - // printf( "receiver_list (%i) = [%f %f %f]\n", i , receiver_list[i].x, receiver_list[i].y, receiver_list[i].z); - // } - #endif - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpy: emitter_list -> deviceListEmitterGeometry\n"); // Copy Data from Host to GPU for List Index <-> Geometry Data - printf( "CUDA: cudaMemcpy: receiver_list -> deviceListReceiverGeometry\n"); - #endif - CUDA_CHECK(cudaMemcpy(deviceListEmitterGeometry, emitter_list, deviceListEmitterGeometrySize, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(deviceListReceiverGeometry, receiver_list, deviceListReceiverGeometrySize, cudaMemcpyHostToDevice)); - #endif - - #ifdef SaftUseConstantMemforGeometry - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpyToSymbol: emitter_list -> emitterPOS : %i Bytes\n", deviceListEmitterGeometrySize); // Copy Data from Host(emitter/receiver_list) to GPU(emitter/receiverPOS) for Index Data - printf( "CUDA: cudaMemcpyToSymbol: receiver_list -> receiverPOS : %i Bytes\n", deviceListReceiverGeometrySize); - #endif - - - #ifdef SaftUseHarmonicMean - CUDA_CHECK(cudaMemcpyToSymbol (constEmitterPtr, emitter_list, deviceListEmitterGeometrySize, 0, cudaMemcpyHostToDevice )); - CUDA_CHECK(cudaMemcpyToSymbol (constReceiverPtr, receiver_list, deviceListReceiverGeometrySize, 0, cudaMemcpyHostToDevice )); - #endif - - - #endif - - //============================================================================= Memory allocation on GPU Device for - LookUpGeometryMemoryList for Emitter and Receiver - - // Steps: - // 1. Determine amount of used Emitter and Receiver --> usedAmountOfEmitter, usedAmountOfReceiver -> Done - // 2. Allocate Memory on Host for LookUpGeometryMemoryList for Emitter and Receiver -> Done - // 3. Fill LookUpGeometryMemoryList with MemoryPositionIndex for every emitter and Receiver -> Done - // Copy lookUpGeometryMemoryListEmitter, lookUpGeometryMemoryListReceiver to Constant Memory of Device - - // 4. Allocate SOSPathLists, depending on usedAmountOfEmitter, usedAmountOfReceiver -> Done - // 5. Perform PreprocessSOSPaths only with used Emitter and Receiver -> Done - // 6. Inside SAFT-Kernel first determine MemoryPositionIndex of current Emitter and Receivers then access Data with tex3D. -> Done - - - // 1. Determine amount of used Emitter and Receiver --> usedAmountOfEmitter, usedAmountOfReceiver - usedAmountOfEmitter = 0; // amount of used emitter - usedAmountOfReceiver = 0; // amount of used receiver - - //Test Output from Emitter-Data - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "Emitter-List:\n"); - #endif - for(int i = 0; i < emitter_list_Size; i++) - { - #ifdef debug_OutputLookUpGeometryMemoryList - //printf( "emitter_list (%i) = [%f %f %f]\n", i , emitter_list[i].x, emitter_list[i].y, emitter_list[i].z); - #endif - - if (emitter_list[i].x != 255) - usedAmountOfEmitter ++; - } - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "=> usedAmountOfEmitter = %i\n", usedAmountOfEmitter); - #endif - - //Test Output from Receiver-Data - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "Receiver-List:\n"); - #endif - for(int i = 0; i < receiver_list_Size; i++) - { - #ifdef debug_OutputLookUpGeometryMemoryList - //printf( "receiver_list (%i) = [%f %f %f]\n", i , receiver_list[i].x, receiver_list[i].y, receiver_list[i].z); - #endif - - if (receiver_list[i].x != 255) - usedAmountOfReceiver ++; - } - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "=> usedAmountOfReceiver = %i\n", usedAmountOfReceiver); - #endif - - // 2. Allocate Memory on Host for LookUpGeometryMemoryList for Emitter and Receiver - // create array - - // NEW 2019, trying to get rid of USCT II hard coded limitation - int maxEmitterReceiverListSize = MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY; // 2340 --> max. num of entries for 64kB constant memory, has to in alignment with constantMemory.cu - hostLookUpGeometryMemoryListEmitterPtr = (unsigned short*)malloc(maxEmitterReceiverListSize * sizeof(unsigned short)); - hostLookUpGeometryMemoryListReceiverPtr = (unsigned short*)malloc(maxEmitterReceiverListSize * sizeof(unsigned short)); - - //hostLookUpGeometryMemoryListEmitterPtr = (unsigned short*)mxMalloc(157 * 4 * sizeof(unsigned short)); - //hostLookUpGeometryMemoryListReceiverPtr = (unsigned short*)mxMalloc(157 * 9 * sizeof(unsigned short)); - - // 3. Fill LookUpGeometryMemoryList with MemoryPositionIndex for every emitter and Receiver - // Run over all emitter and fill constLookUpGeometryMemoryListEmitterPtr - int lookUpEmitterIndex = 0; - int lookUpReceiverIndex = 0; - for(int i = 0; i < emitter_list_Size; i++) - { - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "emitter_list (%i) = [%f %f %f]\n", i , emitter_list[i].x, emitter_list[i].y, emitter_list[i].z); - #endif - - if (emitter_list[i].x != 255){ // Wenn nicht leer dann an naechste Stelle einsortieren - hostLookUpGeometryMemoryListEmitterPtr[i] = (unsigned short) lookUpEmitterIndex; - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "=====> Emitter (%4i) => hostLookUpGeometryMemoryListEmitterPtr [%4i] = %i\n", i, i, hostLookUpGeometryMemoryListEmitterPtr[i]); - #endif - lookUpEmitterIndex ++; - }else{ - hostLookUpGeometryMemoryListEmitterPtr[i] = (unsigned short) 65535; - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "=====> Emitter (%4i) => hostLookUpGeometryMemoryListEmitterPtr [%4i] = %i\n", i, i, hostLookUpGeometryMemoryListEmitterPtr[i]); - #endif - } - } - - #ifdef debug_OutputLookUpGeometryMemoryList - //if (UsedAmountOfEmitter == lookUpEmitterIndex) // Check if all emitter have their Index - printf( "######### usedAmountOfEmitter (%i) = lookUpEmitterIndex (%i)\n", usedAmountOfEmitter , lookUpEmitterIndex); - #endif - - - for(int i = 0; i < receiver_list_Size; i++) - { - #ifdef debug_OutputLookUpGeometryMemoryList - //printf( "receiver_list (%i) = [%f %f %f]\n", i , receiver_list[i].x, receiver_list[i].y, receiver_list[i].z); - #endif - - if (receiver_list[i].x != 255){ // Wenn nicht leer dann an naechste Stelle einsortieren - hostLookUpGeometryMemoryListReceiverPtr[i] = (unsigned short) lookUpReceiverIndex; - #ifdef debug_OutputLookUpGeometryMemoryList - printf( "=====> Receiver (%4i) => hostLookUpGeometryMemoryListReceiverPtr [%4i] = %i\n", i, i, hostLookUpGeometryMemoryListReceiverPtr[i]); - #endif - lookUpReceiverIndex ++; - }else{ - hostLookUpGeometryMemoryListReceiverPtr[i] = (unsigned short) 65535; - //printf( "=====> Receiver (%4i) => hostLookUpGeometryMemoryListReceiverPtr [%4i] = %i\n", i, i, hostLookUpGeometryMemoryListReceiverPtr[i]); - } - } - - #ifdef debug_OutputLookUpGeometryMemoryList - //if (UsedAmountOfReceiver == lookUpReceiverIndex) // Check if all emitter have their Index - printf( "######### usedAmountOfReceiver (%i) = lookUpReceiverIndex (%i)\n", usedAmountOfReceiver , lookUpReceiverIndex); - #endif - - // Copy lookUpGeometryMemoryListEmitter, lookUpGeometryMemoryListReceiver to Constant Memory of Device - lookUpGeometryMemoryListEmitterSize = emitter_list_Size * sizeof(unsigned short); // Size of ListEmitterGeometry - lookUpGeometryMemoryListReceiverSize = receiver_list_Size * sizeof(unsigned short); // Size of ListReceiverGeometry - - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( "lookUpGeometryMemoryListEmitterSize = emitter_list_Size(=%i) * sizeof(unsigned short)(=%i) = %i\n", emitter_list_Size, sizeof(unsigned short), lookUpGeometryMemoryListEmitterSize); - printf( "lookUpGeometryMemoryListReceiverSize = receiver_list_Size(=%i) * sizeof(unsigned short)(=%i) = %i\n", receiver_list_Size, sizeof(unsigned short), lookUpGeometryMemoryListReceiverSize); - #endif - - CUDA_CHECK(cudaMemcpyToSymbol (constLookUpGeometryMemoryListEmitterPtr, hostLookUpGeometryMemoryListEmitterPtr, lookUpGeometryMemoryListEmitterSize, 0, cudaMemcpyHostToDevice )); - CUDA_CHECK(cudaMemcpyToSymbol (constLookUpGeometryMemoryListReceiverPtr, hostLookUpGeometryMemoryListReceiverPtr, lookUpGeometryMemoryListReceiverSize, 0, cudaMemcpyHostToDevice )); - - - //======================================================================================================= Memory allocation on GPU Device for - SOS 3DVolume-Data - - if (SOSMode_3DVolume == true) // ====================================== 3DVolume Mode with SOS-Correction - { - #ifndef SaftTextureForBresenhamSosPaths - #ifdef debug_OutputMemory - printf( "CUDA: cudaMalloc(deviceSpeedOfSoundField) der Groesse:%i Bytes\n", speedOfSoundFieldBytes); // Entweder oder -> abhuengig von uebergabe machen wie im Kernel - #endif - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceSpeedOfSoundField), speedOfSoundFieldBytes)); - - #ifdef debug_OutputMemory - printf( "CUDA: Memory Copy: speedOfSoundField -> deviceSpeedOfSoundField\n"); // SoSGrid-Daten auf Device kopieren - #endif - CUDA_CHECK(cudaMemcpy(deviceSpeedOfSoundField, speedOfSoundField, speedOfSoundFieldBytes, cudaMemcpyHostToDevice)); - #endif - - #ifdef SaftTextureForBresenhamSosPaths - - struct cudaMemcpy3DParms copyParams = {0}; - - #ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) - // Texture for SOS 3DVolume - cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - //CUDA_CHECK(cudaGetLastError()); - - // 1. 3D Array fuer Texturmemory anlegen - #ifdef debug_OutputMemory - //printf( "CUDA: cudaMalloc3DArray: deviceSpeedOfSoundFieldCuArray [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, (pitchedIntermediateSoSFieldDevPtr.pitch * SOSGrid_XYZ.y * SOSGrid_XYZ.z)); - printf( "CUDA: cudaMalloc3DArray: deviceSpeedOfSoundFieldCuArray [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, ((int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2)))) * SOSGrid_XYZ.y * SOSGrid_XYZ.z) ); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory - cudaMalloc3DArray (& deviceSpeedOfSoundFieldCuArray, & texChannelDescSpeedOfSoundField, make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, SOSGrid_XYZ.z), 0); - //CUDA_CHECK(cudaGetLastError()); - - // 2. Daten kopieren - // Kopiere Daten von speedOfSoundField (Host) zu 3D-alloziertem Zwischenspeicher direkt ins gepitchte 3DArray vorbereiten - - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpy3D: make_cudaPitchedPtr(speedOfSoundField) -> deviceSpeedOfSoundFieldCuArray; Extent [%i %i %i] )\n", SOSGrid_XYZ.x , SOSGrid_XYZ.y, SOSGrid_XYZ.z); - #endif - - copyParams.srcPtr = make_cudaPitchedPtr((void*)speedOfSoundField, SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.x, SOSGrid_XYZ.y); - copyParams.srcPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.dstArray = deviceSpeedOfSoundFieldCuArray; // Ziel-Array - copyParams.dstPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.extent = make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); - copyParams.kind = cudaMemcpyHostToDevice; - - // Schritt 3.4 Daten von Zwischenspeicher in 3DArray fuer Texturmemory kopieren - CUDA_CHECK(cudaMemcpy3D(©Params)); - - if (ATTMode_3DVolume == true){ // ====================================== 3DVolume Mode with ATT-Correction - // Texture for ATT 3DVolume - cudaChannelFormatDesc texChannelDescAttenuationField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - //CUDA_CHECK(cudaGetLastError()); - - // 1. 3D Array fuer Texturmemory anlegen - #ifdef debug_OutputMemory - //printf( "CUDA: cudaMalloc3DArray: deviceSpeedOfSoundFieldCuArray [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, (pitchedIntermediateSoSFieldDevPtr.pitch * SOSGrid_XYZ.y * SOSGrid_XYZ.z)); - printf( "CUDA: cudaMalloc3DArray: deviceAttenuationFieldCuArray [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, ((int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2)))) * SOSGrid_XYZ.y * SOSGrid_XYZ.z) ); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory - cudaMalloc3DArray (& deviceAttenuationFieldCuArray, & texChannelDescAttenuationField, make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, SOSGrid_XYZ.z), 0); - //CUDA_CHECK(cudaGetLastError()); - - // 2. Daten kopieren - // Kopiere Daten von speedOfSoundField (Host) zu 3D-alloziertem Zwischenspeicher direkt ins gepitchte 3DArray vorbereiten - - #ifdef debug_OutputMemory - printf( "CUDA: cudaMemcpy3D: make_cudaPitchedPtr(speedOfSoundField) -> deviceSpeedOfSoundFieldCuArray; Extent [%i %i %i] )\n", SOSGrid_XYZ.x , SOSGrid_XYZ.y, SOSGrid_XYZ.z); - #endif - - copyParams.srcPtr = make_cudaPitchedPtr((void*)attenuationField, SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.x, SOSGrid_XYZ.y); - copyParams.srcPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.dstArray = deviceAttenuationFieldCuArray; // Ziel-Array - copyParams.dstPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.extent = make_cudaExtent(ATTGrid_XYZ.x, ATTGrid_XYZ.y, ATTGrid_XYZ.z); - copyParams.kind = cudaMemcpyHostToDevice; - - // Schritt 3.4 Daten von Zwischenspeicher in 3DArray fuer Texturmemory kopieren - CUDA_CHECK(cudaMemcpy3D(©Params)); - } - #endif - - #ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att) - - cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - // 1. Speicher auf Device allozieren - - // Zwischenspeicher im float2-Format auf dem Host anlegen und Sos und Att-Volumen reinkopieren - #ifdef debug_OutputMemory - printf( "HOST: create hostSosAttField and copy data from speedOfSoundField and attenuationField [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z * sizeof(float2) ); - #endif - hostSosAttField = (float2*)malloc(SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z * sizeof(float2)); // Auf dem Host ein Zwischenspeicher anlegen, fuer alle Elemente in float2 - - #ifdef debug_OutputInfo - printf("Copy Sos and Att-Volume into intermediate Buffer to float2-format\n"); - #endif - -// #ifdef debug_OutputVariables -// printf(" -> ------------------------------------------------------------------------\n"); -// printf(" -> hostSosAttField[1-3].x = [%f %f %f]\n", hostSosAttField[0].x, hostSosAttField[1].x, hostSosAttField[2].x); -// printf(" -> hostSosAttField[1-3].y = [%f %f %f]\n", hostSosAttField[0].y, hostSosAttField[1].y, hostSosAttField[2].y); -// printf(" -> ------------------------------------------------------------------------\n"); -// printf(" -> speedOfSoundField[1-3] = [%f %f %f]\n", speedOfSoundField[0], speedOfSoundField[1], speedOfSoundField[2]); -// printf(" -> attenuationField[1-3] = [%f %f %f]\n", attenuationField[0], attenuationField[1], attenuationField[2]); -// #endif - - // Copy Sos and Att-Volume into intermediate Buffer to float2-format - for (size_t i=0; i<(SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z); i++) { - - hostSosAttField[i].x = speedOfSoundField[i]; - - if (ATTMode_3DVolume == true){ // 3DVolume Mode with ATT-Correction - hostSosAttField[i].y = attenuationField[i]; - } - else{ - hostSosAttField[i].y = 0.0f; - } - } - -// #ifdef debug_OutputVariables -// printf(" -> ------------------------------------------------------------------------\n"); -// printf(" -> hostSosAttField[1-3].x = [%f %f %f]\n", hostSosAttField[0].x, hostSosAttField[1].x, hostSosAttField[2].x); -// printf(" -> hostSosAttField[1-3].y = [%f %f %f]\n", hostSosAttField[0].y, hostSosAttField[1].y, hostSosAttField[2].y); -// printf(" -> ------------------------------------------------------------------------\n"); -// #endif - - - - // 3D Array fuer SOS and ATT Volume Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - // Pitch besitzt eine Groesse von 2^n in X-Richtung - printf( "CUDA: cudaMalloc3DArray: deviceSosAttFieldCuArray [%ix%ix%i] : %i Bytes\n",SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, ((int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2)))) * SOSGrid_XYZ.y * SOSGrid_XYZ.z) ); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory - cudaMalloc3DArray (& deviceSosAttFieldCuArray, & texChannelDescSosAttField, make_cudaExtent(SOSGrid_XYZ.x * sizeof(float2), SOSGrid_XYZ.y, SOSGrid_XYZ.z), 0); - - // Kopiere Daten von float2-Zwischenspeicher in 3DArray vorbereiten - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMemcpy3D: make_cudaPitchedPtr(hostSosAttField) -> deviceSosAttFieldCuArray; Extent [%i %i %i] )\n", SOSGrid_XYZ.x , SOSGrid_XYZ.y, SOSGrid_XYZ.z); - #endif - //struct cudaMemcpy3DParms copyParams = {0}; - copyParams.srcPtr = make_cudaPitchedPtr((void*)hostSosAttField, SOSGrid_XYZ.x * sizeof(float2), SOSGrid_XYZ.x, SOSGrid_XYZ.y); - copyParams.srcPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.dstArray = deviceSosAttFieldCuArray; // Ziel-Array - copyParams.dstPos = make_cudaPos(0,0,0); // Ab welcher Speicherstelle - copyParams.extent = make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); - copyParams.kind = cudaMemcpyHostToDevice; - - // Schritt 3.4 Daten von Zwischenspeicher in 3DArray fuer Texturmemory kopieren - CUDA_CHECK(cudaMemcpy3D(©Params)); - - #ifdef debug_OutputMemory - printf( "HOST: free hostSosAttField\n"); - #endif - // Free intermediate Buffer for Sos and Att-Volume - free(hostSosAttField); - - #endif - - - #endif - } - - - - - //=========================================================================================================================== adjust Size of OutputWindow - Part 1 - #ifdef debug_OutputInfo - printf( "==> automatically adjust OutputWindow\n"); - printf( "Whole OutputSize (outputVolume_Bytes = %lld Bytes) have to be divided in z-layers\n", outputVolume_Bytes); // Speicherbedarf fuer alle Voxel im Volumen // aus saft.cpp - #endif - - // Feste Groeßssen einer Z-Layer und SoS-Z-Layer bestimmen - // ======================================================== - - std::size_t zLayerCount; // Anzahl Z-Layer zur Bestimmung der Anzahl der moeglichen Z-Layer für Outputgroesse - std::size_t zLayerSize; // Benoetigter Speicher für eine Z-Layer des Outputvolumens = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y * sizeof(double) Byte - std::size_t sosZLayerSize; // Benoetigter Speicher fuer alle S/E-Kombinationen fuer eine SoS-Z-Layer = SOSGrid_XYZ.x * SOSGrid_XYZ.y * (1413+628) * (4+1) Byte - - // Volumen Z-Layer - zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y; // Anzahl der X-Y-Voxel in einer Volumen-Layer. - zLayerSize = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y * sizeof(double); // Benoetigter Speicher für eine Z-Layer des Outputvolumens - - #ifdef debug_OutputVariables - printf( "zLayerVoxelCount = IMAGE_SIZE_XYZ.x(%i) * IMAGE_SIZE_XYZ.y(%i) = %i\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, zLayerVoxelCount); - printf( "zLayerSize = IMAGE_SIZE_XYZ.x(%i) * IMAGE_SIZE_XYZ.y(%i) * Double(8) = %i\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, zLayerSize); - #endif - - - // SoS Z-Layer - // Fuer Receiver muss wegen der Anzahl von 1413, für die jeweils 2 SOS-Z-Layer gespeichert werden muessen, - // zwei Arrays angelegt werden, da sie nicht mehr ins Texturmemory passen (2*1413 > 2048). - - //TODO: Das koennte berechnet statt vorgegeben werden. - maxSoSReceiverArrayForTexture = MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE; // 2*710=1420 koennten so gewahlt maximal, genutzt werden aber nur 1413 - - // Wie viele Receiverbloecke benoetigt man. Abhaenig von Maximaler Anzahl muss < 2048 sein - TableVoxelToReceiverPathSosAllocationCount = ceil((float)usedAmountOfReceiver/maxSoSReceiverArrayForTexture); - receiver_list_Size_deviceMemory = maxSoSReceiverArrayForTexture * TableVoxelToReceiverPathSosAllocationCount; // Auf vielfaches von 710 vergroesserter Speicher - - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - //printf( "CUDA: cudaMalloc3D: pitchedTableVoxelToReceiverPathSosSumDevPtr[%i] [%ix%ix%i]\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * receiver_list_Size)); - printf( "maxSoSReceiverArrayForTexture = %i (definiert vorgegeben)\n", maxSoSReceiverArrayForTexture); - printf( "TableVoxelToReceiverPathSosAllocationCount = (usedAmountOfReceiver(%4i)/maxSoSReceiverArrayForTexture(%4i)) = %i\n", usedAmountOfReceiver, maxSoSReceiverArrayForTexture, TableVoxelToReceiverPathSosAllocationCount); - printf( "receiver_list_Size_deviceMemory = maxSoSReceiverArrayForTexture(%4i)*TableVoxelToReceiverPathSosAllocationCount(%4i) = %i\n", maxSoSReceiverArrayForTexture, TableVoxelToReceiverPathSosAllocationCount, receiver_list_Size_deviceMemory); - printf( "sosZLayerVoxelCount = SOSGrid_XYZ.x (%i) * SOSGrid_XYZ.y(%i) = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, sosZLayerVoxelCount); // Zusaetzliche Ausgabe der VoxelAnzahl - #endif - - - // Benoetigter Speicher fuer alle S/E-Kombinationen fuer eine SoS-Z-Layer - sosZLayerVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. - // TODO: Aktuell wird nicht von usedAmountOfReceiver sondern von maxSoSReceiverArrayForTexture = 710 ausgegangen. Da normalerweise meist alle Receiver und einige Emitter im Durchlauf genutzt werden. - // sosZLayerSize = sosZLayerVoxelCount * (usedAmountOfEmitter + usedAmountOfReceiver) * (sizeof(float) + sizeof(float)); //SOSGrid_XYZ.x * SOSGrid_XYZ.y * (1413+628) * (4+4) Byte - sosZLayerSize = TableVoxelToReceiverPathSosAllocationCount * sosZLayerVoxelCount * (usedAmountOfEmitter + maxSoSReceiverArrayForTexture) * (4 * sizeof(float)); //SOSGrid_XYZ.x * SOSGrid_XYZ.y * (max628 + max710) * (4*4) Byte - - std::size_t maxZLayerCountInOneSoSZLayer = ceil(SOS_RESOLUTION / IMAGE_RESOLUTION)+1; // Maximale Anzahl an Z-Layern in einem SOS-Z-Layer +1 ist auch moeglich!? - std::size_t maxZLayerCountInOneSoSZLayerSize = maxZLayerCountInOneSoSZLayer * zLayerSize + sosZLayerSize; // Benoetigter Speicher fuer alle Z-Layer mit benoetigter SoS-Z-Layer innerhalb einer SOS-Z-Layer - - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( "sosZLayerVoxelCount = SOSGrid_XYZ.x(%i) * SOSGrid_XYZ.y(%i) = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, sosZLayerVoxelCount); - printf( "sosZLayerSize = sosZLayerVoxelCount * (usedAmountOfEmitter(%i) + usedAmountOfReceiver(%i)) * (4+4)Byte = %i Bytes\n", usedAmountOfEmitter, usedAmountOfReceiver, sosZLayerSize); - printf( "maxZLayerCountInOneSoSZLayer = ceil(SOS_RESOLUTION(%f)/ IMAGE_RESOLUTION(%f)) = %i\n", SOS_RESOLUTION, IMAGE_RESOLUTION, maxZLayerCountInOneSoSZLayer); - printf( "maxZLayerCountInOneSoSZLayerSize = maxZLayerCountInOneSoSZLayer(%i) * zLayerSize(%i) + sosZLayerSize(%i) = %i Bytes\n", sosZLayerVoxelCount, zLayerSize, sosZLayerSize, maxZLayerCountInOneSoSZLayerSize); - #endif - - - // Anpassung der Speichergroesen - // ======================================================== - //Der freie Speicherplatz muss angepasst werden damit Speicher fuer folgende Belegung zur Verfuegung steht: - //1. Die Speed of Sound-Volumen Daten fuer die SoS Vorberechungen (relativ klein) - //2. Die Ausgangsdaten von der SoS Vorverarbeitung (Emitter->Voxel)(etwas groeßer) - // Der Zweite ist problematisch da er nicht einfach wie 1tes abgezogen werden kann, denn seine Groesse ist nicht bekannt - // Seine Groesse haengt von der Anzahl z-Layer ab, die gewaehlt wird. - - std::size_t - totalMemory, - freeMemory; -// CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); // Freier Speicher auslesen und am Anfang ausgeben -// memoryCheck(); - - // Die festen Speichergrößen wurden schon allokiert und damit belegt. - // ==> d.h. nur noch SoS-Zlayer und Z-Layer müssen bestimmt und genutzt werden. - - // Neben der Beschraenkung der maximalen Grid und Blockgoesse muss auch der dem Speicherplatz mit betrachtet werden. - // Zuerst in Z-Layer in Bezug auf die Gridgroesse einteilen, da diese Beschraenkung schon automatisch zur Verkleinerung des benoetigten Speichers fuehrt - // Maximal koennen 67108864 Voxel(=Block(1024)*Grid(65536)) auf einmal berechnet werden. Das entspricht 67108864*8 = 536,871MB - // Damit kann die maximale Outputgroesse von 32-Bit nicht ueberschritten werden. ==> Gesamtgroesse wird automatisch in Z-layer eingeteilt. - // Die Grid-Dimensionen fuer die Durchfuehrung der Rekonstruktion wird so an XY angepasst, das nur noch eine bestimmte Anzahl von Z-Layer berechnet werden koennen. - - - windowGridDimensions.x = genericSAFTGridDimensions.x; - windowGridDimensions.y = genericSAFTGridDimensions.y; - //windowGridDimensions.z = (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z; // Aufrunden. Ueberpruefung, ob zu viel findet im Kernel statt - windowGridDimensions.z = genericSAFTGridDimensions.z; // Mueste auch ohne Aufrunden funktionieren, da genericSAFTGridDimensions.z genauso berechnet wurde - - #ifdef debug_OutputVariables - printf( "windowGridDimensions x,y,z: %i %i %i\n", windowGridDimensions.x, windowGridDimensions.y, windowGridDimensions.z); - #endif - - #ifdef debug_OutputInfo - printf( "Need to fit the Block size?"); - #endif - - // Anpassung 1 - // Ueberschreiten die GesamtGridDimensionen die maximal erlaubte Anzahl auf der GPU => dann Aufteilung in Z-Layer - // Wenn alles passt kann komplettes Volumen in einem GPU-Grid berechnet werden = (Grid = #Threads * #Bloecke (max. 1024*65536=2^26)) - if((windowGridDimensions.x * windowGridDimensions.y * windowGridDimensions.z) > 65536) //65536=max Anzahl Bloecke im Grid!!!! - { - #ifdef debug_OutputInfo - printf( " ==> Yes\n"); - #endif - - #ifdef debug_OutputVariables - printf( "windowGridDimensions X*Y*Z (%i) higher than 65536!!! => partialVolumeSize & zLayerCount bestimmen\n",(windowGridDimensions.x * windowGridDimensions.y * windowGridDimensions.z)); - #endif - - int zBlockCount = 65536/(windowGridDimensions.x * windowGridDimensions.y); // = Maximal moegliche Anzahl Z-Layer, die fuer eine XY-Resolution noch berechnet werden koennen, - // da sonst maximale Blockanzahl im Grid ueberschritten wuerde - #ifdef debug_OutputVariables - printf( "zBlockCount = 65536/(windowGridDimensions.x * windowGridDimensions.y) = %i \n",zBlockCount); - #endif - if (zBlockCount == 0) - printf("(zBlockCount == 0) && (windowGridDimensions > 65536) => try higher genericSAFTBlockDimensions if possible or lower XY-Resolution"); - - zLayerCount = zBlockCount * genericSAFTBlockDimensions.z; - partialVolumeSize = zLayerCount * zLayerSize; // partialVolumeSize wird an maximal parallel berechenbare Z-Layer angepasst. - #ifdef debug_OutputVariables - printf( "zLayerCount = zBlockCount * genericSAFTBlockDimensions.z = %i \n",zLayerCount); - printf( "partialVolumeSize = zLayerCount(%i) * zLayerSize(%i) = %i \n",zLayerCount, zLayerSize, partialVolumeSize); - printf( "partialVolumeSize = %u \n", partialVolumeSize); - #endif - } - else // komplettes Volumen kann in einem GPU-Grid berechnet werden - { - #ifdef debug_OutputInfo - printf( " ==> No\n"); - #endif - partialVolumeSize = outputVolume_Bytes; // TeilOutputgroesse = Outputgroesse für Volumen - zLayerCount = genericSAFTGridDimensions.z; - #ifdef debug_OutputVariables - printf( "partialVolumeSize = outputVolume_Bytes(%lld) \n", outputVolume_Bytes); - printf( "zLayerCount = genericSAFTGridDimensions.z = %i \n",zLayerCount); - #endif - } - - // Anpassung 2 - std::size_t requiredSosLayer; // Für die Anzahl der Z-Layer minimal benoetigte SoSZ-Layer die zu berechnen sind + 1 für Interpolation - zLayerCount = maxZLayerCountInOneSoSZLayer; // Maximale Z-Layer die berechnet werden kann ist # die in eine SOS Z-layer passt. - - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( "Determine SoS-ZLayer for preprocessSOSpaths: \n"); - printf( "================================================\n"); - #endif - - // Setzen der maximal moeglichen SOS-ZLayer! - switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) - { - case 0: // Mit Textur -> 1ne SOS-ZLayer - requiredSosLayer = ceil((float)zLayerCount/(float)maxZLayerCountInOneSoSZLayer) ; // Für die Anzahl der Z-Layer maximal benoetigte SoSZ-Layer die zu berechnen sind - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( " set: requiredSosLayer = ceil(zLayerCount(%u)/maxZLayerCountInOneSoSZLayer(%u)) = %u \n", zLayerCount, maxZLayerCountInOneSoSZLayer, requiredSosLayer); - #endif - break; - case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer - requiredSosLayer = ceil((float)zLayerCount/(float)maxZLayerCountInOneSoSZLayer) + 1; // Für die Anzahl der Z-Layer maximal benoetigte SoSZ-Layer die zu berechnen sind + 1 für Interpolation - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( " set: requiredSosLayer = ceil(zLayerCount(%u)/maxZLayerCountInOneSoSZLayer(%u)) +1 = %u \n", zLayerCount, maxZLayerCountInOneSoSZLayer, requiredSosLayer); - #endif - break; - } - - maxFeasibleSosZLayerCount = requiredSosLayer; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt. - #ifdef debug_OutputVariables - printf( "maxFeasibleSosZLayerCount = %u\n", maxFeasibleSosZLayerCount); - #endif - - // Setzen der maximal moeglichen ZLayer! - if (genericSAFTGridDimensions.z < maxZLayerCountInOneSoSZLayer){ - maxFeasibleZLayerCount = genericSAFTGridDimensions.z; // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( " set: maxFeasibleZLayerCount = genericSAFTGridDimensions.z = %u\n", genericSAFTGridDimensions.z); - #endif - } - else{ - maxFeasibleZLayerCount = maxZLayerCountInOneSoSZLayer; // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( " set: maxFeasibleZLayerCount = maxZLayerCountInOneSoSZLayer = %u\n", maxFeasibleZLayerCount); - #endif - } - - - size_t MatlabSavety = (MATLABSAVETY_MB * 1024 * 1024); // Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann. - partialVolumeSize = zLayerCount * zLayerSize + MatlabSavety; // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde - partialSosPathSize = requiredSosLayer * sosZLayerSize; // Speicher(SOSATTPaths ), der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde - partialAscanIndexSize = requiredSosLayer * TableAscanIndexAllocationCount * SOSGrid_XYZ.x * SOSGrid_XYZ.y * (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture) * 4*((int)(SOSMode_3DVolume) + (int)(ATTMode_3DVolume)); // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde - - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( " set: maxFeasibleZLayerCount = maxZLayerCountInOneSoSZLayer = %u\n", maxFeasibleZLayerCount); - printf( " set: maxFeasibleSosZLayerCount = requiredSosLayer = %u\n", maxFeasibleSosZLayerCount); - printf( " set: partialVolumeSize = zLayerCount (%3u) * zLayerSize (%12u Bytes) = %12u Bytes = %u MiB\n", zLayerCount, zLayerSize, partialVolumeSize, partialVolumeSize/1024/1024); - printf( " set: partialSosPathSize = requiredSosLayer(%3u) * sosZLayerSize(%12u Bytes) = %12u Bytes = %u MiB\n", requiredSosLayer, sosZLayerSize, partialSosPathSize, partialSosPathSize/1024/1024); - printf( " set: partialAscanIndexSize = requiredSosLayer(%3u) * ... = %12u Bytes = %u MiB\n", requiredSosLayer, partialAscanIndexSize, partialAscanIndexSize/1024/1024); - printf( " -> Free GPU Memory (%6lld MiB) - required Memory (%6lld MiB)\n", memoryGPUfree()/1024/1024, (partialVolumeSize + partialSosPathSize)/1024/1024); - #endif - - // Warning if free memory on GPU Device is too small for OutputVolume and SOSATT-Path Data - #ifdef debug_OutputAScanIndexMemoryDivision - printf("%s :\n", device.name); - printf(" Total memory %lld Bytes\n", memoryGPUtotal() ); - printf(" Free memory %lld Bytes\n", memoryGPUfree() ); - printf(" => Used memory %lld Bytes\n", (memoryGPUtotal()-memoryGPUfree())); - #endif - //memoryCheck(); // Freier Speicher am Anfang ausgeben - - if (memoryGPUfree() <= (partialVolumeSize + partialSosPathSize)){ - maxSupportedTexturesForAscanIndex = 1; - printf("Free GPU Memory (%lld Bytes) < required Memory (%lld Bytes)\n --> try to reduce amount of Receiver!\n", memoryGPUfree(), (partialVolumeSize + partialSosPathSize) ); - } - - //================================================================================================= Memory allocation on GPU Device for - Table SoS-Paths - #ifdef debug_OutputInfo - printf( "Allocate Memory for SOS-Path Calculations:\n"); - #endif - - // Aufteilung der SOSPfadTabelle in 3D Array (x, y, z * Emitter bzw. Receiver) - - //std::size_t requestedMemorySize = 0; - unsigned int requestedMemorySize = 0; - int MallocStep = 0; // Wenn Fehler nur die Speicherbereiche freigeben, die auch alloziert wurden - - // 1. Speicher auf Device allozieren - // Fuer Emitter-SOSPfade Pitched Memory und 3D Array anlegen - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - #ifdef SaftTextureForEmRecSosPathsTablesFloat1 - // SoSEmitterPathsSum mit Texturmemory ------------------------------------- - cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosSum = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - - // 1. Speicher auf Device allozieren - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - //printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmitterPathSosSumCuArray [%ix%ix%i]: %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * emitter_list_Size), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * emitter_list_Size) ); - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmitterPathSosSumCuArray [%ix%ix%i]: %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter) ); - #endif - - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToEmitterPathSosSumCuArray, - & texChannelDescTableVoxelToEmRecPathSosSum, - //make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * emitter_list_Size) ), - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - //0 - cudaArraySurfaceLoadStore - ) - ); - - CUDA_CHECK(cudaDeviceSynchronize()); - memoryCheck(); - - // SoSEmitterPathsCount mit Texturmemory ------------------------------------- - //cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathCount = cudaCreateChannelDesc(); // Schritt 2.1 Output-Kanal anlegen und beschreiben - cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathCount = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - - // 1. Speicher auf Device allozieren - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - //printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmitterPathCountCuArray [%ix%ix%i]: %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * emitter_list_Size), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * emitter_list_Size)); - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmitterPathCountCuArray [%ix%ix%i]: %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter)); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen f�r Texturmemory - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToEmitterPathCountCuArray, - & texChannelDescTableVoxelToEmRecPathCount, - //make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * emitter_list_Size) ), - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - //0 - cudaArraySurfaceLoadStore - ) - ); - #endif - #ifdef SaftTextureForEmRecSosPathsTablesFloat2 - // SoSEmitterPathsSum mit Texturmemory ------------------------------------- - cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - Float2 - - // 1. Speicher auf Device allozieren - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - //printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmitterPathSosSumCuArray [%ix%ix%i]: %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * emitter_list_Size), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * emitter_list_Size) ); - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmPathSosBothCuArray [%ix%ix%i] (Float2): %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float2))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter)); - #endif - #endif - #ifdef SaftTextureForEmRecSosPathsTablesFloat4 - // SoSEmitterPathsSum mit Texturmemory ------------------------------------- - cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - Float4 - - requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float4)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * usedAmountOfEmitter); - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToEmPathSosBothCuArray [%ix%ix%i] (Float4): %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), requestedMemorySize); - #endif - - // Warning if free memory on GPU Device is too small for requested Memory size - if (memoryGPUfree() <= requestedMemorySize){ - printf( "Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize ); - printf( "Not possible CUDA: cudaMalloc3DArray: deviceTableVoxelToEmPathSosBothCuArray [%ix%ix%i] (Float4): %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), requestedMemorySize); - printf( "Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); - Abort_ptr[0] = 1; - // mexErrMsgTxt(" "); - } - #endif - - if (Abort_ptr[0] == 0){ - #if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - // 3D Array fuer Texturmemory anlegen - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToEmPathSosBothCuArray, - & texChannelDescTableVoxelToEmRecPathSosBoth, - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter) ), // bei cudaMalloc3DArray wird width in Elementen angegeben! - //0 - cudaArraySurfaceLoadStore - ) - ); - MallocStep ++; - #endif - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - // Fuer Receiver-SOSPfade Pitched Memory und 3D Array anlegen - // Fuer Receiver benoetigt man zwei Texturen da 2*1413 > 2048 die maximal erlaubt sind. - // TableVoxelToReceiverPathSosAllocationCount = Anzahl der Teiltabellen an. - // maxSoSReceiverArrayForTexture = Anzahl der Einträge in einer Teiltabelle - // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher vorgehalten werden (1 oder 2 bei Interpolation) - - #ifdef SaftTextureForEmRecSosPathsTablesFloat1 - deviceTableVoxelToReceiverPathSosSumCuArray = new cudaArray * [TableVoxelToReceiverPathSosAllocationCount]; // Für Arbeiten mit Texturmemory - //deviceTableVoxelToReceiverPathSosSumCuArray = (cudaArray **)malloc( TableVoxelToReceiverPathSosAllocationCount * sizeof( cudaArray *)); - //deviceTableVoxelToReceiverPathSosSumCuArray = (cudaArray **)cudaMalloc( TableVoxelToReceiverPathSosAllocationCount * sizeof( cudaArray *)); // ??? TODO, geht das auch? --> Dann statt delete auch free als gegenüber von malloc - deviceTableVoxelToReceiverPathCountCuArray = new cudaArray * [TableVoxelToReceiverPathSosAllocationCount]; // Für Arbeiten mit Texturmemory - //deviceTableVoxelToReceiverPathCountCuArray = (cudaArray **)malloc( TableVoxelToReceiverPathSosAllocationCount * sizeof( cudaArray *)); - #endif - #if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - deviceTableVoxelToRecPathSosBothCuArray = new cudaArray * [TableVoxelToReceiverPathSosAllocationCount]; // Für Arbeiten mit Texturmemory - #endif - } - - for(int i = 0; i < TableVoxelToReceiverPathSosAllocationCount; i++) // Fuer Receiver benoetigt man zwei Texturen da 2 * 1413 > 2048 die maximal erlaubt sind. - { - - #ifdef SaftTextureForEmRecSosPathsTablesFloat1 - // SoSReceiverPathsSum mit Texturmemory ------------------------------------- - - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - //printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToReceiverPathSosSumCuArray[%i] [%ix%ix%i]\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * receiver_list_Size)); - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToReceiverPathSosSumCuArray[%i] [%ix%ix%i]: %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2)))) * SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture)); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen f�r Texturmemory - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToReceiverPathSosSumCuArray[i], - & texChannelDescTableVoxelToEmRecPathSosSum, - //make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * receiver_list_Size) ), - //make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture) ), // Immer ein vielfaches der Größe anlegen - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - //0 - cudaArraySurfaceLoadStore - ) - ); - //CUDA_CHECK(cudaDeviceSynchronize()); - //memoryCheck(); - // SoSReceiverPathsCount mit Texturmemory ------------------------------------- - - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToReceiverPathCountCuArray[%i] [%ix%ix%i]: %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float))/log2(2))))*SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture)); - #endif - // Schritt 3.2 SpeicherArray3D auf Device anlegen f�r Texturmemory - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToReceiverPathCountCuArray[i], - & texChannelDescTableVoxelToEmRecPathCount, - //make_cudaExtent(SOSGrid_XYZ.x * sizeof(float), SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture) ), - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - //0 - cudaArraySurfaceLoadStore - ) - ); - //CUDA_CHECK(cudaDeviceSynchronize()); - //memoryCheck(); - #endif - #ifdef SaftTextureForEmRecSosPathsTablesFloat2 - // SoSReceiverPathsBoth mit Texturmemory ------------------------------------- - - // 3D Array fuer Texturmemory anlegen - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToRecPathSosBothCuArray[%i] [%ix%ix%i] (Float2): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), (int) pow(2, ( ceil(log2(SOSGrid_XYZ.x * sizeof(float2))/log2(2)))) * SOSGrid_XYZ.y* (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture)); - #endif - //CUDA_CHECK(cudaDeviceSynchronize()); - //memoryCheck(); - #endif - #ifdef SaftTextureForEmRecSosPathsTablesFloat4 - // SoSReceiverPathsBoth mit Texturmemory ------------------------------------- - - // 3D Array fuer Texturmemory anlegen - requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float4)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture); - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTableVoxelToRecPathSosBothCuArray[%i] [%ix%ix%i] (Float4): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), requestedMemorySize); - #endif - - // Warning if free memory on GPU Device is too small for requested Memory size - if (memoryGPUfree() <= requestedMemorySize){ - printf( "Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize ); - printf( "Not possible CUDA: cudaMalloc3DArray: deviceTableVoxelToRecPathSosBothCuArray[%i] [%ix%ix%i] (Float4): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), requestedMemorySize); - printf( "Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); - Abort_ptr[0] = 1; - // mexErrMsgTxt(" "); - } - #endif - - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - if (Abort_ptr[0] == 0){ - #if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTableVoxelToRecPathSosBothCuArray[i], - & texChannelDescTableVoxelToEmRecPathSosBoth, - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - //0 - cudaArraySurfaceLoadStore - ) - ); - MallocStep ++; - #endif - } - } - - - // Schritt 2.1 Output-Kanal anlegen und beschreiben - - cudaChannelFormatDesc texChannelDescTableAscanIndexFloat; - if (ATTMode_3DVolume == false){ // ========= 3DVolume Mode without ATT-Correction - texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); - } - else if (ATTMode_3DVolume == true){ // ========= 3DVolume Mode with ATT-Correction - texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - } - - // Pitched Memory und 3D Array anlegen fuer Ascan-Index - - // Für Ascan-Index benoetigt man mehrere Texturen fuer jeweils 2 Z-Layer. - // 2*N < maxSurfaceTexture3DDimension(Fermi&Kepler: 2048) ==> (1024 Em/Rec - Kombinationen) - // maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist - // TableAscanIndexAllocationCount = Anzahl der Teiltabellen ==> auch Anzahl der benoetigten Durchlaeufe - // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante) - // maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle - // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen - - deviceTextureAscanIndexFloatCuArray = new cudaArray * [TableAscanIndexAllocationCount]; // Für Arbeiten mit Texturmemory - - for(int i = 0; i < TableAscanIndexAllocationCount; i++) // Für AscanIndex benoetigt man n Texturen da maximal 2048 Z-Layer erlaubt sind. - { - - // Ascan-IndexPathsSum mit Texturmemory ------------------------------------- - // Schritt 3.2 Surface-SpeicherArray3D auf Device anlegen fuer Texturmemory - - // 3D Array fuer Texturmemory anlegen - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - if (ATTMode_3DVolume == false){ // ========= 3DVolume Mode without ATT-Correction - requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture); - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float): %u Bytes\n", - i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), - requestedMemorySize ); - #endif - - // Warning if free memory on GPU Device is too small for requested Memory size - if (memoryGPUfree() <= requestedMemorySize){ - printf( "Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize ); - printf( "Not possible CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), requestedMemorySize ); - printf( "Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); - Abort_ptr[0] = 1; - // mexErrMsgTxt(" "); - } - - if (Abort_ptr[0] == 0){ - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTextureAscanIndexFloatCuArray[i], - & texChannelDescTableAscanIndexFloat, - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - cudaArraySurfaceLoadStore - ) - ); - MallocStep ++; - } - } - else if (ATTMode_3DVolume == true){ // ========= 3DVolume Mode with ATT-Correction - requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float2)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture); - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float2): %u Bytes\n", - i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), - requestedMemorySize); - #endif - - // Warning if free memory on GPU Device is too small for requested Memory size - if (memoryGPUfree() <= requestedMemorySize){ - printf( "Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize ); - printf( "Not possible CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float2): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), requestedMemorySize); - printf( "Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); - Abort_ptr[0] = 1; - // mexErrMsgTxt(" "); - } - - if (Abort_ptr[0] == 0){ - CUDA_CHECK( - cudaMalloc3DArray ( - & deviceTextureAscanIndexFloatCuArray[i], - & texChannelDescTableAscanIndexFloat, - make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture) ), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! - cudaArraySurfaceLoadStore - ) - ); - MallocStep ++; - } - } - - - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - } - - - // Set SOSPathMemory to Zero as Initialisation, if not used for useTwoLoops there can be data, from calculations before --> Wrong calculations - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - struct timeval startfillCuArray, stopfillCuArray; - gettimeofday(&startfillCuArray, NULL); - #endif - - if (Abort_ptr[0] == 0){ - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: fillCuArray: set deviceTextureAscanIndexFloatCuArray = 0\n"); - #endif - fillCuArray((float)0.0, deviceTextureAscanIndexFloatCuArray, TableAscanIndexAllocationCount); - } - - #ifdef debug_OutputStepsPerformance - CUDA_CHECK(cudaDeviceSynchronize()); - gettimeofday(&stopfillCuArray, NULL); - diff_time = (double)((stopfillCuArray.tv_sec * 1000000.0 + stopfillCuArray.tv_usec) - (startfillCuArray.tv_sec * 1000000.0 + startfillCuArray.tv_usec)); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### fillCuArray Surfaces (TableAscanIndexAllocationCount=%i) = %8.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, TableAscanIndexAllocationCount, diff_time); - #endif - - //CUDA_CHECK(cudaDeviceSynchronize()); - //memoryCheck(); - - - //=========================================================================================================================== adjust Size of OutputWindow - Part 2 - CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); // Freier Speicher auslesen - //memoryCheck(); - - #if defined(debug_OutputVariables) || defined(debug_OutputLookUpGeometryMemoryList) - printf( "\nIs complete Outputvolume small enough for Memory?\n"); - printf( " ??? (partialVolumeSize(%u) > freeMemory (%u) = %i\n", partialVolumeSize, freeMemory, ((partialVolumeSize) > freeMemory) ); - #endif - - - if ((partialVolumeSize) > freeMemory) - { - #ifdef debug_OutputVariables - printf( " ==> No. Adjust the output window size as there is not enough device memory left\n"); - printf( " (partialVolumeSize(%u)) > freeMemory (%u)\n", partialVolumeSize, freeMemory); - #endif - - //Brute-force the greatest possible number of z-layers for the partial output window and the speed of sound data window. - bool success = false; - - for(zLayerCount; zLayerCount > 0; zLayerCount--) //Anpassungsschritt 2: zLayerCount = (freeMemory - noch benoetigtem Speicher)/ (zLayerSize * sizeof(double)) - { - std::size_t allocationSize = 0; - partialVolumeSize = zLayerCount * zLayerSize + MatlabSavety; // Speicher, der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde - - allocationSize += partialVolumeSize; // Insgesamt benoetigter Volumen- und SoS-Pfadspeicher - - if(allocationSize <= freeMemory) //allocationSize - { - #ifdef debug_OutputVariables - printf( "allocationSize <= freeMemory bei\n"); - printf( " partialVolumeSize = zLayerCount(%i) * zLayerSize(%i) = %u\n", zLayerCount, zLayerSize, partialVolumeSize); - #endif - - maxFeasibleZLayerCount = zLayerCount; // Die Anzahl an Layer nutzen für Rekonstruktion, die gerade nochin den Speicher passt - partialVolumeSize = maxFeasibleZLayerCount * zLayerSize; // Speicher, der fuer die entsprechende Anzahl an Z-Layern benötigt wuerde - - #ifdef debug_OutputVariables - printf( " => maxFeasibleZLayerCount = %u\n", maxFeasibleZLayerCount); - printf( " => partialVolumeSize = maxFeasibleZLayerCount(%i) * zLayerSize(%i) = %u\n", maxFeasibleZLayerCount, zLayerSize, partialVolumeSize); - #endif - - success = true; - break; // Sobald die Anzahl an Z-Layern und SoS-Z-Layern passt Schleife beenden - } - #ifdef debug_OutputVariables - else - printf( " => zLayerCount = %u\n", zLayerCount); - #endif - } - - if(!success) - { - printf("Not enough free device memory available for one Z-Layer(%i) from outputVolume_Bytes(%u) and one Sos-Z-Layer(%u) to perform reconstruction\n", zLayerSize, outputVolume_Bytes, sosZLayerSize); - printf("Use a smaller Volume resolution for X-Y-Direction !!!!!!\n"); - Abort_ptr[0] = 1; - // mexErrMsgTxt(" "); - } - else - { - #ifdef debug_OutputInfo - printf( "Automatically reduced the partialVolumeSize size to %u bytes\n", partialVolumeSize); - printf( " -> maxFeasibleZLayerCount = %u\n", maxFeasibleZLayerCount); - #endif - #ifdef debug_OutputVariables - printf( "allocationSize <= freeMemory\n"); - printf( " -> partialVolumeSize = maxFeasibleZLayerCount(%i) * zLayerSize(%i) = %u\n", maxFeasibleZLayerCount, zLayerSize, partialVolumeSize);; - #endif - } - } - else // Alles passt in Speicher und benoetigt keine Anpassung - { - #ifdef debug_OutputInfo - printf( " ==> Yes. No need to adjust the output window size as there is enough device memory left\n"); - #endif - #ifdef debug_OutputVariables - printf( " -> maxFeasibleZLayerCount = %u\n", maxFeasibleZLayerCount); - printf( "partialVolumeSize(%u Bytes) = (%u MiB) <= freeMemory (%u) = (%u MiB)\n", partialVolumeSize, partialVolumeSize/1024/1024, freeMemory, freeMemory/1024/1024); - #endif - } - - - //================================================================================================= Memory allocation on GPU Device for - Volume-Output-Data - if (Abort_ptr[0] == 0){ - #ifdef debug_OutputMemory - printf( "partialVolumeSize = maxFeasibleZLayerCount(%i) * zLayerSize(%i) = %lld Bytes = %u MiB\n", maxFeasibleZLayerCount, zLayerSize, partialVolumeSize, partialVolumeSize/1024/1024); - printf( "CUDA: cudaMalloc(deviceOutput) der Groesse partialVolumeSize(%i Bytes) fuer maxFeasibleZLayerCount(%i) Z-Layer in maxFeasibleSosZLayerCount(%i) SOS-Z-Layern\n", partialVolumeSize, maxFeasibleZLayerCount, maxFeasibleSosZLayerCount); - #endif - CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceOutput), partialVolumeSize)); - } - // Check free memory after memory allocation - // CUDA_CHECK(cudaDeviceSynchronize()); - // memoryCheck(); - - //================================================================================================= Output Mem Alloc Time - #ifdef debug_OutputStepsPerformance - gettimeofday(&stopMalloc, NULL); - diff_time = (double)((stopMalloc.tv_sec * 1000000.0 + stopMalloc.tv_usec) - (startMalloc.tv_sec * 1000000.0 + startMalloc.tv_usec)); - printf ("########################################################################\n"); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Memory Allocation & Init = %6.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - printf ("########################################################################\n"); - #endif - - //===================================================================================================================================== performCoreReconstruction - //===================================================================================================================================== - - if (Abort_ptr[0] == 0){ - - #ifdef debug_OutputInfo - printf( "============================================= Call performCoreReconstruction()\n"); - #endif - auto startPerformCoreReconstruction = std::chrono::steady_clock::now(); - - performCoreReconstruction(); // Kernelaufruf //////////////////////////////////////////////////////////////////////////////////////// - - auto stopPerformCoreReconstruction = std::chrono::steady_clock::now(); - diff_time = std::chrono::duration_cast(stopPerformCoreReconstruction - startPerformCoreReconstruction).count(); // total duration in µs - #ifdef debug_OutputStepsPerformance - printf ("########################################################################\n"); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### PerformCoreReconstruction = duration: %8.0f µs \n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - printf ("########################################################################\n"); - #endif - - duration = (ullong)diff_time; - - } - //======================================================================================================================================= - //======================================================================================================================================= - - - //============================================================================================================================= Free / Clean-up GPU Device Memory - #ifdef debug_OutputStepsPerformance - struct timeval startMfree, stopMfree; - gettimeofday(&startMfree, NULL); - #endif - - #ifdef debug_OutputInfo - printf( "Clean-up - Free GPU-Memory\n"); - #endif - - if (SOSMode_3DVolume == true) // ===================================== 3DVolume Mode with SoS-Correction Z.506 - { - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "Abort = %i; MallocStep = %i \n", Abort_ptr[0], MallocStep); - #endif - - // Free allocated memory for SOS path calculation - #ifdef SaftTextureForEmRecSosPathsTablesFloat1 - // Free memory for 3DInterpolation with TextureMemory - #ifdef debug_OutputMemory - printf( "CUDA: cudaFreeArray: deviceTableVoxelToEmitterPathSosSumCuArray\n"); - printf( "CUDA: cudaFreeArray: deviceTableVoxelToEmitterPathCountCuArray\n"); - #endif - - // CUDA Arrays for Textur for SOS paths - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToEmitterPathSosSumCuArray)); //Emitter SoSSum Array - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToEmitterPathCountCuArray)); //Emitter Count Array - - // CUDA Arrays for Textur for SOS paths - for(int i = 0; i < TableVoxelToReceiverPathSosAllocationCount; i++){ - #ifdef debug_OutputMemory - printf( "CUDA: cudaFreeArray: deviceTableVoxelToReceiverPathSosSumCuArray[%i]\n",i); - printf( "CUDA: cudaFreeArray: deviceTableVoxelToReceiverPathCountCuArray[%i]\n",i); - #endif - - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToReceiverPathSosSumCuArray[i])); //Receiver SoSSum - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToReceiverPathCountCuArray[i])); //Receiver Count - } - - delete [] deviceTableVoxelToReceiverPathSosSumCuArray; // Array der CuArrays deleten, gegenüber vom New - delete [] deviceTableVoxelToReceiverPathCountCuArray; - - #endif - #if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - // Free memory for 3DInterpolation with TextureMemory - // CUDA Arrays for Textur for SOS paths - if (MallocStep > 0){ - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFreeArray: deviceTableVoxelToEmPathSosBothCuArray\n"); - #endif - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToEmPathSosBothCuArray)); //Emitter SoSSum and Count Array - } - - for(int i = 0; i < TableVoxelToReceiverPathSosAllocationCount; i++){ - if (MallocStep > (1+i)){ - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFreeArray: deviceTableVoxelToRecPathSosBothCuArray[%i]\n",i); - #endif - CUDA_CHECK(cudaFreeArray(deviceTableVoxelToRecPathSosBothCuArray[i])); //Receiver SoSSum and Count Array - } - } - - delete [] deviceTableVoxelToRecPathSosBothCuArray; // delete array of CuArrays - #endif - - #ifndef SaftTextureForBresenhamSosPaths - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFree: deviceSpeedOfSoundField\n"); - #endif - CUDA_CHECK(cudaFree(deviceSpeedOfSoundField)); //free deviceSpeedOfSoundField - - #else - - #ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFreeArray: deviceSpeedOfSoundFieldCuArray\n"); - #endif - CUDA_CHECK(cudaFreeArray(deviceSpeedOfSoundFieldCuArray)); //free deviceSpeedOfSoundFieldCuArray - #endif - - #ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att) - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFreeArray: deviceSosAttFieldCuArray\n"); - #endif - CUDA_CHECK(cudaFreeArray(deviceSosAttFieldCuArray)); //free deviceSpeedOfSoundFieldCuArray - #endif - #endif - - - // Free memory for 3DInterpolation with TextureMemory - // CUDA Arrays for AscanIndex-Textur for paths - for(int i = 0; i < TableAscanIndexAllocationCount; i++){ - if (MallocStep > (1+TableVoxelToReceiverPathSosAllocationCount+i)){ - #if defined(debug_OutputMemory) || defined(debug_OutputSOSPaths) - printf( "CUDA: cudaFreeArray: deviceTextureAscanIndexFloatCuArray[%i]\n",i); - #endif - CUDA_CHECK(cudaFreeArray(deviceTextureAscanIndexFloatCuArray[i])); //free deviceTextureAscanIndexFloatCuArray - } - } - - delete [] deviceTextureAscanIndexFloatCuArray; // delete array of CuArrays - - } - - - if (Abort_ptr[0] == 0){ - #ifdef debug_OutputMemory - printf( "CUDA: cudaFree: deviceOutput\n"); - #endif - CUDA_CHECK(cudaFree(deviceOutput)); //free deviceOutput - } - - #ifdef debug_OutputMemory - printf( "CUDA: cudaFree: deviceEmitterIndex_block\n"); - printf( "CUDA: cudaFree: deviceReceiverIndex_block\n"); - #endif - CUDA_CHECK(cudaFree(deviceEmitterIndex_block)); //free deviceEmitterIndex_block - CUDA_CHECK(cudaFree(deviceReceiverIndex_block)); //free deviceReceiverIndex_block - - - #ifdef debug_OutputMemory - printf( "CUDA: cudaFree: deviceSAFT_VARIANT\n"); - #endif - CUDA_CHECK(cudaFree(deviceSAFT_VARIANT)); //free deviceSAFT_VARIANT - - - for(std::size_t i = 0; i < aScanAllocationCount; i++) - { - #ifdef debug_OutputMemory - printf( "CUDA: cudaFreeArray: deviceAScansCuArray[%i]\n",i); - #endif - CUDA_CHECK(cudaFreeArray(deviceAScansCuArray[i])); //free deviceAScansCuArray - } - delete [] deviceAScansCuArray; // auch Felder wieder freigeben - - - #ifndef SaftUseConstantMemforGeometry - #ifdef debug_OutputMemory - printf( "CUDA: cudaFree: deviceListEmitterGeometry\n"); - printf( "CUDA: cudaFree: deviceListReceiverGeometry\n"); - #endif - CUDA_CHECK(cudaFree(deviceListEmitterGeometry)); //free deviceListEmitterGeometry - CUDA_CHECK(cudaFree(deviceListReceiverGeometry)); //free deviceListReceiverGeometry - #endif - - #ifdef debug_OutputStepsPerformance - gettimeofday(&stopMfree, NULL); - diff_time = (double)((stopMfree.tv_sec * 1000000.0 + stopMfree.tv_usec) - (startMfree.tv_sec * 1000000.0 + startMfree.tv_usec)); - printf ("########################################################################\n"); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Free Memory = %4.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - printf ("########################################################################\n"); - #endif - - - // Free Memory of Host-Buffer for GPU-Memory - #ifdef debug_OutputInfo - printf( "Clean-up - Free Host-Memory Buffer for GPU-Memory\n"); - #endif - free(hostLookUpGeometryMemoryListEmitterPtr); - free(hostLookUpGeometryMemoryListReceiverPtr); - -// if (Abort_ptr[0] != 0){ -// printf( "Abort Reconstruction: Not enough Memory !!!!!!!\n"); -// mexErrMsgTxt(" "); -// } - - #ifdef debug_OutputFunctions - printf( "<== SAFTHandler::processAScans - End\n"); - #endif - + // ==== Coordinates / Geometry - Blockdata + int deviceListEmitterGeometrySize = emitter_list_Size * sizeof(float3); // Size of ListEmitterGeometry + int deviceListReceiverGeometrySize = receiver_list_Size * sizeof(float3); // Size of ListReceiverGeometry + + CUDA_CHECK(cudaMemcpyToSymbol(constEmitterPtr, emitter_list, deviceListEmitterGeometrySize, 0, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyToSymbol(constReceiverPtr, receiver_list, deviceListReceiverGeometrySize, 0, cudaMemcpyHostToDevice)); + + //============================================================================= Memory allocation on GPU Device for - LookUpGeometryMemoryList for Emitter and Receiver + + // Steps: + // 1. Determine amount of used Emitter and Receiver --> usedAmountOfEmitter, usedAmountOfReceiver -> Done + // 2. Allocate Memory on Host for LookUpGeometryMemoryList for Emitter and Receiver -> Done + // 3. Fill LookUpGeometryMemoryList with MemoryPositionIndex for every emitter and Receiver -> Done + // Copy lookUpGeometryMemoryListEmitter, lookUpGeometryMemoryListReceiver to Constant Memory of Device + + // 4. Allocate SOSPathLists, depending on usedAmountOfEmitter, usedAmountOfReceiver -> Done + // 5. Perform PreprocessSOSPaths only with used Emitter and Receiver -> Done + // 6. Inside SAFT-Kernel first determine MemoryPositionIndex of current Emitter and Receivers then access Data with tex3D. -> Done + + // 1. Determine amount of used Emitter and Receiver --> usedAmountOfEmitter, usedAmountOfReceiver + usedAmountOfEmitter = 0; // amount of used emitter + usedAmountOfReceiver = 0; // amount of used receiver + + for (int i = 0; i < emitter_list_Size; i++) + { + if (emitter_list[i].x != 255) + usedAmountOfEmitter++; + } + for (int i = 0; i < receiver_list_Size; i++) + { + if (receiver_list[i].x != 255) + usedAmountOfReceiver++; + } + + // 2. Allocate Memory on Host for LookUpGeometryMemoryList for Emitter and Receiver + // create array + + // NEW 2019, trying to get rid of USCT II hard coded limitation + int maxEmitterReceiverListSize = MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY; // 2340 --> max. num of entries for 64kB constant memory, has to in alignment with constantMemory.cu + hostLookUpGeometryMemoryListEmitterPtr = (unsigned short *)malloc(maxEmitterReceiverListSize * sizeof(unsigned short)); + hostLookUpGeometryMemoryListReceiverPtr = (unsigned short *)malloc(maxEmitterReceiverListSize * sizeof(unsigned short)); + + // hostLookUpGeometryMemoryListEmitterPtr = (unsigned short*)mxMalloc(157 * 4 * sizeof(unsigned short)); + // hostLookUpGeometryMemoryListReceiverPtr = (unsigned short*)mxMalloc(157 * 9 * sizeof(unsigned short)); + + // 3. Fill LookUpGeometryMemoryList with MemoryPositionIndex for every emitter and Receiver + // Run over all emitter and fill constLookUpGeometryMemoryListEmitterPtr + int lookUpEmitterIndex = 0; + int lookUpReceiverIndex = 0; + for (int i = 0; i < emitter_list_Size; i++) + { + if (emitter_list[i].x != 255) + { // Wenn nicht leer dann an naechste Stelle einsortieren + hostLookUpGeometryMemoryListEmitterPtr[i] = (unsigned short)lookUpEmitterIndex; + lookUpEmitterIndex++; + } + else + { + hostLookUpGeometryMemoryListEmitterPtr[i] = (unsigned short)65535; + } + } + for (int i = 0; i < receiver_list_Size; i++) + { + if (receiver_list[i].x != 255) + { // Wenn nicht leer dann an naechste Stelle einsortieren + hostLookUpGeometryMemoryListReceiverPtr[i] = (unsigned short)lookUpReceiverIndex; + lookUpReceiverIndex++; + } + else + { + hostLookUpGeometryMemoryListReceiverPtr[i] = (unsigned short)65535; + } + } + + // Copy lookUpGeometryMemoryListEmitter, lookUpGeometryMemoryListReceiver to Constant Memory of Device + lookUpGeometryMemoryListEmitterSize = emitter_list_Size * sizeof(unsigned short); // Size of ListEmitterGeometry + lookUpGeometryMemoryListReceiverSize = receiver_list_Size * sizeof(unsigned short); // Size of ListReceiverGeometry + + CUDA_CHECK(cudaMemcpyToSymbol(constLookUpGeometryMemoryListEmitterPtr, hostLookUpGeometryMemoryListEmitterPtr, lookUpGeometryMemoryListEmitterSize, 0, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyToSymbol(constLookUpGeometryMemoryListReceiverPtr, hostLookUpGeometryMemoryListReceiverPtr, lookUpGeometryMemoryListReceiverSize, 0, cudaMemcpyHostToDevice)); + + //======================================================================================================= Memory allocation on GPU Device for - SOS 3DVolume-Data + + if (SOSMode_3DVolume == true) // ====================================== 3DVolume Mode with SOS-Correction + struct cudaMemcpy3DParms copyParams = {0}; + cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben + + hostSosAttField = (float2 *)malloc(SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z * sizeof(float2)); // Auf dem Host ein Zwischenspeicher anlegen, fuer alle Elemente in float2 + + // Copy Sos and Att-Volume into intermediate Buffer to float2-format + for (size_t i = 0; i < (SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z); i++) + { + hostSosAttField[i].x = speedOfSoundField[i]; + + if (ATTMode_3DVolume == true) + { // 3DVolume Mode with ATT-Correction + hostSosAttField[i].y = attenuationField[i]; + } + else + { + hostSosAttField[i].y = 0.0f; + } + } + + // 3D Array fuer SOS and ATT Volume Texturmemory anlegen + // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory + cudaMalloc3DArray(&deviceSosAttFieldCuArray, &texChannelDescSosAttField, make_cudaExtent(SOSGrid_XYZ.x * sizeof(float2), SOSGrid_XYZ.y, SOSGrid_XYZ.z), 0); + + // struct cudaMemcpy3DParms copyParams = {0}; + copyParams.srcPtr = make_cudaPitchedPtr((void *)hostSosAttField, SOSGrid_XYZ.x * sizeof(float2), SOSGrid_XYZ.x, SOSGrid_XYZ.y); + copyParams.srcPos = make_cudaPos(0, 0, 0); // Ab welcher Speicherstelle + copyParams.dstArray = deviceSosAttFieldCuArray; // Ziel-Array + copyParams.dstPos = make_cudaPos(0, 0, 0); // Ab welcher Speicherstelle + copyParams.extent = make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); + copyParams.kind = cudaMemcpyHostToDevice; + + // Schritt 3.4 Daten von Zwischenspeicher in 3DArray fuer Texturmemory kopieren + CUDA_CHECK(cudaMemcpy3D(©Params)); + // Free intermediate Buffer for Sos and Att-Volume + free(hostSosAttField); + } + + // Feste Groeßssen einer Z-Layer und SoS-Z-Layer bestimmen + // ======================================================== + + std::size_t zLayerCount; // Anzahl Z-Layer zur Bestimmung der Anzahl der moeglichen Z-Layer für Outputgroesse + std::size_t zLayerSize; // Benoetigter Speicher für eine Z-Layer des Outputvolumens = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y * sizeof(double) Byte + std::size_t sosZLayerSize; // Benoetigter Speicher fuer alle S/E-Kombinationen fuer eine SoS-Z-Layer = SOSGrid_XYZ.x * SOSGrid_XYZ.y * (1413+628) * (4+1) Byte + + // Volumen Z-Layer + zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y; // Anzahl der X-Y-Voxel in einer Volumen-Layer. + zLayerSize = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y * sizeof(double); // Benoetigter Speicher für eine Z-Layer des Outputvolumens + + // SoS Z-Layer + // Fuer Receiver muss wegen der Anzahl von 1413, für die jeweils 2 SOS-Z-Layer gespeichert werden muessen, + // zwei Arrays angelegt werden, da sie nicht mehr ins Texturmemory passen (2*1413 > 2048). + + // TODO: Das koennte berechnet statt vorgegeben werden. + maxSoSReceiverArrayForTexture = MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE; // 2*710=1420 koennten so gewahlt maximal, genutzt werden aber nur 1413 + + // Wie viele Receiverbloecke benoetigt man. Abhaenig von Maximaler Anzahl muss < 2048 sein + TableVoxelToReceiverPathSosAllocationCount = ceil((float)usedAmountOfReceiver / maxSoSReceiverArrayForTexture); + receiver_list_Size_deviceMemory = maxSoSReceiverArrayForTexture * TableVoxelToReceiverPathSosAllocationCount; // Auf vielfaches von 710 vergroesserter Speicher + + // Benoetigter Speicher fuer alle S/E-Kombinationen fuer eine SoS-Z-Layer + sosZLayerVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. + // TODO: Aktuell wird nicht von usedAmountOfReceiver sondern von maxSoSReceiverArrayForTexture = 710 ausgegangen. Da normalerweise meist alle Receiver und einige Emitter im Durchlauf genutzt + // werden. sosZLayerSize = sosZLayerVoxelCount * (usedAmountOfEmitter + usedAmountOfReceiver) * (sizeof(float) + sizeof(float)); //SOSGrid_XYZ.x * SOSGrid_XYZ.y * (1413+628) * (4+4) Byte + sosZLayerSize = TableVoxelToReceiverPathSosAllocationCount * sosZLayerVoxelCount * (usedAmountOfEmitter + maxSoSReceiverArrayForTexture) * + (4 * sizeof(float)); // SOSGrid_XYZ.x * SOSGrid_XYZ.y * (max628 + max710) * (4*4) Byte + + std::size_t maxZLayerCountInOneSoSZLayer = ceil(SOS_RESOLUTION / IMAGE_RESOLUTION) + 1; // Maximale Anzahl an Z-Layern in einem SOS-Z-Layer +1 ist auch moeglich!? + std::size_t maxZLayerCountInOneSoSZLayerSize = + maxZLayerCountInOneSoSZLayer * zLayerSize + sosZLayerSize; // Benoetigter Speicher fuer alle Z-Layer mit benoetigter SoS-Z-Layer innerhalb einer SOS-Z-Layer + + // Anpassung der Speichergroesen + // ======================================================== + // Der freie Speicherplatz muss angepasst werden damit Speicher fuer folgende Belegung zur Verfuegung steht: + // 1. Die Speed of Sound-Volumen Daten fuer die SoS Vorberechungen (relativ klein) + // 2. Die Ausgangsdaten von der SoS Vorverarbeitung (Emitter->Voxel)(etwas groeßer) + // Der Zweite ist problematisch da er nicht einfach wie 1tes abgezogen werden kann, denn seine Groesse ist nicht bekannt + // Seine Groesse haengt von der Anzahl z-Layer ab, die gewaehlt wird. + + std::size_t totalMemory, freeMemory; + // CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); // Freier Speicher auslesen und am Anfang ausgeben + // memoryCheck(); + + // Die festen Speichergrößen wurden schon allokiert und damit belegt. + // ==> d.h. nur noch SoS-Zlayer und Z-Layer müssen bestimmt und genutzt werden. + + // Neben der Beschraenkung der maximalen Grid und Blockgoesse muss auch der dem Speicherplatz mit betrachtet werden. + // Zuerst in Z-Layer in Bezug auf die Gridgroesse einteilen, da diese Beschraenkung schon automatisch zur Verkleinerung des benoetigten Speichers fuehrt + // Maximal koennen 67108864 Voxel(=Block(1024)*Grid(65536)) auf einmal berechnet werden. Das entspricht 67108864*8 = 536,871MB + // Damit kann die maximale Outputgroesse von 32-Bit nicht ueberschritten werden. ==> Gesamtgroesse wird automatisch in Z-layer eingeteilt. + // Die Grid-Dimensionen fuer die Durchfuehrung der Rekonstruktion wird so an XY angepasst, das nur noch eine bestimmte Anzahl von Z-Layer berechnet werden koennen. + + windowGridDimensions.x = genericSAFTGridDimensions.x; + windowGridDimensions.y = genericSAFTGridDimensions.y; + // windowGridDimensions.z = (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z; // Aufrunden. Ueberpruefung, ob zu viel findet im Kernel statt + windowGridDimensions.z = genericSAFTGridDimensions.z; // Mueste auch ohne Aufrunden funktionieren, da genericSAFTGridDimensions.z genauso berechnet wurde + + // Anpassung 1 + // Ueberschreiten die GesamtGridDimensionen die maximal erlaubte Anzahl auf der GPU => dann Aufteilung in Z-Layer + // Wenn alles passt kann komplettes Volumen in einem GPU-Grid berechnet werden = (Grid = #Threads * #Bloecke (max. 1024*65536=2^26)) + if ((windowGridDimensions.x * windowGridDimensions.y * windowGridDimensions.z) > 65536) // 65536=max Anzahl Bloecke im Grid!!!! + { + int zBlockCount = 65536 / (windowGridDimensions.x * windowGridDimensions.y); // = Maximal moegliche Anzahl Z-Layer, die fuer eine XY-Resolution noch berechnet werden koennen, + // da sonst maximale Blockanzahl im Grid ueberschritten wuerde + if (zBlockCount == 0) + printf("(zBlockCount == 0) && (windowGridDimensions > 65536) => try higher genericSAFTBlockDimensions if possible or lower XY-Resolution"); + + zLayerCount = zBlockCount * genericSAFTBlockDimensions.z; + partialVolumeSize = zLayerCount * zLayerSize; // partialVolumeSize wird an maximal parallel berechenbare Z-Layer angepasst. + } + else // komplettes Volumen kann in einem GPU-Grid berechnet werden + { + partialVolumeSize = outputVolume_Bytes; // TeilOutputgroesse = Outputgroesse für Volumen + zLayerCount = genericSAFTGridDimensions.z; + } + + // Anpassung 2 + std::size_t requiredSosLayer; // Für die Anzahl der Z-Layer minimal benoetigte SoSZ-Layer die zu berechnen sind + 1 für Interpolation + zLayerCount = maxZLayerCountInOneSoSZLayer; // Maximale Z-Layer die berechnet werden kann ist # die in eine SOS Z-layer passt. + + // Setzen der maximal moeglichen SOS-ZLayer! + switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) + { + case 0: // Mit Textur -> 1ne SOS-ZLayer + requiredSosLayer = ceil((float)zLayerCount / (float)maxZLayerCountInOneSoSZLayer); // Für die Anzahl der Z-Layer maximal benoetigte SoSZ-Layer die zu berechnen sind + break; + case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer + requiredSosLayer = ceil((float)zLayerCount / (float)maxZLayerCountInOneSoSZLayer) + 1; // Für die Anzahl der Z-Layer maximal benoetigte SoSZ-Layer die zu berechnen sind + 1 für Interpolation + break; + } + + maxFeasibleSosZLayerCount = requiredSosLayer; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt. + + // Setzen der maximal moeglichen ZLayer! + if (genericSAFTGridDimensions.z < maxZLayerCountInOneSoSZLayer) + { + maxFeasibleZLayerCount = genericSAFTGridDimensions.z; // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. + } + else + { + maxFeasibleZLayerCount = maxZLayerCountInOneSoSZLayer; // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. + } + + size_t MatlabSavety = (MATLABSAVETY_MB * 1024 * 1024); // Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann. + partialVolumeSize = zLayerCount * zLayerSize + MatlabSavety; // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde + partialSosPathSize = requiredSosLayer * sosZLayerSize; // Speicher(SOSATTPaths ), der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde + partialAscanIndexSize = requiredSosLayer * TableAscanIndexAllocationCount * SOSGrid_XYZ.x * SOSGrid_XYZ.y * (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture) * 4 * + ((int)(SOSMode_3DVolume) + (int)(ATTMode_3DVolume)); // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde + + if (memoryGPUfree() <= (partialVolumeSize + partialSosPathSize)) + { + maxSupportedTexturesForAscanIndex = 1; + printf("Free GPU Memory (%lld Bytes) < required Memory (%lld Bytes)\n --> try to reduce amount of Receiver!\n", memoryGPUfree(), (partialVolumeSize + partialSosPathSize)); + } + + //================================================================================================= Memory allocation on GPU Device for - Table SoS-Paths + + // Aufteilung der SOSPfadTabelle in 3D Array (x, y, z * Emitter bzw. Receiver) + + // std::size_t requestedMemorySize = 0; + unsigned int requestedMemorySize = 0; + int MallocStep = 0; // Wenn Fehler nur die Speicherbereiche freigeben, die auch alloziert wurden + + // 1. Speicher auf Device allozieren + // Fuer Emitter-SOSPfade Pitched Memory und 3D Array anlegen + // CUDA_CHECK(cudaDeviceSynchronize()); + // memoryCheck(); + + // SoSEmitterPathsSum mit Texturmemory ------------------------------------- + cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben - Float4 + + requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float4)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * + SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * usedAmountOfEmitter); + // Warning if free memory on GPU Device is too small for requested Memory size + if (memoryGPUfree() <= requestedMemorySize) + { + printf("Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize); + printf("Not possible CUDA: cudaMalloc3DArray: deviceTableVoxelToEmPathSosBothCuArray [%ix%ix%i] (Float4): %u Bytes\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter), requestedMemorySize); + printf("Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); + Abort_ptr[0] = 1; + // mexErrMsgTxt(" "); + } + + if (Abort_ptr[0] == 0) + { + // 3D Array fuer Texturmemory anlegen + CUDA_CHECK(cudaMalloc3DArray(&deviceTableVoxelToEmPathSosBothCuArray, &texChannelDescTableVoxelToEmRecPathSosBoth, + make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, (int)(maxFeasibleSosZLayerCount * usedAmountOfEmitter)), // bei cudaMalloc3DArray wird width in Elementen angegeben! + // 0 + cudaArraySurfaceLoadStore)); + MallocStep++; + + deviceTableVoxelToRecPathSosBothCuArray = new cudaArray *[TableVoxelToReceiverPathSosAllocationCount]; // Für Arbeiten mit Texturmemory + } + + for (int i = 0; i < TableVoxelToReceiverPathSosAllocationCount; i++) // Fuer Receiver benoetigt man zwei Texturen da 2 * 1413 > 2048 die maximal erlaubt sind. + { + // SoSReceiverPathsBoth mit Texturmemory ------------------------------------- + + // 3D Array fuer Texturmemory anlegen + requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float4)) / (float)deviceProperties[deviceId].texturePitchAlignment) * deviceProperties[deviceId].texturePitchAlignment * + SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture); + // Warning if free memory on GPU Device is too small for requested Memory size + if (memoryGPUfree() <= requestedMemorySize) + { + printf("Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize); + printf("Not possible CUDA: cudaMalloc3DArray: deviceTableVoxelToRecPathSosBothCuArray[%i] [%ix%ix%i] (Float4): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture), requestedMemorySize); + printf("Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); + Abort_ptr[0] = 1; + // mexErrMsgTxt(" "); + } + + if (Abort_ptr[0] == 0) + { + // Schritt 3.2 SpeicherArray3D auf Device anlegen fuer Texturmemory + CUDA_CHECK(cudaMalloc3DArray(&deviceTableVoxelToRecPathSosBothCuArray[i], &texChannelDescTableVoxelToEmRecPathSosBoth, + make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxSoSReceiverArrayForTexture)), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! + // 0 + cudaArraySurfaceLoadStore)); + MallocStep++; + } + } + + // Schritt 2.1 Output-Kanal anlegen und beschreiben + + cudaChannelFormatDesc texChannelDescTableAscanIndexFloat; + if (ATTMode_3DVolume == false) + { // ========= 3DVolume Mode without ATT-Correction + texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + } + else if (ATTMode_3DVolume == true) + { // ========= 3DVolume Mode with ATT-Correction + texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat); // Schritt 2.1 Output-Kanal anlegen und beschreiben + } + + // Pitched Memory und 3D Array anlegen fuer Ascan-Index + + // Für Ascan-Index benoetigt man mehrere Texturen fuer jeweils 2 Z-Layer. + // 2*N < maxSurfaceTexture3DDimension(Fermi&Kepler: 2048) ==> (1024 Em/Rec - Kombinationen) + // maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist + // TableAscanIndexAllocationCount = Anzahl der Teiltabellen ==> auch Anzahl der benoetigten Durchlaeufe + // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante) + // maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle + // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen + + deviceTextureAscanIndexFloatCuArray = new cudaArray *[TableAscanIndexAllocationCount]; // Für Arbeiten mit Texturmemory + + for (int i = 0; i < TableAscanIndexAllocationCount; i++) // Für AscanIndex benoetigt man n Texturen da maximal 2048 Z-Layer erlaubt sind. + { + // Ascan-IndexPathsSum mit Texturmemory ------------------------------------- + // Schritt 3.2 Surface-SpeicherArray3D auf Device anlegen fuer Texturmemory + + // 3D Array fuer Texturmemory anlegen + // CUDA_CHECK(cudaDeviceSynchronize()); + // memoryCheck(); + + if (ATTMode_3DVolume == false) + { // ========= 3DVolume Mode without ATT-Correction + requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float)) / (float)deviceProperties[deviceId].texturePitchAlignment) * + deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture); + // Warning if free memory on GPU Device is too small for requested Memory size + if (memoryGPUfree() <= requestedMemorySize) + { + printf("Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize); + printf("Not possible CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), requestedMemorySize); + printf("Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); + Abort_ptr[0] = 1; + // mexErrMsgTxt(" "); + } + + if (Abort_ptr[0] == 0) + { + CUDA_CHECK(cudaMalloc3DArray(&deviceTextureAscanIndexFloatCuArray[i], &texChannelDescTableAscanIndexFloat, + make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture)), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! + cudaArraySurfaceLoadStore)); + MallocStep++; + } + } + else if (ATTMode_3DVolume == true) + { // ========= 3DVolume Mode with ATT-Correction + requestedMemorySize = (unsigned int)ceil((float)(SOSGrid_XYZ.x * sizeof(float2)) / (float)deviceProperties[deviceId].texturePitchAlignment) * + deviceProperties[deviceId].texturePitchAlignment * SOSGrid_XYZ.y * ((int)maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture); + + // Warning if free memory on GPU Device is too small for requested Memory size + if (memoryGPUfree() <= requestedMemorySize) + { + printf("Free GPU Memory (%lld Bytes) < requested Memory (%lld Bytes)\n", memoryGPUfree(), requestedMemorySize); + printf("Not possible CUDA: cudaMalloc3DArray: deviceTextureAscanIndexFloatCuArray[%i] [%ix%ix%i] (float2): %u Bytes\n", i, SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture), requestedMemorySize); + printf("Abort Reconstruction: Not enough Memory for cudaMalloc3DArray !!!!!!!\n"); + Abort_ptr[0] = 1; + // mexErrMsgTxt(" "); + } + + if (Abort_ptr[0] == 0) + { + CUDA_CHECK(cudaMalloc3DArray(&deviceTextureAscanIndexFloatCuArray[i], &texChannelDescTableAscanIndexFloat, + make_cudaExtent(SOSGrid_XYZ.x, SOSGrid_XYZ.y, + (int)(maxFeasibleSosZLayerCount * maxAscanIndexArraysInTexture)), // bei cudaMalloc3DArray wird width in Elementen angegeben!!!! + cudaArraySurfaceLoadStore)); + MallocStep++; + } + } + } + + // Set SOSPathMemory to Zero as Initialisation, if not used for useTwoLoops there can be data, from calculations before --> Wrong calculations + + if (Abort_ptr[0] == 0) + { + fillCuArray((float)0.0, deviceTextureAscanIndexFloatCuArray, TableAscanIndexAllocationCount); + } + + //=========================================================================================================================== adjust Size of OutputWindow - Part 2 + CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); // Freier Speicher auslesen + // memoryCheck(); + if ((partialVolumeSize) > freeMemory) + { + // Brute-force the greatest possible number of z-layers for the partial output window and the speed of sound data window. + bool success = false; + + for (zLayerCount; zLayerCount > 0; zLayerCount--) // Anpassungsschritt 2: zLayerCount = (freeMemory - noch benoetigtem Speicher)/ (zLayerSize * sizeof(double)) + { + std::size_t allocationSize = 0; + partialVolumeSize = zLayerCount * zLayerSize + MatlabSavety; // Speicher, der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde + + allocationSize += partialVolumeSize; // Insgesamt benoetigter Volumen- und SoS-Pfadspeicher + + if (allocationSize <= freeMemory) // allocationSize + { + maxFeasibleZLayerCount = zLayerCount; // Die Anzahl an Layer nutzen für Rekonstruktion, die gerade nochin den Speicher passt + partialVolumeSize = maxFeasibleZLayerCount * zLayerSize; // Speicher, der fuer die entsprechende Anzahl an Z-Layern benötigt wuerde + + success = true; + break; // Sobald die Anzahl an Z-Layern und SoS-Z-Layern passt Schleife beenden + } + } + + if (!success) + { + printf("Not enough free device memory available for one Z-Layer(%i) from outputVolume_Bytes(%u) and one Sos-Z-Layer(%u) to perform reconstruction\n", zLayerSize, outputVolume_Bytes, + sosZLayerSize); + printf("Use a smaller Volume resolution for X-Y-Direction !!!!!!\n"); + Abort_ptr[0] = 1; + // mexErrMsgTxt(" "); + } + else + { + } + } + else // Alles passt in Speicher und benoetigt keine Anpassung + { + } + + //================================================================================================= Memory allocation on GPU Device for - Volume-Output-Data + if (Abort_ptr[0] == 0) + { + CUDA_CHECK(cudaMalloc(reinterpret_cast(&deviceOutput), partialVolumeSize)); + } + // Check free memory after memory allocation + // CUDA_CHECK(cudaDeviceSynchronize()); + // memoryCheck(); + //===================================================================================================================================== performCoreReconstruction + //===================================================================================================================================== + + if (Abort_ptr[0] == 0) + { + auto startPerformCoreReconstruction = std::chrono::steady_clock::now(); + + performCoreReconstruction(); // Kernelaufruf //////////////////////////////////////////////////////////////////////////////////////// + + auto stopPerformCoreReconstruction = std::chrono::steady_clock::now(); + diff_time = std::chrono::duration_cast(stopPerformCoreReconstruction - startPerformCoreReconstruction).count(); // total duration in µs + + duration = (ullong)diff_time; + } + //======================================================================================================================================= + //======================================================================================================================================= + + //============================================================================================================================= Free / Clean-up GPU Device Memory + if (SOSMode_3DVolume == true) // ===================================== 3DVolume Mode with SoS-Correction Z.506 + { + // Free allocated memory for SOS path calculation + // Free memory for 3DInterpolation with TextureMemory + // CUDA Arrays for Textur for SOS paths + if (MallocStep > 0) + { + CUDA_CHECK(cudaFreeArray(deviceTableVoxelToEmPathSosBothCuArray)); // Emitter SoSSum and Count Array + } + + for (int i = 0; i < TableVoxelToReceiverPathSosAllocationCount; i++) + { + if (MallocStep > (1 + i)) + { + CUDA_CHECK(cudaFreeArray(deviceTableVoxelToRecPathSosBothCuArray[i])); // Receiver SoSSum and Count Array + } + } + + delete[] deviceTableVoxelToRecPathSosBothCuArray; // delete array of CuArrays + CUDA_CHECK(cudaFreeArray(deviceSosAttFieldCuArray)); // free deviceSpeedOfSoundFieldCuArray + + // Free memory for 3DInterpolation with TextureMemory + // CUDA Arrays for AscanIndex-Textur for paths + for (int i = 0; i < TableAscanIndexAllocationCount; i++) + { + if (MallocStep > (1 + TableVoxelToReceiverPathSosAllocationCount + i)) + { + CUDA_CHECK(cudaFreeArray(deviceTextureAscanIndexFloatCuArray[i])); // free deviceTextureAscanIndexFloatCuArray + } + } + + delete[] deviceTextureAscanIndexFloatCuArray; // delete array of CuArrays + } + + if (Abort_ptr[0] == 0) + { + CUDA_CHECK(cudaFree(deviceOutput)); // free deviceOutput + } + CUDA_CHECK(cudaFree(deviceEmitterIndex_block)); // free deviceEmitterIndex_block + CUDA_CHECK(cudaFree(deviceReceiverIndex_block)); // free deviceReceiverIndex_block + + CUDA_CHECK(cudaFree(deviceSAFT_VARIANT)); // free deviceSAFT_VARIANT + + for (std::size_t i = 0; i < aScanAllocationCount; i++) + { + CUDA_CHECK(cudaFreeArray(deviceAScansCuArray[i])); // free deviceAScansCuArray + } + delete[] deviceAScansCuArray; // auch Felder wieder freigeben + + // Free Memory of Host-Buffer for GPU-Memory + free(hostLookUpGeometryMemoryListEmitterPtr); + free(hostLookUpGeometryMemoryListReceiverPtr); } diff --git a/SAFT_TOFI/src/saft.cpp b/SAFT_TOFI/src/saft.cpp index 06b8fb8..262153e 100644 --- a/SAFT_TOFI/src/saft.cpp +++ b/SAFT_TOFI/src/saft.cpp @@ -1,126 +1,91 @@ +#include + +#include +#include +#include #include #include -#include -#include -#include -#include - -//#include -//#include -//#include +// #include +// #include +// #include #include "saft.hpp" - - - /** Clumsy constructor of the core reconstruction class. - Unbeholfener Konstruktor der Kern Rekonstuktionsklasse */ -SAFTHandler::SAFTHandler( - int deviceId, ///< CUDA ID of the device to be used. - int deviceIndex, ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?) - float *aScan_ptr, ///< Zeiger zu den AScandaten - double *output_ptr, ///< Zeiger zu den Volumen-Daten - double *Duration_ptr, ///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels - unsigned short *receiver_index_ptr, ///< - unsigned short *emitter_index_ptr, ///< - float *receiver_list_ptr, ///< - int receiver_list_Size, - float *emitter_list_ptr, ///< - int emitter_list_Size, - float *speed_vec_ptr, ///< Zeiger auf die SoS-Daten in Block-/Gridmode - int3 SOSGrid_XYZ, - float3 sosOffset, ///< Startpoint of SoSGrid - float SOS_RESOLUTION, ///< Aufloesung des SoSGrid - float *att_vec_ptr, ///< Zeiger auf die Att-Daten inm Gridmode +SAFTHandler::SAFTHandler(int deviceId, ///< CUDA ID of the device to be used. + int deviceIndex, ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?) + float *aScan_ptr, ///< Zeiger zu den AScandaten + double *output_ptr, ///< Zeiger zu den Volumen-Daten + double *Duration_ptr, ///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels + unsigned short *receiver_index_ptr, ///< + unsigned short *emitter_index_ptr, ///< + float *receiver_list_ptr, ///< + int receiver_list_Size, + float *emitter_list_ptr, ///< + int emitter_list_Size, + float *speed_vec_ptr, ///< Zeiger auf die SoS-Daten in Block-/Gridmode + int3 SOSGrid_XYZ, + float3 sosOffset, ///< Startpoint of SoSGrid + float SOS_RESOLUTION, ///< Aufloesung des SoSGrid + float *att_vec_ptr, ///< Zeiger auf die Att-Daten inm Gridmode - int aScanCount, - int aScanLength, - int3 IMAGE_SIZE_XYZ, - float sampleRate, - float3 regionOfInterestOffset, - float IMAGE_RESOLUTION, - dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning. - float debugMode, - float debugModeParameter, - bool SOSMode_3DVolume, - bool ATTMode_3DVolume, + int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION, + dim3 const &fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning. + float debugMode, float debugModeParameter, bool SOSMode_3DVolume, bool ATTMode_3DVolume, - int SAFT_MODE, - int *SAFT_VARIANT, - int SAFT_VARIANT_Size, + int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size, - int *Abort_ptr ///< If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch; - ): - // Initialisation der Klassenvariablen mit den uebergebenen Werten (aehnlich Konstruktor) - // Initializer list of class variables - deviceId(deviceId), - deviceIndex(deviceIndex), + int *Abort_ptr ///< If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch; + ) + : deviceId(deviceId), + deviceIndex(deviceIndex), - aScan_ptr(aScan_ptr), //aScanSamplesPath(aScanSamplesPath), + aScan_ptr(aScan_ptr), // aScanSamplesPath(aScanSamplesPath), - output_ptr(output_ptr), //Path(Path), - Duration_ptr(Duration_ptr), + output_ptr(output_ptr), // Path(Path), + Duration_ptr(Duration_ptr), - receiver_index_ptr(receiver_index_ptr), // - emitter_index_ptr(emitter_index_ptr), // - receiver_list_ptr(receiver_list_ptr), // - receiver_list_Size(receiver_list_Size), - emitter_list_ptr(emitter_list_ptr), // - emitter_list_Size(emitter_list_Size), - speed_vec_ptr(speed_vec_ptr), ///< SoS-Daten im Blockmode oder SoSGrid - SOSGrid_XYZ(SOSGrid_XYZ), // Groesse des SoSGrids - sosOffset(sosOffset), ///< Startpoint of SoSGrid - SOS_RESOLUTION(SOS_RESOLUTION), ///< Aufloesung des SoSGrid + receiver_index_ptr(receiver_index_ptr), // + emitter_index_ptr(emitter_index_ptr), // + receiver_list_ptr(receiver_list_ptr), // + receiver_list_Size(receiver_list_Size), + emitter_list_ptr(emitter_list_ptr), // + emitter_list_Size(emitter_list_Size), + speed_vec_ptr(speed_vec_ptr), ///< SoS-Daten im Blockmode oder SoSGrid + SOSGrid_XYZ(SOSGrid_XYZ), // Groesse des SoSGrids + sosOffset(sosOffset), ///< Startpoint of SoSGrid + SOS_RESOLUTION(SOS_RESOLUTION), ///< Aufloesung des SoSGrid - att_vec_ptr(att_vec_ptr), ///< Att-Daten als ATTGrid + att_vec_ptr(att_vec_ptr), ///< Att-Daten als ATTGrid - aScanCount(aScanCount), - aScanLength(aScanLength), - IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ), - sampleRate(sampleRate), - regionOfInterestOffset(regionOfInterestOffset), - IMAGE_RESOLUTION(IMAGE_RESOLUTION), + aScanCount(aScanCount), + aScanLength(aScanLength), + IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ), + sampleRate(sampleRate), + regionOfInterestOffset(regionOfInterestOffset), + IMAGE_RESOLUTION(IMAGE_RESOLUTION), - fixedBlockDimensions(fixedBlockDimensions), - debugMode(debugMode), - debugModeParameter(debugModeParameter), - SOSMode_3DVolume(SOSMode_3DVolume), - ATTMode_3DVolume(ATTMode_3DVolume), + fixedBlockDimensions(fixedBlockDimensions), + debugMode(debugMode), + debugModeParameter(debugModeParameter), + SOSMode_3DVolume(SOSMode_3DVolume), + ATTMode_3DVolume(ATTMode_3DVolume), - SAFT_MODE(SAFT_MODE), - SAFT_VARIANT(SAFT_VARIANT), - SAFT_VARIANT_Size(SAFT_VARIANT_Size), + SAFT_MODE(SAFT_MODE), + SAFT_VARIANT(SAFT_VARIANT), + SAFT_VARIANT_Size(SAFT_VARIANT_Size), - Abort_ptr(Abort_ptr) + Abort_ptr(Abort_ptr) { - #ifdef debug_OutputFunctions - printf( "==> SAFTHandler::SAFTHandler - Start\n"); - #endif - - #ifdef debug_OutputInfo - printf( "SAFTHandler Constructor\n"); - #endif - - aScanAllocationCount = USED_ASCANSMEMORYREGIONS; // Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen. - maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX; // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex; - - IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Auflösung im OutputVolumen - SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Auflösung im SoS-Volumen - - #ifdef debug_OutputVariables - printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR); - printf( "SOS_RESOLUTION_FACTOR = %e\n", SOS_RESOLUTION_FACTOR); - printf( "Samplerate = %e\n", sampleRate); - #endif - - #ifdef debug_OutputFunctions - printf( "<== SAFTHandler::SAFTHandler - End\n"); - #endif + aScanAllocationCount = USED_ASCANSMEMORYREGIONS; // Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen. + maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX; // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex; + IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Auflösung im OutputVolumen + SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Auflösung im SoS-Volumen } /** @@ -129,271 +94,100 @@ SAFTHandler::SAFTHandler( */ void SAFTHandler::performReconstruction() { - #ifdef debug_OutputFunctions - printf( "==> SAFTHandler::performReconstruction - Start\n"); - #endif + aScanSamples = (float *)aScan_ptr; // Ascan-Data + emitter_index = (unsigned short *)emitter_index_ptr; // Index for associating emitter to corresponding coordinates + receiver_index = (unsigned short *)receiver_index_ptr; // Index for associating receiver to corresponding coordinates + emitter_list = (float3 *)emitter_list_ptr; // Lookuptable for emitter coordinates + receiver_list = (float3 *)receiver_list_ptr; // Lookuptable for receiver coordinates + output = (double *)output_ptr; // Output-Data + speedOfSoundField = (float *)speed_vec_ptr; // For SOS Correction + // SoSData = (float*) speed_vec_ptr; // Fuer Blockmode + attenuationField = (float *)att_vec_ptr; // For Attenuation Correction - //Pointeruebergabe der AScan-Daten, Geometrie-Daten und Output-Daten von Matlab - #ifdef debug_OutputInfo - printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n"); - #endif - aScanSamples = (float*) aScan_ptr; // Ascan-Data - emitter_index = (unsigned short*) emitter_index_ptr; // Index for associating emitter to corresponding coordinates - receiver_index = (unsigned short*) receiver_index_ptr; // Index for associating receiver to corresponding coordinates - emitter_list = (float3*) emitter_list_ptr; // Lookuptable for emitter coordinates - receiver_list = (float3*) receiver_list_ptr; // Lookuptable for receiver coordinates - output = (double*) output_ptr; // Output-Data + // Read out GPU-Device Properties + // ---------------------------------------------------------- + // List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html - speedOfSoundField = (float*) speed_vec_ptr; // For SOS Correction - //SoSData = (float*) speed_vec_ptr; // Fuer Blockmode - attenuationField = (float*) att_vec_ptr; // For Attenuation Correction + // Determine the number of GPU-Devices in System + int deviceCount; + CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); + deviceProperties.reserve(static_cast(deviceCount)); // Request Vector for all GPU Devices with the size deviceCount + // Determine the number of GPU-Devices in System + cudaDeviceProp &device = deviceProperties[deviceId]; + CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId)); // Read out Properties of current used GPU-Device in this thread - #ifdef debug_OutputInfo // Name des Device mit ID ausgeben - printf( "Device ID: %i\n", deviceId); - #endif + CUDA_CHECK(cudaSetDevice(deviceId)); - #ifdef debug_OutputFunctions - printf( "==> loadDevices - Start\n"); - #endif + deviceProperties.push_back(device); // Add element at the end of the vector outputProb + // Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2] + maxSurfaceTexture3DDimension = (device.maxTexture3D[2] < device.maxSurface3D[2]) ? device.maxTexture3D[2] : device.maxSurface3D[2]; + // printf("DEVICE => maxSurfaceTexture3DDimension = %d (device.maxTexture3D[2] = %d - device.maxSurface3D[2] = %d)\n", maxSurfaceTexture3DDimension, device.maxTexture3D[2], + // device.maxSurface3D[2]); // Set maximum Size of Texture - // Read out GPU-Device Properties - // ---------------------------------------------------------- - // List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html - - // Determine the number of GPU-Devices in System - int deviceCount; - CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); - deviceProperties.reserve(static_cast(deviceCount)); // Request Vector for all GPU Devices with the size deviceCount - - // Determine the number of GPU-Devices in System - cudaDeviceProp & device = deviceProperties[deviceId]; - CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId)); // Read out Properties of current used GPU-Device in this thread - //printf("%i. %s\n", deviceId, device.name); - //#ifdef debug_OutputInfo - //printf( "Device used: %18s (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex); // Name des Device mit ID ausgeben - //#endif - CUDA_CHECK(cudaSetDevice(deviceId)); - - #ifdef debug_OutputInfo - printf("Reset Device\n"); // Reset Device - #endif - - // CUDA_CHECK(cudaDeviceReset()); 2019: commented to remove re-initialization when called, avoids blocked threads later on - - //printf("%i. %s\n", deviceId, deviceProperties[deviceId].name); - //printf("DEVICE => Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]); - //printf("DEVICE => Maximum width, height, and depth for a 3D surface reference bound to a CUDA array: [%d %d %d]\n", device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]); - - #ifdef debug_OutputInfo - printf("%i. %s\n", deviceId, device.name); - printf(" Byte Total Global Mem: %lld \n", device.totalGlobalMem); - printf(" Compute Capability: %i.%i\n", device.major,device.minor); - - printf(" Name: %s\n", device.name); - printf(" Major revision number: %d\n", device.major); - printf(" Minor revision number: %d\n", device.minor); - printf(" Total global memory: %lld\n", device.totalGlobalMem); - printf(" Total shared memory per block: %u\n", device.sharedMemPerBlock); - printf(" Total registers per block: %d\n", device.regsPerBlock); - printf(" Warp size: %d\n", device.warpSize); - printf(" Maximum memory pitch: %lld\n", device.memPitch); - printf(" Maximum threads per block: %d\n", device.maxThreadsPerBlock); - printf(" Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]); - for (int i = 0; i < 3; ++i) - printf(" Maximum dimension %d of block: %lld\n", i, device.maxThreadsDim[i]); - for (int i = 0; i < 3; ++i) - printf(" Maximum dimension %d of grid: %lld\n", i, device.maxGridSize[i]); - printf(" Clock rate: %d\n", device.clockRate); - printf(" Total constant memory: %u\n", device.totalConstMem); - printf(" Texture alignment: %u\n", device.textureAlignment); - printf(" Concurrent copy and execution: %s\n", (device.deviceOverlap ? "Yes" : "No")); - printf(" Number of multiprocessors: %d\n", device.multiProcessorCount); - printf(" Kernel execution timeout: %s\n\n", (device.kernelExecTimeoutEnabled ? "Yes" : "No")); - printf(" Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]); - printf(" Maximum width, height, and depth for a 3D surface reference bound to a CUDA array: [%d %d %d]\n", device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]); - #endif - - deviceProperties.push_back(device); // Add element at the end of the vector outputProb - -// printf(" Maximum memory pitch: %lld\n", device.memPitch); -// printf(" Texture alignment: %u\n", device.textureAlignment); -// printf(" Texture Pitch alignment: %u\n", device.texturePitchAlignment); - - // Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2] - maxSurfaceTexture3DDimension = (device.maxTexture3D[2] maxSurfaceTexture3DDimension = %d (device.maxTexture3D[2] = %d - device.maxSurface3D[2] = %d)\n", maxSurfaceTexture3DDimension, device.maxTexture3D[2], device.maxSurface3D[2]); // Set maximum Size of Texture - - //Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3) + // Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3) + + //按照目前配置必定 maxFeasibleSosZLayerCount = 2;!!! switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction]) - { - case 0: // Mit Textur -> 1ne SOS-ZLayer - maxFeasibleSosZLayerCount = 1; - break; - case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer - maxFeasibleSosZLayerCount = 2; - break; - } - #ifdef debug_OutputVariables - printf( "Set maxFeasibleSosZLayerCount = %u\n", maxFeasibleSosZLayerCount); - #endif + { + case 0: // Mit Textur -> 1ne SOS-ZLayer + maxFeasibleSosZLayerCount = 1; + break; + case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer + maxFeasibleSosZLayerCount = 2; + break; + } - //printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount); - //printf( "maxSurfaceTexture3DDimension= %i\n", maxSurfaceTexture3DDimension); + maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount; // Max Anzahl der Ascans in einer Teiltabelle (1024) - // Fuer Ascan-Index-Varainte von SAFT werden mehrere Texturen benoetigt, da die Anzahl der Z_layer limitiert ist. - // Um 3D-Interpolation zu ermoeglichen muessen jeweils 2 Z-Layer pro A-Scan vorhanden sein. - // --> 2*nAscans < maxSurfaceTexture3DDimension(Fermi & Kepler: 2048) ==> maximal 1024 Em/Rec - Kombinationen koennen in einem Surface/Textur gespeichert werden - // maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist (2048) - // TableAscanIndexAllocationCount = Anzahl der TeilSurfaces ==> auch Anzahl der benoetigten Durchlaeufe (aktuell 4 Texturen) - // maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante) - // maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle (1024) - // maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX (=4) // Definiert die aktuell maximal unterstuetzen Texturen im Code fuer AscanIndex - // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufe des SAFTs um alle Ascans abarbeiten zu koennen + // memoryCheck(); // Freier Speicher am Anfang ausgeben - maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount; // Max Anzahl der Ascans in einer Teiltabelle (1024) + // if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){ + if (memoryGPUfree() <= 2500000000) + { // IF GPUMemory < 2.5GB only 1 Surface can be used + maxSupportedTexturesForAscanIndex = 1; + } + neededAscanBatchCount = ceil((float)aScanCount / (maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount) / maxSupportedTexturesForAscanIndex); + // Determine amount of PartSurfaces + if (neededAscanBatchCount > 1) + { + TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex; // Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen + } + else + { + TableAscanIndexAllocationCount = (int)ceil((float)aScanCount / (maxAscanIndexArraysInTexture)); // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen + } - #ifdef debug_OutputAScanIndexMemoryDivision - printf("%s :\n", device.name); - printf(" Total memory %lld Bytes\n", memoryGPUtotal() ); - printf(" Free memory %lld Bytes\n", memoryGPUfree() ); - printf(" => Used memory %lld Bytes\n", (memoryGPUtotal()-memoryGPUfree())); - #endif - //memoryCheck(); // Freier Speicher am Anfang ausgeben + // Set Block and Grid-Dimensions for GPU Threads + genericSAFTBlockDimensions = fixedBlockDimensions; // fixedBlockDimensions = Parameter BlockDim_XYZ + genericSAFTGridDimensions = + dim3((IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x - 1) / genericSAFTBlockDimensions.x, // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau + (IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y - 1) / genericSAFTBlockDimensions.y, // in Blockgroesse geteilt werden kann, muss ein weiterer + (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z - 1) / genericSAFTBlockDimensions.z // Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert. + ); - // if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){ - if (memoryGPUfree() <= 2500000000){ // IF GPUMemory < 2.5GB only 1 Surface can be used - maxSupportedTexturesForAscanIndex = 1; - #ifdef debug_OutputAScanIndexMemoryDivision - printf("Free GPU Memory: %lld < 2.5GB\n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", memoryGPUfree(), maxSupportedTexturesForAscanIndex ); - //printf("GeForce GTX 690/590 \n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", maxSupportedTexturesForAscanIndex ); - #endif - } - - #ifdef debug_OutputAScanIndexMemoryDivision - printf( "--> maxSupportedTexturesForAscanIndex %i \n", maxSupportedTexturesForAscanIndex); - //printf( "--> TableAscanIndexAllocationCount %i \n", TableAscanIndexAllocationCount); - #endif - - neededAscanBatchCount = ceil((float)aScanCount/(maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount)/maxSupportedTexturesForAscanIndex); - #ifdef debug_OutputAScanIndexMemoryDivision - printf("aScanCount %i -> neededAscanBatchCount = %i!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", aScanCount, neededAscanBatchCount); - - //printf("totalGlobalMem = %lld Byte\n", device.totalGlobalMem); - //printf("multiProcessorCount = %i MultiProcessors\n", device.multiProcessorCount); - #endif - - // Determine amount of PartSurfaces - if (neededAscanBatchCount > 1){ - TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex; // Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen - } else { - TableAscanIndexAllocationCount = (int)ceil((float)aScanCount/(maxAscanIndexArraysInTexture)); // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen - } - - - - // Set Block and Grid-Dimensions for GPU Threads - genericSAFTBlockDimensions = fixedBlockDimensions; // fixedBlockDimensions = Parameter BlockDim_XYZ - genericSAFTGridDimensions = dim3( - (IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x, // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau - (IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y, // in Blockgroesse geteilt werden kann, muss ein weiterer - (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z // Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert. - ); - #ifdef debug_OutputVariables - printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z); - printf( "genericSAFTGridDimensions X,Y,Z = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z); - #endif - - - - // Outputsize of SOS Volume + // Outputsize of SOS Volume SOSVolume_VoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z; SOSVolume_Bytes = SOSVolume_VoxelCount * sizeof(float); - #ifdef debug_OutputVariables - printf(" SOSVolume_VoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, SOSVolume_VoxelCount); - printf(" SOSVolume_Bytes = SOSVolumeVoxelCount(%i) x sizeof(float = 4) = %i\n", SOSVolume_VoxelCount, SOSVolume_Bytes); - #endif - - // Warn if Outputsize of Volume is too big for 32Bit Sytems - outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen - outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double); // Speicherbedarf fuer alle Voxel im Volumen + outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen + outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double); // Speicherbedarf fuer alle Voxel im Volumen - #ifdef debug_OutputVariables - printf(" outputVolume_VoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, outputVolume_VoxelCount); - printf(" outputVolume_Bytes [%lld x sizeof(double = 8)] = %lld\n", outputVolume_VoxelCount, outputVolume_Bytes); - #endif + aScan_Bytes = aScanLength * sizeof(float); + aScanBatch_Bytes = aScanCount * aScan_Bytes; - //Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren - //if (outputVolume_VoxelCount > 536870912) // 536870912 = 2^32 / sizeof(double) - // std::cout << "outputVolume_Bytes > 2^32 the upper limit of unsigned integer!!!\n => Reconstruction only in 64-Bit Systems"; - - //Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben - #ifdef debug_OutputVariables - printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount); - #endif - aScan_Bytes = aScanLength * sizeof(float); - aScanBatch_Bytes = aScanCount * aScan_Bytes; - #ifdef debug_OutputVariables - printf( "aScan_Bytes = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScan_Bytes); - printf( "aScanCount = %i\n", aScanCount); - printf( "aScanBatch_Bytes = aScanCount * aScan_Bytes ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatch_Bytes); - #endif - - #ifdef debug_OutputInfo - printf("\nParameter for Image Reconstruction\n"); - printf( "========================================================================\n"); - printf( "IMAGE_SIZE_XYZ: [%i x %i x %i]\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z); - printf( "outputVolume_VoxelCount: %lld\n", outputVolume_VoxelCount); - printf( "Increment vector/Resolution: %f\n", IMAGE_RESOLUTION); - printf( "IMAGE_STARTPOINT in m: [%f x %f x %f]\n", regionOfInterestOffset.x, regionOfInterestOffset.y, regionOfInterestOffset.z); - outputVolume_size_m.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION; - outputVolume_size_m.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION; - outputVolume_size_m.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION; - printf( "Volume Size in m: [%f x %f x %f]\n", outputVolume_size_m.x, outputVolume_size_m.y, outputVolume_size_m.z); - printf( "aScanCount: %i\n", aScanCount); - printf( "========================================================================\n\n"); - #endif - - - #ifdef debug_OutputPerformance - struct timeval startProcessAscans, stopProcessAscans; - gettimeofday(&startProcessAscans, NULL); - #endif - - //perform processing with AScan-Data - //=========================================================================================================== - //=========================================================================================================== - ullong duration; - processAScans(duration); - //=========================================================================================================== - //=========================================================================================================== - - #ifdef debug_OutputPerformance - diff_time = (double)((stopProcessAscans.tv_sec * 1000000.0 + stopProcessAscans.tv_usec) - (startProcessAscans.tv_sec * 1000000.0 + startProcessAscans.tv_usec)); - printf ("########################################################################\n"); - printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Free Memory = %4.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time); - printf ("########################################################################\n"); - #endif - - Duration_ptr[(deviceIndex+1)] = (double)duration; // Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig - - #ifdef debug_OutputVariables - printf( " GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000); - #endif - - #ifdef debug_OutputInfo - printf("Reset Device\n"); // Reset Device - #endif - // CUDA_CHECK(cudaDeviceReset()); // news 2019 commented, see above reason. - - #ifdef debug_OutputFunctions - printf( "<== SAFTHandler::performReconstruction - End\n"); - #endif + // perform processing with AScan-Data + //=========================================================================================================== + //=========================================================================================================== + ullong duration; + processAScans(duration); + //=========================================================================================================== + //=========================================================================================================== + Duration_ptr[(deviceIndex + 1)] = (double)duration; // Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig } /** @@ -402,96 +196,42 @@ void SAFTHandler::performReconstruction() - Der SAFT Kernel erwartet Argumente in den die Grid Dimension auf drei Dimensionen reduziert wurde und die Block-Dimensionen auf nur eine Dimension reduziert ist. - Das haengt auch von den Eigenschaften der verfuegbaren HW ab (shader model) */ -void SAFTHandler::reduceKernelDimensions( - dim3 const & gridDimensions, ///< Input grid dimensions. - dim3 const & blockDimensions, ///< Input block dimensions. - dim3 & reducedGridDimensions, ///< Reduced output grid dimensions. - dim3 & reducedBlockDimensions ///< Reduced output block dimensions. - ) +void SAFTHandler::reduceKernelDimensions(dim3 const &gridDimensions, ///< Input grid dimensions. + dim3 const &blockDimensions, ///< Input block dimensions. + dim3 &reducedGridDimensions, ///< Reduced output grid dimensions. + dim3 &reducedBlockDimensions ///< Reduced output block dimensions. +) { - - #ifdef debug_OutputFunctions - printf( "==> SAFTHandler::reduceKernelDimensions - Start\n"); - #endif - - - if(deviceProperties[deviceId].maxGridSize[2] > 1) + if (deviceProperties[deviceId].maxGridSize[2] > 1) { reducedGridDimensions = gridDimensions; - #ifdef debug_OutputParameter - printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z); - #endif } else { - reducedGridDimensions = dim3( - gridDimensions.x * gridDimensions.y, - gridDimensions.z, - 1 - ); - #ifdef debug_OutputParameter - printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z); - #endif + reducedGridDimensions = dim3(gridDimensions.x * gridDimensions.y, gridDimensions.z, 1); } reducedBlockDimensions = dim3(blockDimensions.x * blockDimensions.y * blockDimensions.z); - #ifdef debug_OutputParameter - printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z); - #endif - - #ifdef debug_OutputFunctions - printf( "<== SAFTHandler::reduceKernelDimensions - End\n"); - #endif - } - /** Determine free memory available on the current device. */ std::size_t memoryGPUfree() { - #ifdef debug_OutputFunctions - printf( "==> memoryGPUfree - Start\n"); - #endif - - std::size_t - totalMemory, - freeMemory; + std::size_t totalMemory, freeMemory; CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); - -// printf(" Total memory %lld Bytes\n", totalMemory); -// printf(" Free memory %lld Bytes\n", freeMemory); -// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory)); - - #ifdef debug_OutputFunctions - printf( "<== memoryGPUfree - End\n"); - #endif return freeMemory; } - /** Determine free memory available on the current device. */ std::size_t memoryGPUtotal() { - #ifdef debug_OutputFunctions - printf( "==> current - Start\n"); - #endif - - std::size_t - totalMemory, - freeMemory; + std::size_t totalMemory, freeMemory; CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); -// printf(" Total memory %lld Bytes\n", totalMemory); -// printf(" Free memory %lld Bytes\n", freeMemory); -// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory)); - - #ifdef debug_OutputFunctions - printf( "<== current - End\n"); - #endif return totalMemory; } @@ -501,111 +241,125 @@ std::size_t memoryGPUtotal() */ void memoryCheck() { - #ifdef debug_OutputFunctions - printf( "==> memoryCheck - Start\n"); - #endif +#ifdef debug_OutputFunctions + printf("==> memoryCheck - Start\n"); +#endif - std::size_t - totalMemory, - freeMemory; - float check; + std::size_t totalMemory, freeMemory; + float check; CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); + // totalMemory + check = 1024.0f * 1024.0f * 1024.0f * 1024.0f; + if (totalMemory >= check) + { + printf(" Total memory %.3f TB\n", totalMemory / check); + } + else + { + check /= 1024.0f; + if (totalMemory >= check) + { + printf(" Total memory %.3f GB\n", totalMemory / check); + } + else + { + check /= 1024.0f; + if (totalMemory >= check) + { + printf(" Total memory %.3f MB\n", totalMemory / check); + } + else + { + check /= 1024.0f; + if (totalMemory >= check) + { + printf(" Total memory %.3f kB\n", totalMemory / check); + } + else + { + check /= 1024.0f; + if (totalMemory >= check) + printf(" Total memory %.3f Bytes\n", totalMemory / check); + } + } + } + } - //#if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory) -// printf(" Total memory %lld Bytes\n", totalMemory); -// printf(" Free memory %lld Bytes\n", freeMemory); -// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory)); - - - // totalMemory - check = 1024.0f*1024.0f*1024.0f*1024.0f; - if (totalMemory >= check){ - printf(" Total memory %.3f TB\n", totalMemory/check); - } else { - check /= 1024.0f; - if (totalMemory >= check){ - printf(" Total memory %.3f GB\n", totalMemory/check); - } else { - check /= 1024.0f; - if (totalMemory >= check){ - printf(" Total memory %.3f MB\n", totalMemory/check); - } else { - check /= 1024.0f; - if (totalMemory >= check){ - printf(" Total memory %.3f kB\n", totalMemory/check); - } else { - check /= 1024.0f; - if (totalMemory >= check) - printf(" Total memory %.3f Bytes\n", totalMemory/check); - } - } - } - } - - // freeMemory - check = 1024.0f*1024.0f*1024.0f*1024.0f; - if (freeMemory >= check){ - printf(" Free memory %.3f TB\n", freeMemory/check); - } else { - check /= 1024.0f; - if (freeMemory >= check){ - printf(" Free memory %.3f GB\n", freeMemory/check); - } else { - check /= 1024.0f; - if (freeMemory >= check){ - printf(" Free memory %.3f MB\n", freeMemory/check); - } else { - check /= 1024.0f; - if (freeMemory >= check){ - printf(" Free memory %.3f kB\n", freeMemory/check); - } else { - check /= 1024.0f; - if (freeMemory >= check) - printf(" Free memory %.3f Bytes\n", freeMemory/check); - } - } - } - } - - // Used Memory - check = 1024.0f*1024.0f*1024.0f*1024.0f; - if ((totalMemory-freeMemory) >= check){ - printf(" Used memory %.3f TB\n", (totalMemory-freeMemory)/check); - } else { - check /= 1024.0f; - if ((totalMemory-freeMemory) >= check){ - printf(" Used memory %.3f GB\n", (totalMemory-freeMemory)/check); - } else { - check /= 1024.0f; - if ((totalMemory-freeMemory) >= check){ - printf(" Used memory %.3f MB\n", (totalMemory-freeMemory)/check); - } else { - check /= 1024.0f; - if ((totalMemory-freeMemory) >= check){ - printf(" Used memory %.3f kB\n", (totalMemory-freeMemory)/check); - } else { - check /= 1024.0f; - if ((totalMemory-freeMemory) >= check) - printf(" Used memory %.3f Bytes\n", (totalMemory-freeMemory)/check); - } - } - } - } - - - //#endif - - #ifdef debug_OutputFunctions - printf( "<== memoryCheck - End\n"); - #endif + // freeMemory + check = 1024.0f * 1024.0f * 1024.0f * 1024.0f; + if (freeMemory >= check) + { + printf(" Free memory %.3f TB\n", freeMemory / check); + } + else + { + check /= 1024.0f; + if (freeMemory >= check) + { + printf(" Free memory %.3f GB\n", freeMemory / check); + } + else + { + check /= 1024.0f; + if (freeMemory >= check) + { + printf(" Free memory %.3f MB\n", freeMemory / check); + } + else + { + check /= 1024.0f; + if (freeMemory >= check) + { + printf(" Free memory %.3f kB\n", freeMemory / check); + } + else + { + check /= 1024.0f; + if (freeMemory >= check) + printf(" Free memory %.3f Bytes\n", freeMemory / check); + } + } + } + } + // Used Memory + check = 1024.0f * 1024.0f * 1024.0f * 1024.0f; + if ((totalMemory - freeMemory) >= check) + { + printf(" Used memory %.3f TB\n", (totalMemory - freeMemory) / check); + } + else + { + check /= 1024.0f; + if ((totalMemory - freeMemory) >= check) + { + printf(" Used memory %.3f GB\n", (totalMemory - freeMemory) / check); + } + else + { + check /= 1024.0f; + if ((totalMemory - freeMemory) >= check) + { + printf(" Used memory %.3f MB\n", (totalMemory - freeMemory) / check); + } + else + { + check /= 1024.0f; + if ((totalMemory - freeMemory) >= check) + { + printf(" Used memory %.3f kB\n", (totalMemory - freeMemory) / check); + } + else + { + check /= 1024.0f; + if ((totalMemory - freeMemory) >= check) + printf(" Used memory %.3f Bytes\n", (totalMemory - freeMemory) / check); + } + } + } + } } - - - - - /** Generic CUDA call wrapper. Check the result of a CUDA operation and throw an exception if an error occurred. @@ -614,21 +368,19 @@ void memoryCheck() - �berpr�ft die Ergebnisse einer CUDA Operation und wirft eine Exception wenn ein Fehler auftritt - Das wird wird mit einer Kombination mit einem Makro in saft.hpp genutzt. */ -//inline // Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen -void performCUDAResultCheck( - cudaError_t result, ///< Result of the CUDA operation. - std::string const & file, ///< Path to the source code file. - int line ///< Line within the source code - ) +// inline // Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen +void performCUDAResultCheck(cudaError_t result, ///< Result of the CUDA operation. + std::string const &file, ///< Path to the source code file. + int line ///< Line within the source code +) { - if(result != cudaSuccess) + if (result != cudaSuccess) { - //printf("A CUDA operation failed in file \"%s\" (line %i): %s \n", file, line, cudaGetErrorString(result).c_str() ); - printf("%s\n", cudaGetErrorString( cudaGetLastError() ) ); + // printf("A CUDA operation failed in file \"%s\" (line %i): %s \n", file, line, cudaGetErrorString(result).c_str() ); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); - //std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result)); - //std::cout << errorMessage << std::endl; + // std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result)); + // std::cout << errorMessage << std::endl; printf("-> Error occurred"); } } - diff --git a/SAFT_TOFI/src/saft.cu b/SAFT_TOFI/src/saft.cu deleted file mode 100644 index 78fe0da..0000000 --- a/SAFT_TOFI/src/saft.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include - -#include "saft.hpp" - -/*! - This is the central CUDA file which really just includes the other modules. - This is done because CUDA does not support external references for referencing data from other compilation units. - - Dies ist das zentrale CUDA-File welches nur die anderen Module einbindet - - Das wird gemacht, weil CUDA keine externen Referenzen unterst�tzt, um Daten von anderen Compilierungs Einheiten zu referenzieren. -*/ - -// #include "kernel/rayTracing.cuh" // GPU-Code für Bresenham -// #include "kernel/precalculateSpeedOfSoundKernel.cuh" // GPU-Code Partitionierung für Bresenham. Ruft den Bresenham auf. -// #include "kernel/saftKernel.cuh" // GPU-Kernel für SAFT - diff --git a/SAFT_TOFI/src/saft.hpp b/SAFT_TOFI/src/saft.hpp index 3c5c2b4..a9f361b 100644 --- a/SAFT_TOFI/src/saft.hpp +++ b/SAFT_TOFI/src/saft.hpp @@ -1,154 +1,88 @@ #pragma once -#include - #include #include #include - #include -#include // standard input/output -#include // stl vector header +#include // standard input/output +#include +#include // stl vector header -typedef unsigned char uchar; +typedef unsigned char uchar; typedef unsigned short ushort; -typedef unsigned long ulong; -typedef unsigned long long ullong; - - -//Define Outputs for Debugmode -//============================ - //#define debug_OutputFormat_German // German Format , instead . for numbers - //#define debug_OutputFunctions // Funktionenaufrufe ausgeben - //#define debug_OutputVariables // Werte der Variablen ausgeben - //#define debug_OutputParameter // Uebersicht der Eingabedaten anzeigen sowie Infobloeke in den einzelnen Schritten - //#define debug_OutputMemory // Speicherverwaltung, Malloc, Free, Groessen - //#define debug_OutputMaxMemory // Gibt aktuellen Speicherverbrauch an, wenn memoryCheck aufgerufen wird - //#define debug_OutputInfo // Gibt Infos zu Schritten, Variablen,... aus - //#define debug_OutputPerformance // Gibt die Laufzeiten und die eizelnen Multi-GPU Performanzwerte von ProcessAscans aus - //#define debug_OutputStepsPerformance // Gibt die Laufzeiten und für die einzelnen Schritte in performCoreReconstruction aus (Copy Ascans, Precalc, PerfCoreReconstruction, copy back) - //#define debug_OutputStepsPrecalculation // Gibt Infos ueber die einzelnen Schritte der Precalculation Steps an - //#define debug_OutputHostStepsPerformance // Gibt die Laufzeiten für die eizelnen Schritte auf dem HOST aus (Preintegrated Ascans) - //#define debug_OutputSAFTHandlerThreadPerformance // Gibt die Gesamt-Laufzeiten der einzelnen Multi-GPU Threads aus - //#define debug_OutputMultiGpu // Einteilung des Volumens auf mehrerer GPUs ausgeben - //#define debug_OutputStreams // Gibt die Schritte der Berechnung der Streams aus - //#define debug_OutputSOSPaths // Gibt die Schritte und Werte der SOSPfadberechnung aus - //#define debug_OutputSOSStepsParameter // Einteilung der ZLayer in SOSZlayer - //#define debug_OutputLookUpGeometryMemoryList // Debugausgabe fuer die LookUpGeometryMemoryList (Constant Memory) - //#define debug_OutputAScanIndexMemoryDivision // Debugausgabe für die Einteilung in Surfaces da mehrere Surfaces benoetigt werden - - //#define OutputVolume // Ausgabe des Volumens - -// Debugging CUDA Kernels -//================================================ - //#define debug_CudaSAFTKernelModes // Use variable debugMode for different calulations methods and output - //#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation // Fuer Fehlerberchnungen - //#define debug_CudaSAFTKernel - //#define debug_CudaPrecalculateKernel - //#define debug_CudaPrecalculateAscanIndexKernel // Kernel function for PrecalculateAscanIndex - //#define debug_CudaPrecalculateAscanIndexKernelProxy // Proxy function for PrecalculateAscanIndex - //#define debug_CudaFillCuArrayKernelProxy - //#define debug_CudaSAFTAscanIndexKernel // Kernel function for SAFTAscanIndex - //#define debug_CudaSAFTAscanIndexKernelDataAccess // Access and sum up values from Ascans - //#define debug_CudaRayTraceKernel - //#define debug_CudaRayTraceKernelLive - - - //#define DebugSetMemoryToZero // Set SOSPathMemory to Zero as Initialisation - - -// Define specific Hardware-Versions - #define GTX_590 - //#define GTX_690 - //#define GTX_TITAN - - #if defined(GTX_590) - #define GTX_Fermi - #endif - #if defined(GTX_690) || defined(GTX_TITAN) - #define GTX_Kepler - #endif +typedef unsigned long ulong; +typedef unsigned long long ullong; // Memory management of GPU and Errordetection //================================================ - //#define SaftNoTexture // Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date - //#define SaftCorrectSumAllAscan // Recalculation if too big values occur - - -// SAFT-Implementierung 2-stufig mit AscanIndexInterpolation -//============================================================ - #define SaftUseAscanIndexInterpolation - #define noGeometryLoading - //#define SaftUseAscanIndexInterpolation_PartWise // Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen - #define AscanTextureUse1Float // Textur mit nur Float1 für SOS berechnen +// #define SaftNoTexture // Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date +// #define SaftCorrectSumAllAscan // Recalculation if too big values occur +// #define SaftUseAscanIndexInterpolation_PartWise // Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen +#define AscanTextureUse1Float // Textur mit nur Float1 für SOS berechnen // Integration der A-scans im Vornherein durchfuehren um Samplebreite an zu rekonstruierende Aufloesung anzupassen //======================================================================================================================= - #define preAscanIntegrationToMatchSamplerateToResolution // Integration der Ascans ueber Fensterbreite durchfuehren - //#define debug_preAscanIntegration - #define DebugSammleMin 2990 // Grenzen feeru Degbugausgabe der Werte - #define DebugSammleMax 3000 - //#define preAscanIntegrationVersion1Michael // direkt übernommene Version von Michael - #define preAscanIntegrationVersion2Ernst // korrigierte Variante mit genauerer Fensterbreite - +#define preAscanIntegrationToMatchSamplerateToResolution // Integration der Ascans ueber Fensterbreite durchfuehren + // #define debug_preAscanIntegration +#define DebugSammleMin 2990 // Grenzen feeru Degbugausgabe der Werte +#define DebugSammleMax 3000 +// #define preAscanIntegrationVersion1Michael // direkt übernommene Version von Michael +#define preAscanIntegrationVersion2Ernst // korrigierte Variante mit genauerer Fensterbreite // Parameter fuer SAFT-Kernel //======================================================================================================================= - #define USED_ASCANSMEMORYREGIONS 1 // Anzahl der A-Scan-Speicherbereiche die alloziert werden - // Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen. - #define MAX_SUPPORTEDTEXTURES_FORASCANINDEX 4 // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex; - #define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE 710 // Definiert maximale #Receiver in einem SOSPATH-Textur - #define MATLABSAVETY_MB 25 // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten +#define USED_ASCANSMEMORYREGIONS \ + 1 // Anzahl der A-Scan-Speicherbereiche die alloziert werden + // Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen. +#define MAX_SUPPORTEDTEXTURES_FORASCANINDEX 4 // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex; +#define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE 710 // Definiert maximale #Receiver in einem SOSPATH-Textur +#define MATLABSAVETY_MB 25 // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten +#define SaftLinearInterpolation // Lineare Interpolation beim Zugriff auf A-scans durchführen - #define SaftLinearInterpolation // Lineare Interpolation beim Zugriff auf A-scans durchführen +#define SaftUseConstantMemforGeometry // Geometriedaten im Constantmemory nutzen - #define SaftUseConstantMemforGeometry // Geometriedaten im Constantmemory nutzen +// #define SaftTextureForEmRecSosPathsTablesFloat1 // Use Float1-Textur for loading SOS-Paths -> Sum, Count separated +// #define SaftTextureForEmRecSosPathsTablesFloat2 // Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position +#define SaftTextureForEmRecSosPathsTablesFloat4 // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position - //#define SaftTextureForEmRecSosPathsTablesFloat1 // Use Float1-Textur for loading SOS-Paths -> Sum, Count separated - //#define SaftTextureForEmRecSosPathsTablesFloat2 // Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position - #define SaftTextureForEmRecSosPathsTablesFloat4 // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position - - #if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - #define SaftTextureForEmRecSosPathsTables // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible - #endif - - // Several SAFT_VARIANTs - #define SAFT_VARIANT_AscanPreintegration 0 - #define SAFT_VARIANT_AscanInterpolation 1 - #define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2 // Use interpolation while Preprocessing - #define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3 // Use interpolation while Reconstruction - #define SAFT_VARIANT_CalcStandardDeviation 4 // Not yet implemented - #define SAFT_VARIANT_SumUpOverBoarderIndices 5 // Not yet implemented +#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) +#define SaftTextureForEmRecSosPathsTables // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible +#endif +// Several SAFT_VARIANTs +#define SAFT_VARIANT_AscanPreintegration 0 +#define SAFT_VARIANT_AscanInterpolation 1 +#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2 // Use interpolation while Preprocessing +#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3 // Use interpolation while Reconstruction +#define SAFT_VARIANT_CalcStandardDeviation 4 // Not yet implemented +#define SAFT_VARIANT_SumUpOverBoarderIndices 5 // Not yet implemented // Cache <-> shared Memory - //#define SaftPreferSharedMem // cudaFuncCachePreferShared: shared memory is 48 KB - #define SaftPreferL1SharedMem // cudaFuncCachePreferL1: shared memory is 16 - //#define SaftPreferNone // cudaFuncCachePreferNone: no preference +// #define SaftPreferSharedMem // cudaFuncCachePreferShared: shared memory is 48 KB +#define SaftPreferL1SharedMem // cudaFuncCachePreferL1: shared memory is 16 +// #define SaftPreferNone // cudaFuncCachePreferNone: no preference // Receiver Cache mit shared Memory (nur bei kleinen Blockgroeßen) - //#define SaftReceiverSharedMemCacheReceiverDistance - //#define SaftCacheReceiverSOS - //#define SaftReceiverSharedMemCacheReceiverSOS // Use Shared Memory for Caching - //#define SaftRegisterCacheReceiverSOS // Use Register for Caching - +// #define SaftReceiverSharedMemCacheReceiverDistance +// #define SaftCacheReceiverSOS +// #define SaftReceiverSharedMemCacheReceiverSOS // Use Shared Memory for Caching +// #define SaftRegisterCacheReceiverSOS // Use Register for Caching // Berechnung der mittleren Schallgeschwindigkeit //================================================ - #define SaftUseHarmonicMean // harmonic Mean +#define SaftUseHarmonicMean // harmonic Mean - #define SaftTextureForBresenhamSosPaths // Texturmemory für SOS-Volumen nutzen (Version without not full implemented) - //#define SaftTextureForBresenhamInterpolated //iSOS-Version --> wird nun ueber Parameter uebergeben - //#define SaftUseFastMath //FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig. +#define SaftTextureForBresenhamSosPaths // Texturmemory für SOS-Volumen nutzen (Version without not full implemented) + // #define SaftTextureForBresenhamInterpolated //iSOS-Version --> wird nun ueber Parameter uebergeben +// #define SaftUseFastMath //FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig. - //#define SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert - #define SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att) - - #define SOS_Version2 // korrekte Version mit Definitionen im Mittelpunkt +// #define SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert +#define SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att) +#define SOS_Version2 // korrekte Version mit Definitionen im Mittelpunkt // MultiGPU //================================================ @@ -163,18 +97,16 @@ typedef unsigned long long ullong; // constant such that 64kB of constant is fully blocked by emitter/receiver combinations const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340; -//Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred. +// Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred. #define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__); -//Macro used to see when a particular line of code is executed on the host. +// Macro used to see when a particular line of code is executed on the host. #define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl - -//Convenient typedefs for containers +// Convenient typedefs for containers typedef std::vector DeviceProperties; typedef std::vector Dimensions; - /** Most important class in the application. - Haupt-Klasse der Applikation @@ -183,399 +115,288 @@ typedef std::vector Dimensions; */ class SAFTHandler { -public: - SAFTHandler(int deviceId, - int deviceIndex, - float *aScan_ptr, ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath, - double *output_ptr, ///< Zeiger zu den Outputdaten //std::string const & outputPath, - double *Duration_ptr, ///< Zeiger auf Ausgabewert f�r benoetigte Laufzeit des SAFT-Kernels - unsigned short *receiver_index_ptr, ///< - unsigned short *emitter_index_ptr, ///< - float *receiver_list_ptr, ///< - int receiver_list_Size, ///< - float *emitter_list_ptr, ///< - int emitter_list_Size, ///< - float *speed_vec_ptr, - int3 SOSGrid_XYZ, - float3 sosOffset, ///< Startpoint of SoSGrid - float SOS_RESOLUTION, ///< Aufloesung des SoSGrid - float *att_vec_ptr, //att_vec_ptr + public: + SAFTHandler(int deviceId, int deviceIndex, + float *aScan_ptr, ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath, + double *output_ptr, ///< Zeiger zu den Outputdaten //std::string const & outputPath, + double *Duration_ptr, ///< Zeiger auf Ausgabewert f�r benoetigte Laufzeit des SAFT-Kernels + unsigned short *receiver_index_ptr, ///< + unsigned short *emitter_index_ptr, ///< + float *receiver_list_ptr, ///< + int receiver_list_Size, ///< + float *emitter_list_ptr, ///< + int emitter_list_Size, ///< + float *speed_vec_ptr, int3 SOSGrid_XYZ, + float3 sosOffset, ///< Startpoint of SoSGrid + float SOS_RESOLUTION, ///< Aufloesung des SoSGrid + float *att_vec_ptr, // att_vec_ptr - int aScanCount, - int aScanLength, - int3 IMAGE_SIZE_XYZ, - float sampleRate, - float3 regionOfInterestOffset, - float IMAGE_RESOLUTION, - dim3 const & fixedBlockDimensions, - float debugMode, - float debugModeParameter, - //bool useFixedPartialOutputWindow, + int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION, dim3 const &fixedBlockDimensions, float debugMode, + float debugModeParameter, + // bool useFixedPartialOutputWindow, - bool SOSMode_3DVolume, - bool ATTMode_3DVolume, + bool SOSMode_3DVolume, bool ATTMode_3DVolume, - int SAFT_MODE, - int *SAFT_VARIANT, - int SAFT_VARIANT_Size, + int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size, - int *Abort_ptr - ); + int *Abort_ptr); void performReconstruction(); -private: + private: + int *Abort_ptr; // Ist ein Fehler aufgetreten der zum Abburch geführt hat + // int Abort; - int *Abort_ptr; // Ist ein Fehler aufgetreten der zum Abburch geführt hat - //int Abort; + bool SOSMode_3DVolume, ATTMode_3DVolume; - bool SOSMode_3DVolume, - ATTMode_3DVolume; - - int SAFT_MODE; - int *SAFT_VARIANT; - int *deviceSAFT_VARIANT; - int SAFT_VARIANT_Size; + int SAFT_MODE; + int *SAFT_VARIANT; + int *deviceSAFT_VARIANT; + int SAFT_VARIANT_Size; int deviceId; int deviceIndex; - float debugMode; - float debugModeParameter; + float debugMode; + float debugModeParameter; DeviceProperties deviceProperties; - float - *aScan_ptr; + float *aScan_ptr; -// float -// *rec_vec_ptr, -// *send_vec_ptr; + // float + // *rec_vec_ptr, + // *send_vec_ptr; // Zuordnungslisten in der geschaut wird welcher Emitter/Receiver genutzt wird (65535 = nicht genutzt, alles andere ist dann der Index) - unsigned short* hostLookUpGeometryMemoryListEmitterPtr; // Memory of hostLookUpGeometryMemoryListEmitter - unsigned short* hostLookUpGeometryMemoryListReceiverPtr; // Memory of hostLookUpGeometryMemoryListReceiver - int lookUpGeometryMemoryListEmitterSize; // Size of hostLookUpGeometryMemoryListEmitterPtr - int lookUpGeometryMemoryListReceiverSize;// Size of hostLookUpGeometryMemoryListReceiverPtr + unsigned short *hostLookUpGeometryMemoryListEmitterPtr; // Memory of hostLookUpGeometryMemoryListEmitter + unsigned short *hostLookUpGeometryMemoryListReceiverPtr; // Memory of hostLookUpGeometryMemoryListReceiver + int lookUpGeometryMemoryListEmitterSize; // Size of hostLookUpGeometryMemoryListEmitterPtr + int lookUpGeometryMemoryListReceiverSize; // Size of hostLookUpGeometryMemoryListReceiverPtr - unsigned short - *emitter_index_ptr, - *receiver_index_ptr; + unsigned short *emitter_index_ptr, *receiver_index_ptr; - float - *emitter_list_ptr, - *receiver_list_ptr; + float *emitter_list_ptr, *receiver_list_ptr; - int - receiver_list_Size, - emitter_list_Size; + int receiver_list_Size, emitter_list_Size; - double - *output_ptr; + double *output_ptr; - double - *Duration_ptr; + double *Duration_ptr; - float - Sos, - *speed_vec_ptr, - *att_vec_ptr; + float Sos, *speed_vec_ptr, *att_vec_ptr; - int3 - SOSGrid_XYZ; + int3 SOSGrid_XYZ; - float3 - sosOffset; ///< Startpoint of SoSGrid + float3 sosOffset; ///< Startpoint of SoSGrid - int - aScanCount, - aScanLength; + int aScanCount, aScanLength; - int3 - IMAGE_SIZE_XYZ; + int3 IMAGE_SIZE_XYZ; - float3 - regionOfInterestOffset; //imageStartpoint; TODO: umbenennen! + float3 regionOfInterestOffset; // imageStartpoint; TODO: umbenennen! - float - IMAGE_RESOLUTION, ///< Aufl�sung im OutputVolumen - IMAGE_RESOLUTION_FACTOR, ///< 1/Aufl�sung im OutputVolumen - SOS_RESOLUTION, ///< Aufloesung des SoSGrid - SOS_RESOLUTION_FACTOR; ///< 1/Aufl�sung im SoS-Grid + float IMAGE_RESOLUTION, ///< Aufl�sung im OutputVolumen + IMAGE_RESOLUTION_FACTOR, ///< 1/Aufl�sung im OutputVolumen + SOS_RESOLUTION, ///< Aufloesung des SoSGrid + SOS_RESOLUTION_FACTOR; ///< 1/Aufl�sung im SoS-Grid - std::string - emitterGeometryPath, - receiverGeometryPath, - aScanSamplesPath, - outputPath; + std::string emitterGeometryPath, receiverGeometryPath, aScanSamplesPath, outputPath; float *aScanSamples; double *output; - //int aScanCount; + // int aScanCount; int - //aScanSize, - aScan_Bytes, - //batchSize, // --> aScanCount - //aScanBatchSize; - aScanBatch_Bytes; + // aScanSize, + aScan_Bytes, + // batchSize, // --> aScanCount + // aScanBatchSize; + aScanBatch_Bytes; float voxelSize; float sampleRate; - //size_t + // size_t uint64_t - //regionOfInterestVoxelCount, - outputVolume_VoxelCount, - //outputSize; - outputVolume_Bytes; + // regionOfInterestVoxelCount, + outputVolume_VoxelCount, + // outputSize; + outputVolume_Bytes; - float3 outputVolume_size_m; // ROI-Groesse in meter + float3 outputVolume_size_m; // ROI-Groesse in meter - uint64_t - partialOutputZLayerOffset; + uint64_t partialOutputZLayerOffset; - int - partialOutputZLayerOffsetCount, - partialOutputSoSZLayerCount, - currentZLayerCount, - partialSoSZLayerCount; + int partialOutputZLayerOffsetCount, partialOutputSoSZLayerCount, currentZLayerCount, partialSoSZLayerCount; // Fuer AscanIndexInterpolation // ------------------------------------------------------ - int currentEmIndexUsedForAscanIndexCalculation; - float *deviceTextureAscanIndexFloat; // Texture adresses for precalculated AscanIndex data - //std::size_t deviceTextureAscanIndexFloatSize; + int currentEmIndexUsedForAscanIndexCalculation; + float *deviceTextureAscanIndexFloat; // Texture adresses for precalculated AscanIndex data + // std::size_t deviceTextureAscanIndexFloatSize; - cudaArray **deviceTextureAscanIndexFloatCuArray; // CudaArray for AscanIndex data - int maxSurfaceTexture3DDimension; // max Dimension in 3D --> Max size for Texture - int maxAscanIndexArraysInTexture; // = maxSurfaceTexture3DDimension/2; - int TableAscanIndexAllocationCount; // Anzahl der benoetigten AscanBlocks der Groesse 2048/4096 - int maxSupportedTexturesForAscanIndex; // Definiert die maximal unterstuetzen Texturen fuer AscanIndex - int neededAscanBatchCount; // Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen + cudaArray **deviceTextureAscanIndexFloatCuArray; // CudaArray for AscanIndex data + int maxSurfaceTexture3DDimension; // max Dimension in 3D --> Max size for Texture + int maxAscanIndexArraysInTexture; // = maxSurfaceTexture3DDimension/2; + int TableAscanIndexAllocationCount; // Anzahl der benoetigten AscanBlocks der Groesse 2048/4096 + int maxSupportedTexturesForAscanIndex; // Definiert die maximal unterstuetzen Texturen fuer AscanIndex + int neededAscanBatchCount; // Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen // ------------------------------------------------------ double *currentHostOutputAdress; // Pointer of Inputdata in memory of Ascanblock - float3 - *receiver_list, // LookUpTable receiverNr -> coordinates - *emitter_list; // LookUpTable emitterNr -> coordinates + float3 *receiver_list, // LookUpTable receiverNr -> coordinates + *emitter_list; // LookUpTable emitterNr -> coordinates - unsigned short - *receiver_index, // Input Ascanblockdata: corresponding receiverNr - *emitter_index; // Input Ascanblockdata: corresponding emitterNr + unsigned short *receiver_index, // Input Ascanblockdata: corresponding receiverNr + *emitter_index; // Input Ascanblockdata: corresponding emitterNr - //float + // float // *SoSData; // Input Ascanblockdata: Corresponding SOS value - float *speedOfSoundField; // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen - float *attenuationField; // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen + float *speedOfSoundField; // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen + float *attenuationField; // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen - #ifdef SaftUseSosAttFloat2 - float2 *hostSosAttField; - #endif + float2 *hostSosAttField; // Memorysizes int - //speedOfSoundFieldVoxelCount, // - SOSVolume_VoxelCount, // Amount of Voxels of SOSVolume - //speedOfSoundFieldBytes, // - SOSVolume_Bytes, // Size of SOSVolume in Byte - speedOfSoundEmitterVoxelPathCountByteSize, // Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen - speedOfSoundEmitterVoxelPathSumByteSize; // Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel + // speedOfSoundFieldVoxelCount, // + SOSVolume_VoxelCount, // Amount of Voxels of SOSVolume + // speedOfSoundFieldBytes, // + SOSVolume_Bytes, // Size of SOSVolume in Byte + speedOfSoundEmitterVoxelPathCountByteSize, // Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen + speedOfSoundEmitterVoxelPathSumByteSize; // Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel - dim3 - fixedBlockDimensions, // kann ws durch genericSAFTBlockDimensions ersetzt - genericSAFTBlockDimensions, - genericSAFTGridDimensions, - windowGridDimensions; + dim3 fixedBlockDimensions, // kann ws durch genericSAFTBlockDimensions ersetzt + genericSAFTBlockDimensions, genericSAFTGridDimensions, windowGridDimensions; - cudaArray **deviceAScansCuArray; + cudaArray **deviceAScansCuArray; -#ifdef SaftTextureForBresenhamSosPaths + cudaArray *deviceSosAttFieldCuArray; - #ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) - cudaArray *deviceSpeedOfSoundFieldCuArray; // SOS volume - cudaArray *deviceAttenuationFieldCuArray; // ATT volume - #endif + int maxSoSReceiverArrayForTexture; + int TableVoxelToReceiverPathSosAllocationCount; + std::size_t receiver_list_Size_deviceMemory; - #ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att) - cudaArray *deviceSosAttFieldCuArray; - #endif -#endif + // Für Emitter ----- normal definieren + cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray; // SoSSum + cudaArray *deviceTableVoxelToEmitterPathCountCuArray; // Count + // Für Receiver ----- als Arrays definieren da zwei benoetigt + cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray; // SoSSum + cudaArray **deviceTableVoxelToReceiverPathCountCuArray; // Count + cudaArray *deviceTableVoxelToEmPathSosBothCuArray; // Emitter SoSSum + Count + cudaArray **deviceTableVoxelToRecPathSosBothCuArray; // Receiver SoSSum + Count - int maxSoSReceiverArrayForTexture; - int TableVoxelToReceiverPathSosAllocationCount; - std::size_t receiver_list_Size_deviceMemory; - -#ifdef SaftTextureForEmRecSosPathsTables - // Für Emitter ----- normal definieren - cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray; //SoSSum - cudaArray *deviceTableVoxelToEmitterPathCountCuArray; //Count - - // Für Receiver ----- als Arrays definieren da zwei benoetigt - cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray; //SoSSum - cudaArray **deviceTableVoxelToReceiverPathCountCuArray; //Count -#endif - -#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4) - cudaArray *deviceTableVoxelToEmPathSosBothCuArray; //Emitter SoSSum + Count - cudaArray **deviceTableVoxelToRecPathSosBothCuArray; //Receiver SoSSum + Count -#endif - - // Schallgeschwindigkeitskorrektur-Mode - float *deviceSpeedOfSoundField; // Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU + // Schallgeschwindigkeitskorrektur-Mode + float *deviceSpeedOfSoundField; // Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU // Block-Mode - unsigned short *deviceEmitterIndex_block; // Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU + unsigned short *deviceEmitterIndex_block; // Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU unsigned short *deviceReceiverIndex_block; - float3 *deviceListEmitterGeometry; // Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU + float3 *deviceListEmitterGeometry; // Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU float3 *deviceListReceiverGeometry; - float *deviceSoSData_block; // Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU - -// VoxelCountType // Adressen fuer Speicher der SoS-Pfade auf der GPU -// * deviceTableVoxelToEmitterPathCount, -// * deviceTableVoxelToReceiverPathCount; - float - *deviceTableVoxelToEmitterPathCountFloat, // Texture adresses for precalculated SOS data - *deviceTableVoxelToReceiverPathCountFloat, - *deviceTableVoxelToEmitterPathSosSum, - *deviceTableVoxelToReceiverPathSosSum; + float *deviceSoSData_block; // Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU + float *deviceTableVoxelToEmitterPathCountFloat, // Texture adresses for precalculated SOS data + *deviceTableVoxelToReceiverPathCountFloat, *deviceTableVoxelToEmitterPathSosSum, *deviceTableVoxelToReceiverPathSosSum; bool *deviceValidEmitterReceiverCombinations; int *deviceTransducerVectorAnalysisDistributionCounters; -// float3 -// * deviceEmitterGeometry, -// * deviceReceiverGeometry; - - int usedAmountOfEmitter, // amount of used emitter - usedAmountOfReceiver; // amount of used receiver + int usedAmountOfEmitter, // amount of used emitter + usedAmountOfReceiver; // amount of used receiver // Output volume - double *deviceOutput; + double *deviceOutput; - //Streams used for synchronisation - cudaStream_t - copyStream, - calculationStream; + // Streams used for synchronisation + cudaStream_t copyStream, calculationStream; - //This variable describes the number of allocations used by the current SAFT mode - std::size_t aScanAllocationCount; // Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen. + // This variable describes the number of allocations used by the current SAFT mode + std::size_t aScanAllocationCount; // Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen. - int - invalidEmitterReceiverCombinationsCount, - validEmitterReceiverCombinationsCount; + int invalidEmitterReceiverCombinationsCount, validEmitterReceiverCombinationsCount; Dimensions validBlockDimensions; bool useAutoTuning; -// AutoTuningConfiguration autoTuningConfiguration; + // AutoTuningConfiguration autoTuningConfiguration; - size_t - partialOutputSize, - partialVolumeSize, // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde - partialSosPathSize, // Speicher(SOSATTPaths) , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde - partialAscanIndexSize, // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde - maxFeasibleZLayerCount, // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. - maxFeasibleSosZLayerCount; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt. + size_t partialOutputSize, + partialVolumeSize, // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde + partialSosPathSize, // Speicher(SOSATTPaths) , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde + partialAscanIndexSize, // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde + maxFeasibleZLayerCount, // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt. + maxFeasibleSosZLayerCount; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt. - int - minimumAutoTuningThreadCount, - maximumAutoTuningThreadCount; + int minimumAutoTuningThreadCount, maximumAutoTuningThreadCount; - - //New partial reconstruction data + // New partial reconstruction data std::size_t partialSpeedOfSoundVoxelCount; std::size_t partialOutputZLayerCount; std::size_t zLayerVoxelCount; - std::size_t sosZLayerVoxelCount; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp + std::size_t sosZLayerVoxelCount; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp std::size_t partialOutputVoxelCount; + double diff_time; // For Time Measurement + float transferRate; // For DataTransferrate Measurement + float performRate; // For PerformSAFTrate Measurement + cudaDeviceProp deviceProp; // Ausgabe der Frequenz - double diff_time; // For Time Measurement - float transferRate; // For DataTransferrate Measurement - float performRate; // For PerformSAFTrate Measurement - cudaDeviceProp deviceProp; // Ausgabe der Frequenz + // Core reconstruction - - //Core reconstruction - - void processAScans(ullong & duration); + void processAScans(ullong &duration); void performCoreReconstruction(); - //Pre-calculation + // Pre-calculation - //void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount); // TODO: Funktion die nicht mehr benutzt wird? -// void analysisOfTransducerVectors(); - -// void normalisePerformanceStatisticsOutput(); -// void printTransducerVectorStatistics(); - - //Auto-tuning - bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions); + // Auto-tuning + bool determineGridDimensions(dim3 const &blockDimensions, dim3 &gridDimensions); void determineValidBlockDimensions(); + void reduceKernelDimensions(dim3 const &gridDimensions, dim3 const &blockDimensions, dim3 &reducedGridDimensions, dim3 &reducedBlockDimensions); - void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions); - - //Pre-calculation kernels + // Pre-calculation kernels //------------------------------------------------------------------------ - #ifdef SaftUseConstantMemforGeometry - //void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput); - void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput); - #else - //void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput); - void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput); - #endif + void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float *deviceVoxelCountOutputFloat, float *deviceSpeedOfSoundSumOutput); - #ifdef SaftUseAscanIndexInterpolation - void precalculateAscanIndex - ( - int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for. - int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for. - //int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated - //int emitter_list_Size, ///< Number of emitter_array got from Matlab - //int receiver_list_Size, ///< Number of receiver_array got from Matlab - //float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver. - ); + void precalculateAscanIndex(int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for. + int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for. + // int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated + // int emitter_list_Size, ///< Number of emitter_array got from Matlab + // int receiver_list_Size, ///< Number of receiver_array got from Matlab + // float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver. + ); - void precalculateAscanIndex_usePaths - ( - int ascanIndex_i, ///< Offset of AscanIndex batch. - int aScanWindowSize, ///< Amount of Ascans in AscanIndex batch to process. - int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for. - int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for. - // int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all Combinations-should be Calculated - // float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver. - ); + void precalculateAscanIndex_usePaths( + int ascanIndex_i, ///< Offset of AscanIndex batch. + int aScanWindowSize, ///< Amount of Ascans in AscanIndex batch to process. + int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for. + int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for. + // int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all + // Combinations-should be Calculated float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to + // Receiver. + ); - #endif - - // Initialize AScanIndexSurface - void fillCuArray - ( - float useValue, - cudaArray **deviceTextureAscanIndexFloatCuArray, ///< CuArray to fill - int TableAscanIndexAllocationCount - ); - - //SAFT Kernel - void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray); //Ascans in CuArray f�r Texturmemory + // Initialize AScanIndexSurface + void fillCuArray(float useValue, + cudaArray **deviceTextureAscanIndexFloatCuArray, ///< CuArray to fill + int TableAscanIndexAllocationCount); + // SAFT Kernel + void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, + int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const &windowGridDimensions, dim3 const &gridDimensions, + dim3 const &blockDimensions, float *deviceSpeedOfSoundField, cudaArray *deviceAScansCuArray); // Ascans in CuArray f�r Texturmemory }; -//std::string vectorToString(float3 const & vector); -//std::string voxelToString(dim3 const & voxel); extern void memoryCheck(); extern std::size_t memoryGPUfree(); extern std::size_t memoryGPUtotal(); -extern void performCUDAResultCheck(cudaError_t result, std::string const & file, int line); - - +extern void performCUDAResultCheck(cudaError_t result, std::string const &file, int line);