feat: refactor & clean cpp code in SAFT_TOFI

This commit is contained in:
kradchen
2024-11-21 09:49:34 +08:00
parent 0e29f139af
commit 82a2a9e132
7 changed files with 2210 additions and 4593 deletions

View File

@@ -4,7 +4,8 @@ set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)
find_package (OpenMP REQUIRED)
file(GLOB_RECURSE cu_files ./src/*.cu)
file(GLOB_RECURSE cuh_files ./src/*.cuh)
file(GLOB_RECURSE cuh_files ./src/*.cuh)
add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/processAScans.cpp ./src/saft.cpp ${cu_files} ${cuh_files})
target_include_directories(SaftTofi PRIVATE ../SAFT ./src /usr/local/cuda/include )
set_target_properties(SaftTofi PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -13,7 +14,7 @@ target_compile_options(SaftTofi PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--use_fast_math
--ptxas-options=-v
-arch compute_30 -code compute_30,sm_30
>)
>)
target_link_libraries(SaftTofi PRIVATE ${CUDA_RUNTIME_LIBRARY} )
target_link_libraries(SaftTofi PRIVATE OpenMP::OpenMP_CXX )

File diff suppressed because it is too large Load Diff

View File

@@ -137,7 +137,7 @@ __device__ __forceinline__ void performRayTracedSpeedAdditionTexture(float &voxe
pathPoint[slowDim1] = voxel2f[slowDim1];
pathPoint[slowDim2] = voxel2f[slowDim2];
}
else // voxel2f < voxel1f Endpukt < Startpkt -> Steigung negativ
else // voxel2f < voxel1f End point < Start point -> Slope negative
{
fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f) - floor(voxel1f[greatestDistanceDim] + 0.5f) + 1;
pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim];

File diff suppressed because it is too large Load Diff

View File

@@ -1,126 +1,91 @@
#include <string.h>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>
#include <cmath>
#include <string.h>
//#include <ail/file.hpp>
//#include <ail/string.hpp>
//#include <ail/time.hpp>
// #include <ail/file.hpp>
// #include <ail/string.hpp>
// #include <ail/time.hpp>
#include "saft.hpp"
/**
Clumsy constructor of the core reconstruction class.
- Unbeholfener Konstruktor der Kern Rekonstuktionsklasse
*/
SAFTHandler::SAFTHandler(
int deviceId, ///< CUDA ID of the device to be used.
int deviceIndex, ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
float *aScan_ptr, ///< Zeiger zu den AScandaten
double *output_ptr, ///< Zeiger zu den Volumen-Daten
double *Duration_ptr, ///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels
unsigned short *receiver_index_ptr, ///<
unsigned short *emitter_index_ptr, ///<
float *receiver_list_ptr, ///<
int receiver_list_Size,
float *emitter_list_ptr, ///<
int emitter_list_Size,
float *speed_vec_ptr, ///< Zeiger auf die SoS-Daten in Block-/Gridmode
int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Aufloesung des SoSGrid
float *att_vec_ptr, ///< Zeiger auf die Att-Daten inm Gridmode
SAFTHandler::SAFTHandler(int deviceId, ///< CUDA ID of the device to be used.
int deviceIndex, ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
float *aScan_ptr, ///< Zeiger zu den AScandaten
double *output_ptr, ///< Zeiger zu den Volumen-Daten
double *Duration_ptr, ///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels
unsigned short *receiver_index_ptr, ///<
unsigned short *emitter_index_ptr, ///<
float *receiver_list_ptr, ///<
int receiver_list_Size,
float *emitter_list_ptr, ///<
int emitter_list_Size,
float *speed_vec_ptr, ///< Zeiger auf die SoS-Daten in Block-/Gridmode
int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Aufloesung des SoSGrid
float *att_vec_ptr, ///< Zeiger auf die Att-Daten inm Gridmode
int aScanCount,
int aScanLength,
int3 IMAGE_SIZE_XYZ,
float sampleRate,
float3 regionOfInterestOffset,
float IMAGE_RESOLUTION,
dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
float debugMode,
float debugModeParameter,
bool SOSMode_3DVolume,
bool ATTMode_3DVolume,
int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION,
dim3 const &fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
float debugMode, float debugModeParameter, bool SOSMode_3DVolume, bool ATTMode_3DVolume,
int SAFT_MODE,
int *SAFT_VARIANT,
int SAFT_VARIANT_Size,
int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size,
int *Abort_ptr ///< If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch;
):
// Initialisation der Klassenvariablen mit den uebergebenen Werten (aehnlich Konstruktor)
// Initializer list of class variables
deviceId(deviceId),
deviceIndex(deviceIndex),
int *Abort_ptr ///< If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch;
)
: deviceId(deviceId),
deviceIndex(deviceIndex),
aScan_ptr(aScan_ptr), //aScanSamplesPath(aScanSamplesPath),
aScan_ptr(aScan_ptr), // aScanSamplesPath(aScanSamplesPath),
output_ptr(output_ptr), //Path(Path),
Duration_ptr(Duration_ptr),
output_ptr(output_ptr), // Path(Path),
Duration_ptr(Duration_ptr),
receiver_index_ptr(receiver_index_ptr), //
emitter_index_ptr(emitter_index_ptr), //
receiver_list_ptr(receiver_list_ptr), //
receiver_list_Size(receiver_list_Size),
emitter_list_ptr(emitter_list_ptr), //
emitter_list_Size(emitter_list_Size),
speed_vec_ptr(speed_vec_ptr), ///< SoS-Daten im Blockmode oder SoSGrid
SOSGrid_XYZ(SOSGrid_XYZ), // Groesse des SoSGrids
sosOffset(sosOffset), ///< Startpoint of SoSGrid
SOS_RESOLUTION(SOS_RESOLUTION), ///< Aufloesung des SoSGrid
receiver_index_ptr(receiver_index_ptr), //
emitter_index_ptr(emitter_index_ptr), //
receiver_list_ptr(receiver_list_ptr), //
receiver_list_Size(receiver_list_Size),
emitter_list_ptr(emitter_list_ptr), //
emitter_list_Size(emitter_list_Size),
speed_vec_ptr(speed_vec_ptr), ///< SoS-Daten im Blockmode oder SoSGrid
SOSGrid_XYZ(SOSGrid_XYZ), // Groesse des SoSGrids
sosOffset(sosOffset), ///< Startpoint of SoSGrid
SOS_RESOLUTION(SOS_RESOLUTION), ///< Aufloesung des SoSGrid
att_vec_ptr(att_vec_ptr), ///< Att-Daten als ATTGrid
att_vec_ptr(att_vec_ptr), ///< Att-Daten als ATTGrid
aScanCount(aScanCount),
aScanLength(aScanLength),
IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
sampleRate(sampleRate),
regionOfInterestOffset(regionOfInterestOffset),
IMAGE_RESOLUTION(IMAGE_RESOLUTION),
aScanCount(aScanCount),
aScanLength(aScanLength),
IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
sampleRate(sampleRate),
regionOfInterestOffset(regionOfInterestOffset),
IMAGE_RESOLUTION(IMAGE_RESOLUTION),
fixedBlockDimensions(fixedBlockDimensions),
debugMode(debugMode),
debugModeParameter(debugModeParameter),
SOSMode_3DVolume(SOSMode_3DVolume),
ATTMode_3DVolume(ATTMode_3DVolume),
fixedBlockDimensions(fixedBlockDimensions),
debugMode(debugMode),
debugModeParameter(debugModeParameter),
SOSMode_3DVolume(SOSMode_3DVolume),
ATTMode_3DVolume(ATTMode_3DVolume),
SAFT_MODE(SAFT_MODE),
SAFT_VARIANT(SAFT_VARIANT),
SAFT_VARIANT_Size(SAFT_VARIANT_Size),
SAFT_MODE(SAFT_MODE),
SAFT_VARIANT(SAFT_VARIANT),
SAFT_VARIANT_Size(SAFT_VARIANT_Size),
Abort_ptr(Abort_ptr)
Abort_ptr(Abort_ptr)
{
#ifdef debug_OutputFunctions
printf( "==> SAFTHandler::SAFTHandler - Start\n");
#endif
#ifdef debug_OutputInfo
printf( "SAFTHandler Constructor\n");
#endif
aScanAllocationCount = USED_ASCANSMEMORYREGIONS; // Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen.
maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX; // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Auflösung im OutputVolumen
SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Auflösung im SoS-Volumen
#ifdef debug_OutputVariables
printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
printf( "SOS_RESOLUTION_FACTOR = %e\n", SOS_RESOLUTION_FACTOR);
printf( "Samplerate = %e\n", sampleRate);
#endif
#ifdef debug_OutputFunctions
printf( "<== SAFTHandler::SAFTHandler - End\n");
#endif
aScanAllocationCount = USED_ASCANSMEMORYREGIONS; // Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen.
maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX; // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Auflösung im OutputVolumen
SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Auflösung im SoS-Volumen
}
/**
@@ -129,271 +94,100 @@ SAFTHandler::SAFTHandler(
*/
void SAFTHandler::performReconstruction()
{
#ifdef debug_OutputFunctions
printf( "==> SAFTHandler::performReconstruction - Start\n");
#endif
aScanSamples = (float *)aScan_ptr; // Ascan-Data
emitter_index = (unsigned short *)emitter_index_ptr; // Index for associating emitter to corresponding coordinates
receiver_index = (unsigned short *)receiver_index_ptr; // Index for associating receiver to corresponding coordinates
emitter_list = (float3 *)emitter_list_ptr; // Lookuptable for emitter coordinates
receiver_list = (float3 *)receiver_list_ptr; // Lookuptable for receiver coordinates
output = (double *)output_ptr; // Output-Data
speedOfSoundField = (float *)speed_vec_ptr; // For SOS Correction
// SoSData = (float*) speed_vec_ptr; // Fuer Blockmode
attenuationField = (float *)att_vec_ptr; // For Attenuation Correction
//Pointeruebergabe der AScan-Daten, Geometrie-Daten und Output-Daten von Matlab
#ifdef debug_OutputInfo
printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
#endif
aScanSamples = (float*) aScan_ptr; // Ascan-Data
emitter_index = (unsigned short*) emitter_index_ptr; // Index for associating emitter to corresponding coordinates
receiver_index = (unsigned short*) receiver_index_ptr; // Index for associating receiver to corresponding coordinates
emitter_list = (float3*) emitter_list_ptr; // Lookuptable for emitter coordinates
receiver_list = (float3*) receiver_list_ptr; // Lookuptable for receiver coordinates
output = (double*) output_ptr; // Output-Data
// Read out GPU-Device Properties
// ----------------------------------------------------------
// List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html
speedOfSoundField = (float*) speed_vec_ptr; // For SOS Correction
//SoSData = (float*) speed_vec_ptr; // Fuer Blockmode
attenuationField = (float*) att_vec_ptr; // For Attenuation Correction
// Determine the number of GPU-Devices in System
int deviceCount;
CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
deviceProperties.reserve(static_cast<std::size_t>(deviceCount)); // Request Vector for all GPU Devices with the size deviceCount
// Determine the number of GPU-Devices in System
cudaDeviceProp &device = deviceProperties[deviceId];
CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId)); // Read out Properties of current used GPU-Device in this thread
#ifdef debug_OutputInfo // Name des Device mit ID ausgeben
printf( "Device ID: %i\n", deviceId);
#endif
CUDA_CHECK(cudaSetDevice(deviceId));
#ifdef debug_OutputFunctions
printf( "==> loadDevices - Start\n");
#endif
deviceProperties.push_back(device); // Add element at the end of the vector outputProb
// Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2]
maxSurfaceTexture3DDimension = (device.maxTexture3D[2] < device.maxSurface3D[2]) ? device.maxTexture3D[2] : device.maxSurface3D[2];
// printf("DEVICE => maxSurfaceTexture3DDimension = %d (device.maxTexture3D[2] = %d - device.maxSurface3D[2] = %d)\n", maxSurfaceTexture3DDimension, device.maxTexture3D[2],
// device.maxSurface3D[2]); // Set maximum Size of Texture
// Read out GPU-Device Properties
// ----------------------------------------------------------
// List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html
// Determine the number of GPU-Devices in System
int deviceCount;
CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
deviceProperties.reserve(static_cast<std::size_t>(deviceCount)); // Request Vector for all GPU Devices with the size deviceCount
// Determine the number of GPU-Devices in System
cudaDeviceProp & device = deviceProperties[deviceId];
CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId)); // Read out Properties of current used GPU-Device in this thread
//printf("%i. %s\n", deviceId, device.name);
//#ifdef debug_OutputInfo
//printf( "Device used: %18s (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex); // Name des Device mit ID ausgeben
//#endif
CUDA_CHECK(cudaSetDevice(deviceId));
#ifdef debug_OutputInfo
printf("Reset Device\n"); // Reset Device
#endif
// CUDA_CHECK(cudaDeviceReset()); 2019: commented to remove re-initialization when called, avoids blocked threads later on
//printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);
//printf("DEVICE => Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
//printf("DEVICE => Maximum width, height, and depth for a 3D surface reference bound to a CUDA array: [%d %d %d]\n", device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]);
#ifdef debug_OutputInfo
printf("%i. %s\n", deviceId, device.name);
printf(" Byte Total Global Mem: %lld \n", device.totalGlobalMem);
printf(" Compute Capability: %i.%i\n", device.major,device.minor);
printf(" Name: %s\n", device.name);
printf(" Major revision number: %d\n", device.major);
printf(" Minor revision number: %d\n", device.minor);
printf(" Total global memory: %lld\n", device.totalGlobalMem);
printf(" Total shared memory per block: %u\n", device.sharedMemPerBlock);
printf(" Total registers per block: %d\n", device.regsPerBlock);
printf(" Warp size: %d\n", device.warpSize);
printf(" Maximum memory pitch: %lld\n", device.memPitch);
printf(" Maximum threads per block: %d\n", device.maxThreadsPerBlock);
printf(" Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
for (int i = 0; i < 3; ++i)
printf(" Maximum dimension %d of block: %lld\n", i, device.maxThreadsDim[i]);
for (int i = 0; i < 3; ++i)
printf(" Maximum dimension %d of grid: %lld\n", i, device.maxGridSize[i]);
printf(" Clock rate: %d\n", device.clockRate);
printf(" Total constant memory: %u\n", device.totalConstMem);
printf(" Texture alignment: %u\n", device.textureAlignment);
printf(" Concurrent copy and execution: %s\n", (device.deviceOverlap ? "Yes" : "No"));
printf(" Number of multiprocessors: %d\n", device.multiProcessorCount);
printf(" Kernel execution timeout: %s\n\n", (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
printf(" Maximum 3D texture dimensions: [%d %d %d]\n", device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
printf(" Maximum width, height, and depth for a 3D surface reference bound to a CUDA array: [%d %d %d]\n", device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]);
#endif
deviceProperties.push_back(device); // Add element at the end of the vector outputProb
// printf(" Maximum memory pitch: %lld\n", device.memPitch);
// printf(" Texture alignment: %u\n", device.textureAlignment);
// printf(" Texture Pitch alignment: %u\n", device.texturePitchAlignment);
// Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2]
maxSurfaceTexture3DDimension = (device.maxTexture3D[2]<device.maxSurface3D[2]) ? device.maxTexture3D[2]:device.maxSurface3D[2];
//printf("DEVICE => maxSurfaceTexture3DDimension = %d (device.maxTexture3D[2] = %d - device.maxSurface3D[2] = %d)\n", maxSurfaceTexture3DDimension, device.maxTexture3D[2], device.maxSurface3D[2]); // Set maximum Size of Texture
//Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3)
// Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3)
//按照目前配置必定 maxFeasibleSosZLayerCount = 2;
switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
{
case 0: // Mit Textur -> 1ne SOS-ZLayer
maxFeasibleSosZLayerCount = 1;
break;
case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer
maxFeasibleSosZLayerCount = 2;
break;
}
#ifdef debug_OutputVariables
printf( "Set maxFeasibleSosZLayerCount = %u\n", maxFeasibleSosZLayerCount);
#endif
{
case 0: // Mit Textur -> 1ne SOS-ZLayer
maxFeasibleSosZLayerCount = 1;
break;
case 1: // Mit Textur & Interpolation -> 2 SOS-ZLayer
maxFeasibleSosZLayerCount = 2;
break;
}
//printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
//printf( "maxSurfaceTexture3DDimension= %i\n", maxSurfaceTexture3DDimension);
maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount; // Max Anzahl der Ascans in einer Teiltabelle (1024)
// Fuer Ascan-Index-Varainte von SAFT werden mehrere Texturen benoetigt, da die Anzahl der Z_layer limitiert ist.
// Um 3D-Interpolation zu ermoeglichen muessen jeweils 2 Z-Layer pro A-Scan vorhanden sein.
// --> 2*nAscans < maxSurfaceTexture3DDimension(Fermi & Kepler: 2048) ==> maximal 1024 Em/Rec - Kombinationen koennen in einem Surface/Textur gespeichert werden
// maxSurfaceTexture3DDimension = maximale Groesse die erlaubt ist (2048)
// TableAscanIndexAllocationCount = Anzahl der TeilSurfaces ==> auch Anzahl der benoetigten Durchlaeufe (aktuell 4 Texturen)
// maxFeasibleSosZLayerCount = Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante)
// maxAscanIndexArraysInTexture = Anzahl der Ascans in einer Teiltabelle (1024)
// maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX (=4) // Definiert die aktuell maximal unterstuetzen Texturen im Code fuer AscanIndex
// neededAscanBatchCount = Anzahl an benoetigten Durchlaeufe des SAFTs um alle Ascans abarbeiten zu koennen
// memoryCheck(); // Freier Speicher am Anfang ausgeben
maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount; // Max Anzahl der Ascans in einer Teiltabelle (1024)
// if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){
if (memoryGPUfree() <= 2500000000)
{ // IF GPUMemory < 2.5GB only 1 Surface can be used
maxSupportedTexturesForAscanIndex = 1;
}
neededAscanBatchCount = ceil((float)aScanCount / (maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount) / maxSupportedTexturesForAscanIndex);
// Determine amount of PartSurfaces
if (neededAscanBatchCount > 1)
{
TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex; // Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen
}
else
{
TableAscanIndexAllocationCount = (int)ceil((float)aScanCount / (maxAscanIndexArraysInTexture)); // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen
}
#ifdef debug_OutputAScanIndexMemoryDivision
printf("%s :\n", device.name);
printf(" Total memory %lld Bytes\n", memoryGPUtotal() );
printf(" Free memory %lld Bytes\n", memoryGPUfree() );
printf(" => Used memory %lld Bytes\n", (memoryGPUtotal()-memoryGPUfree()));
#endif
//memoryCheck(); // Freier Speicher am Anfang ausgeben
// Set Block and Grid-Dimensions for GPU Threads
genericSAFTBlockDimensions = fixedBlockDimensions; // fixedBlockDimensions = Parameter BlockDim_XYZ
genericSAFTGridDimensions =
dim3((IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x - 1) / genericSAFTBlockDimensions.x, // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y - 1) / genericSAFTBlockDimensions.y, // in Blockgroesse geteilt werden kann, muss ein weiterer
(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z - 1) / genericSAFTBlockDimensions.z // Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert.
);
// if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){
if (memoryGPUfree() <= 2500000000){ // IF GPUMemory < 2.5GB only 1 Surface can be used
maxSupportedTexturesForAscanIndex = 1;
#ifdef debug_OutputAScanIndexMemoryDivision
printf("Free GPU Memory: %lld < 2.5GB\n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", memoryGPUfree(), maxSupportedTexturesForAscanIndex );
//printf("GeForce GTX 690/590 \n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", maxSupportedTexturesForAscanIndex );
#endif
}
#ifdef debug_OutputAScanIndexMemoryDivision
printf( "--> maxSupportedTexturesForAscanIndex %i \n", maxSupportedTexturesForAscanIndex);
//printf( "--> TableAscanIndexAllocationCount %i \n", TableAscanIndexAllocationCount);
#endif
neededAscanBatchCount = ceil((float)aScanCount/(maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount)/maxSupportedTexturesForAscanIndex);
#ifdef debug_OutputAScanIndexMemoryDivision
printf("aScanCount %i -> neededAscanBatchCount = %i!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", aScanCount, neededAscanBatchCount);
//printf("totalGlobalMem = %lld Byte\n", device.totalGlobalMem);
//printf("multiProcessorCount = %i MultiProcessors\n", device.multiProcessorCount);
#endif
// Determine amount of PartSurfaces
if (neededAscanBatchCount > 1){
TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex; // Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen
} else {
TableAscanIndexAllocationCount = (int)ceil((float)aScanCount/(maxAscanIndexArraysInTexture)); // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen
}
// Set Block and Grid-Dimensions for GPU Threads
genericSAFTBlockDimensions = fixedBlockDimensions; // fixedBlockDimensions = Parameter BlockDim_XYZ
genericSAFTGridDimensions = dim3(
(IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x, // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y, // in Blockgroesse geteilt werden kann, muss ein weiterer
(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z // Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert.
);
#ifdef debug_OutputVariables
printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
printf( "genericSAFTGridDimensions X,Y,Z = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
#endif
// Outputsize of SOS Volume
// Outputsize of SOS Volume
SOSVolume_VoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
SOSVolume_Bytes = SOSVolume_VoxelCount * sizeof(float);
#ifdef debug_OutputVariables
printf(" SOSVolume_VoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, SOSVolume_VoxelCount);
printf(" SOSVolume_Bytes = SOSVolumeVoxelCount(%i) x sizeof(float = 4) = %i\n", SOSVolume_VoxelCount, SOSVolume_Bytes);
#endif
// Warn if Outputsize of Volume is too big for 32Bit Sytems
outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double); // Speicherbedarf fuer alle Voxel im Volumen
outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double); // Speicherbedarf fuer alle Voxel im Volumen
#ifdef debug_OutputVariables
printf(" outputVolume_VoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, outputVolume_VoxelCount);
printf(" outputVolume_Bytes [%lld x sizeof(double = 8)] = %lld\n", outputVolume_VoxelCount, outputVolume_Bytes);
#endif
aScan_Bytes = aScanLength * sizeof(float);
aScanBatch_Bytes = aScanCount * aScan_Bytes;
//Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren
//if (outputVolume_VoxelCount > 536870912) // 536870912 = 2^32 / sizeof(double)
// std::cout << "outputVolume_Bytes > 2^32 the upper limit of unsigned integer!!!\n => Reconstruction only in 64-Bit Systems";
//Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben
#ifdef debug_OutputVariables
printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
#endif
aScan_Bytes = aScanLength * sizeof(float);
aScanBatch_Bytes = aScanCount * aScan_Bytes;
#ifdef debug_OutputVariables
printf( "aScan_Bytes = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScan_Bytes);
printf( "aScanCount = %i\n", aScanCount);
printf( "aScanBatch_Bytes = aScanCount * aScan_Bytes ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatch_Bytes);
#endif
#ifdef debug_OutputInfo
printf("\nParameter for Image Reconstruction\n");
printf( "========================================================================\n");
printf( "IMAGE_SIZE_XYZ: [%i x %i x %i]\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z);
printf( "outputVolume_VoxelCount: %lld\n", outputVolume_VoxelCount);
printf( "Increment vector/Resolution: %f\n", IMAGE_RESOLUTION);
printf( "IMAGE_STARTPOINT in m: [%f x %f x %f]\n", regionOfInterestOffset.x, regionOfInterestOffset.y, regionOfInterestOffset.z);
outputVolume_size_m.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
outputVolume_size_m.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
outputVolume_size_m.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
printf( "Volume Size in m: [%f x %f x %f]\n", outputVolume_size_m.x, outputVolume_size_m.y, outputVolume_size_m.z);
printf( "aScanCount: %i\n", aScanCount);
printf( "========================================================================\n\n");
#endif
#ifdef debug_OutputPerformance
struct timeval startProcessAscans, stopProcessAscans;
gettimeofday(&startProcessAscans, NULL);
#endif
//perform processing with AScan-Data
//===========================================================================================================
//===========================================================================================================
ullong duration;
processAScans(duration);
//===========================================================================================================
//===========================================================================================================
#ifdef debug_OutputPerformance
diff_time = (double)((stopProcessAscans.tv_sec * 1000000.0 + stopProcessAscans.tv_usec) - (startProcessAscans.tv_sec * 1000000.0 + startProcessAscans.tv_usec));
printf ("########################################################################\n");
printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Free Memory = %4.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time);
printf ("########################################################################\n");
#endif
Duration_ptr[(deviceIndex+1)] = (double)duration; // Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
#ifdef debug_OutputVariables
printf( " GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
#endif
#ifdef debug_OutputInfo
printf("Reset Device\n"); // Reset Device
#endif
// CUDA_CHECK(cudaDeviceReset()); // news 2019 commented, see above reason.
#ifdef debug_OutputFunctions
printf( "<== SAFTHandler::performReconstruction - End\n");
#endif
// perform processing with AScan-Data
//===========================================================================================================
//===========================================================================================================
ullong duration;
processAScans(duration);
//===========================================================================================================
//===========================================================================================================
Duration_ptr[(deviceIndex + 1)] = (double)duration; // Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
}
/**
@@ -402,96 +196,42 @@ void SAFTHandler::performReconstruction()
- Der SAFT Kernel erwartet Argumente in den die Grid Dimension auf drei Dimensionen reduziert wurde und die Block-Dimensionen auf nur eine Dimension reduziert ist.
- Das haengt auch von den Eigenschaften der verfuegbaren HW ab (shader model)
*/
void SAFTHandler::reduceKernelDimensions(
dim3 const & gridDimensions, ///< Input grid dimensions.
dim3 const & blockDimensions, ///< Input block dimensions.
dim3 & reducedGridDimensions, ///< Reduced output grid dimensions.
dim3 & reducedBlockDimensions ///< Reduced output block dimensions.
)
void SAFTHandler::reduceKernelDimensions(dim3 const &gridDimensions, ///< Input grid dimensions.
dim3 const &blockDimensions, ///< Input block dimensions.
dim3 &reducedGridDimensions, ///< Reduced output grid dimensions.
dim3 &reducedBlockDimensions ///< Reduced output block dimensions.
)
{
#ifdef debug_OutputFunctions
printf( "==> SAFTHandler::reduceKernelDimensions - Start\n");
#endif
if(deviceProperties[deviceId].maxGridSize[2] > 1)
if (deviceProperties[deviceId].maxGridSize[2] > 1)
{
reducedGridDimensions = gridDimensions;
#ifdef debug_OutputParameter
printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
#endif
}
else
{
reducedGridDimensions = dim3(
gridDimensions.x * gridDimensions.y,
gridDimensions.z,
1
);
#ifdef debug_OutputParameter
printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
#endif
reducedGridDimensions = dim3(gridDimensions.x * gridDimensions.y, gridDimensions.z, 1);
}
reducedBlockDimensions = dim3(blockDimensions.x * blockDimensions.y * blockDimensions.z);
#ifdef debug_OutputParameter
printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z);
#endif
#ifdef debug_OutputFunctions
printf( "<== SAFTHandler::reduceKernelDimensions - End\n");
#endif
}
/**
Determine free memory available on the current device.
*/
std::size_t memoryGPUfree()
{
#ifdef debug_OutputFunctions
printf( "==> memoryGPUfree - Start\n");
#endif
std::size_t
totalMemory,
freeMemory;
std::size_t totalMemory, freeMemory;
CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
// printf(" Total memory %lld Bytes\n", totalMemory);
// printf(" Free memory %lld Bytes\n", freeMemory);
// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
#ifdef debug_OutputFunctions
printf( "<== memoryGPUfree - End\n");
#endif
return freeMemory;
}
/**
Determine free memory available on the current device.
*/
std::size_t memoryGPUtotal()
{
#ifdef debug_OutputFunctions
printf( "==> current - Start\n");
#endif
std::size_t
totalMemory,
freeMemory;
std::size_t totalMemory, freeMemory;
CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
// printf(" Total memory %lld Bytes\n", totalMemory);
// printf(" Free memory %lld Bytes\n", freeMemory);
// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
#ifdef debug_OutputFunctions
printf( "<== current - End\n");
#endif
return totalMemory;
}
@@ -501,111 +241,125 @@ std::size_t memoryGPUtotal()
*/
void memoryCheck()
{
#ifdef debug_OutputFunctions
printf( "==> memoryCheck - Start\n");
#endif
#ifdef debug_OutputFunctions
printf("==> memoryCheck - Start\n");
#endif
std::size_t
totalMemory,
freeMemory;
float check;
std::size_t totalMemory, freeMemory;
float check;
CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
// totalMemory
check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
if (totalMemory >= check)
{
printf(" Total memory %.3f TB\n", totalMemory / check);
}
else
{
check /= 1024.0f;
if (totalMemory >= check)
{
printf(" Total memory %.3f GB\n", totalMemory / check);
}
else
{
check /= 1024.0f;
if (totalMemory >= check)
{
printf(" Total memory %.3f MB\n", totalMemory / check);
}
else
{
check /= 1024.0f;
if (totalMemory >= check)
{
printf(" Total memory %.3f kB\n", totalMemory / check);
}
else
{
check /= 1024.0f;
if (totalMemory >= check)
printf(" Total memory %.3f Bytes\n", totalMemory / check);
}
}
}
}
//#if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory)
// printf(" Total memory %lld Bytes\n", totalMemory);
// printf(" Free memory %lld Bytes\n", freeMemory);
// printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
// totalMemory
check = 1024.0f*1024.0f*1024.0f*1024.0f;
if (totalMemory >= check){
printf(" Total memory %.3f TB\n", totalMemory/check);
} else {
check /= 1024.0f;
if (totalMemory >= check){
printf(" Total memory %.3f GB\n", totalMemory/check);
} else {
check /= 1024.0f;
if (totalMemory >= check){
printf(" Total memory %.3f MB\n", totalMemory/check);
} else {
check /= 1024.0f;
if (totalMemory >= check){
printf(" Total memory %.3f kB\n", totalMemory/check);
} else {
check /= 1024.0f;
if (totalMemory >= check)
printf(" Total memory %.3f Bytes\n", totalMemory/check);
}
}
}
}
// freeMemory
check = 1024.0f*1024.0f*1024.0f*1024.0f;
if (freeMemory >= check){
printf(" Free memory %.3f TB\n", freeMemory/check);
} else {
check /= 1024.0f;
if (freeMemory >= check){
printf(" Free memory %.3f GB\n", freeMemory/check);
} else {
check /= 1024.0f;
if (freeMemory >= check){
printf(" Free memory %.3f MB\n", freeMemory/check);
} else {
check /= 1024.0f;
if (freeMemory >= check){
printf(" Free memory %.3f kB\n", freeMemory/check);
} else {
check /= 1024.0f;
if (freeMemory >= check)
printf(" Free memory %.3f Bytes\n", freeMemory/check);
}
}
}
}
// Used Memory
check = 1024.0f*1024.0f*1024.0f*1024.0f;
if ((totalMemory-freeMemory) >= check){
printf(" Used memory %.3f TB\n", (totalMemory-freeMemory)/check);
} else {
check /= 1024.0f;
if ((totalMemory-freeMemory) >= check){
printf(" Used memory %.3f GB\n", (totalMemory-freeMemory)/check);
} else {
check /= 1024.0f;
if ((totalMemory-freeMemory) >= check){
printf(" Used memory %.3f MB\n", (totalMemory-freeMemory)/check);
} else {
check /= 1024.0f;
if ((totalMemory-freeMemory) >= check){
printf(" Used memory %.3f kB\n", (totalMemory-freeMemory)/check);
} else {
check /= 1024.0f;
if ((totalMemory-freeMemory) >= check)
printf(" Used memory %.3f Bytes\n", (totalMemory-freeMemory)/check);
}
}
}
}
//#endif
#ifdef debug_OutputFunctions
printf( "<== memoryCheck - End\n");
#endif
// freeMemory
check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
if (freeMemory >= check)
{
printf(" Free memory %.3f TB\n", freeMemory / check);
}
else
{
check /= 1024.0f;
if (freeMemory >= check)
{
printf(" Free memory %.3f GB\n", freeMemory / check);
}
else
{
check /= 1024.0f;
if (freeMemory >= check)
{
printf(" Free memory %.3f MB\n", freeMemory / check);
}
else
{
check /= 1024.0f;
if (freeMemory >= check)
{
printf(" Free memory %.3f kB\n", freeMemory / check);
}
else
{
check /= 1024.0f;
if (freeMemory >= check)
printf(" Free memory %.3f Bytes\n", freeMemory / check);
}
}
}
}
// Used Memory
check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
if ((totalMemory - freeMemory) >= check)
{
printf(" Used memory %.3f TB\n", (totalMemory - freeMemory) / check);
}
else
{
check /= 1024.0f;
if ((totalMemory - freeMemory) >= check)
{
printf(" Used memory %.3f GB\n", (totalMemory - freeMemory) / check);
}
else
{
check /= 1024.0f;
if ((totalMemory - freeMemory) >= check)
{
printf(" Used memory %.3f MB\n", (totalMemory - freeMemory) / check);
}
else
{
check /= 1024.0f;
if ((totalMemory - freeMemory) >= check)
{
printf(" Used memory %.3f kB\n", (totalMemory - freeMemory) / check);
}
else
{
check /= 1024.0f;
if ((totalMemory - freeMemory) >= check)
printf(" Used memory %.3f Bytes\n", (totalMemory - freeMemory) / check);
}
}
}
}
}
/**
Generic CUDA call wrapper.
Check the result of a CUDA operation and throw an exception if an error occurred.
@@ -614,21 +368,19 @@ void memoryCheck()
- <20>berpr<70>ft die Ergebnisse einer CUDA Operation und wirft eine Exception wenn ein Fehler auftritt
- Das wird wird mit einer Kombination mit einem Makro in saft.hpp genutzt.
*/
//inline // Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
void performCUDAResultCheck(
cudaError_t result, ///< Result of the CUDA operation.
std::string const & file, ///< Path to the source code file.
int line ///< Line within the source code
)
// inline // Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
void performCUDAResultCheck(cudaError_t result, ///< Result of the CUDA operation.
std::string const &file, ///< Path to the source code file.
int line ///< Line within the source code
)
{
if(result != cudaSuccess)
if (result != cudaSuccess)
{
//printf("A CUDA operation failed in file \"%s\" (line %i): %s \n", file, line, cudaGetErrorString(result).c_str() );
printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );
// printf("A CUDA operation failed in file \"%s\" (line %i): %s \n", file, line, cudaGetErrorString(result).c_str() );
printf("%s\n", cudaGetErrorString(cudaGetLastError()));
//std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
//std::cout << errorMessage << std::endl;
// std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
// std::cout << errorMessage << std::endl;
printf("-> Error occurred");
}
}

View File

@@ -1,15 +0,0 @@
#include <iostream>
#include "saft.hpp"
/*!
This is the central CUDA file which really just includes the other modules.
This is done because CUDA does not support external references for referencing data from other compilation units.
- Dies ist das zentrale CUDA-File welches nur die anderen Module einbindet
- Das wird gemacht, weil CUDA keine externen Referenzen unterst<73>tzt, um Daten von anderen Compilierungs Einheiten zu referenzieren.
*/
// #include "kernel/rayTracing.cuh" // GPU-Code für Bresenham
// #include "kernel/precalculateSpeedOfSoundKernel.cuh" // GPU-Code Partitionierung für Bresenham. Ruft den Bresenham auf.
// #include "kernel/saftKernel.cuh" // GPU-Kernel für SAFT

View File

@@ -1,154 +1,88 @@
#pragma once
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <stdio.h> // standard input/output
#include <vector> // stl vector header
#include <stdio.h> // standard input/output
#include <string>
#include <vector> // stl vector header
typedef unsigned char uchar;
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;
typedef unsigned long long ullong;
//Define Outputs for Debugmode
//============================
//#define debug_OutputFormat_German // German Format , instead . for numbers
//#define debug_OutputFunctions // Funktionenaufrufe ausgeben
//#define debug_OutputVariables // Werte der Variablen ausgeben
//#define debug_OutputParameter // Uebersicht der Eingabedaten anzeigen sowie Infobloeke in den einzelnen Schritten
//#define debug_OutputMemory // Speicherverwaltung, Malloc, Free, Groessen
//#define debug_OutputMaxMemory // Gibt aktuellen Speicherverbrauch an, wenn memoryCheck aufgerufen wird
//#define debug_OutputInfo // Gibt Infos zu Schritten, Variablen,... aus
//#define debug_OutputPerformance // Gibt die Laufzeiten und die eizelnen Multi-GPU Performanzwerte von ProcessAscans aus
//#define debug_OutputStepsPerformance // Gibt die Laufzeiten und für die einzelnen Schritte in performCoreReconstruction aus (Copy Ascans, Precalc, PerfCoreReconstruction, copy back)
//#define debug_OutputStepsPrecalculation // Gibt Infos ueber die einzelnen Schritte der Precalculation Steps an
//#define debug_OutputHostStepsPerformance // Gibt die Laufzeiten für die eizelnen Schritte auf dem HOST aus (Preintegrated Ascans)
//#define debug_OutputSAFTHandlerThreadPerformance // Gibt die Gesamt-Laufzeiten der einzelnen Multi-GPU Threads aus
//#define debug_OutputMultiGpu // Einteilung des Volumens auf mehrerer GPUs ausgeben
//#define debug_OutputStreams // Gibt die Schritte der Berechnung der Streams aus
//#define debug_OutputSOSPaths // Gibt die Schritte und Werte der SOSPfadberechnung aus
//#define debug_OutputSOSStepsParameter // Einteilung der ZLayer in SOSZlayer
//#define debug_OutputLookUpGeometryMemoryList // Debugausgabe fuer die LookUpGeometryMemoryList (Constant Memory)
//#define debug_OutputAScanIndexMemoryDivision // Debugausgabe für die Einteilung in Surfaces da mehrere Surfaces benoetigt werden
//#define OutputVolume // Ausgabe des Volumens
// Debugging CUDA Kernels
//================================================
//#define debug_CudaSAFTKernelModes // Use variable debugMode for different calulations methods and output
//#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation // Fuer Fehlerberchnungen
//#define debug_CudaSAFTKernel
//#define debug_CudaPrecalculateKernel
//#define debug_CudaPrecalculateAscanIndexKernel // Kernel function for PrecalculateAscanIndex
//#define debug_CudaPrecalculateAscanIndexKernelProxy // Proxy function for PrecalculateAscanIndex
//#define debug_CudaFillCuArrayKernelProxy
//#define debug_CudaSAFTAscanIndexKernel // Kernel function for SAFTAscanIndex
//#define debug_CudaSAFTAscanIndexKernelDataAccess // Access and sum up values from Ascans
//#define debug_CudaRayTraceKernel
//#define debug_CudaRayTraceKernelLive
//#define DebugSetMemoryToZero // Set SOSPathMemory to Zero as Initialisation
// Define specific Hardware-Versions
#define GTX_590
//#define GTX_690
//#define GTX_TITAN
#if defined(GTX_590)
#define GTX_Fermi
#endif
#if defined(GTX_690) || defined(GTX_TITAN)
#define GTX_Kepler
#endif
typedef unsigned long ulong;
typedef unsigned long long ullong;
// Memory management of GPU and Errordetection
//================================================
//#define SaftNoTexture // Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date
//#define SaftCorrectSumAllAscan // Recalculation if too big values occur
// SAFT-Implementierung 2-stufig mit AscanIndexInterpolation
//============================================================
#define SaftUseAscanIndexInterpolation
#define noGeometryLoading
//#define SaftUseAscanIndexInterpolation_PartWise // Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen
#define AscanTextureUse1Float // Textur mit nur Float1 für SOS berechnen
// #define SaftNoTexture // Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date
// #define SaftCorrectSumAllAscan // Recalculation if too big values occur
// #define SaftUseAscanIndexInterpolation_PartWise // Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen
#define AscanTextureUse1Float // Textur mit nur Float1 für SOS berechnen
// Integration der A-scans im Vornherein durchfuehren um Samplebreite an zu rekonstruierende Aufloesung anzupassen
//=======================================================================================================================
#define preAscanIntegrationToMatchSamplerateToResolution // Integration der Ascans ueber Fensterbreite durchfuehren
//#define debug_preAscanIntegration
#define DebugSammleMin 2990 // Grenzen feeru Degbugausgabe der Werte
#define DebugSammleMax 3000
//#define preAscanIntegrationVersion1Michael // direkt übernommene Version von Michael
#define preAscanIntegrationVersion2Ernst // korrigierte Variante mit genauerer Fensterbreite
#define preAscanIntegrationToMatchSamplerateToResolution // Integration der Ascans ueber Fensterbreite durchfuehren
// #define debug_preAscanIntegration
#define DebugSammleMin 2990 // Grenzen feeru Degbugausgabe der Werte
#define DebugSammleMax 3000
// #define preAscanIntegrationVersion1Michael // direkt übernommene Version von Michael
#define preAscanIntegrationVersion2Ernst // korrigierte Variante mit genauerer Fensterbreite
// Parameter fuer SAFT-Kernel
//=======================================================================================================================
#define USED_ASCANSMEMORYREGIONS 1 // Anzahl der A-Scan-Speicherbereiche die alloziert werden
// Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen.
#define MAX_SUPPORTEDTEXTURES_FORASCANINDEX 4 // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
#define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE 710 // Definiert maximale #Receiver in einem SOSPATH-Textur
#define MATLABSAVETY_MB 25 // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten
#define USED_ASCANSMEMORYREGIONS \
1 // Anzahl der A-Scan-Speicherbereiche die alloziert werden
// Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen.
#define MAX_SUPPORTEDTEXTURES_FORASCANINDEX 4 // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
#define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE 710 // Definiert maximale #Receiver in einem SOSPATH-Textur
#define MATLABSAVETY_MB 25 // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten
#define SaftLinearInterpolation // Lineare Interpolation beim Zugriff auf A-scans durchführen
#define SaftLinearInterpolation // Lineare Interpolation beim Zugriff auf A-scans durchführen
#define SaftUseConstantMemforGeometry // Geometriedaten im Constantmemory nutzen
#define SaftUseConstantMemforGeometry // Geometriedaten im Constantmemory nutzen
// #define SaftTextureForEmRecSosPathsTablesFloat1 // Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
// #define SaftTextureForEmRecSosPathsTablesFloat2 // Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
#define SaftTextureForEmRecSosPathsTablesFloat4 // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position
//#define SaftTextureForEmRecSosPathsTablesFloat1 // Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
//#define SaftTextureForEmRecSosPathsTablesFloat2 // Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
#define SaftTextureForEmRecSosPathsTablesFloat4 // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position
#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
#define SaftTextureForEmRecSosPathsTables // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
#endif
// Several SAFT_VARIANTs
#define SAFT_VARIANT_AscanPreintegration 0
#define SAFT_VARIANT_AscanInterpolation 1
#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2 // Use interpolation while Preprocessing
#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3 // Use interpolation while Reconstruction
#define SAFT_VARIANT_CalcStandardDeviation 4 // Not yet implemented
#define SAFT_VARIANT_SumUpOverBoarderIndices 5 // Not yet implemented
#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
#define SaftTextureForEmRecSosPathsTables // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
#endif
// Several SAFT_VARIANTs
#define SAFT_VARIANT_AscanPreintegration 0
#define SAFT_VARIANT_AscanInterpolation 1
#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2 // Use interpolation while Preprocessing
#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3 // Use interpolation while Reconstruction
#define SAFT_VARIANT_CalcStandardDeviation 4 // Not yet implemented
#define SAFT_VARIANT_SumUpOverBoarderIndices 5 // Not yet implemented
// Cache <-> shared Memory
//#define SaftPreferSharedMem // cudaFuncCachePreferShared: shared memory is 48 KB
#define SaftPreferL1SharedMem // cudaFuncCachePreferL1: shared memory is 16
//#define SaftPreferNone // cudaFuncCachePreferNone: no preference
// #define SaftPreferSharedMem // cudaFuncCachePreferShared: shared memory is 48 KB
#define SaftPreferL1SharedMem // cudaFuncCachePreferL1: shared memory is 16
// #define SaftPreferNone // cudaFuncCachePreferNone: no preference
// Receiver Cache mit shared Memory (nur bei kleinen Blockgroeßen)
//#define SaftReceiverSharedMemCacheReceiverDistance
//#define SaftCacheReceiverSOS
//#define SaftReceiverSharedMemCacheReceiverSOS // Use Shared Memory for Caching
//#define SaftRegisterCacheReceiverSOS // Use Register for Caching
// #define SaftReceiverSharedMemCacheReceiverDistance
// #define SaftCacheReceiverSOS
// #define SaftReceiverSharedMemCacheReceiverSOS // Use Shared Memory for Caching
// #define SaftRegisterCacheReceiverSOS // Use Register for Caching
// Berechnung der mittleren Schallgeschwindigkeit
//================================================
#define SaftUseHarmonicMean // harmonic Mean
#define SaftUseHarmonicMean // harmonic Mean
#define SaftTextureForBresenhamSosPaths // Texturmemory für SOS-Volumen nutzen (Version without not full implemented)
//#define SaftTextureForBresenhamInterpolated //iSOS-Version --> wird nun ueber Parameter uebergeben
//#define SaftUseFastMath //FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig.
#define SaftTextureForBresenhamSosPaths // Texturmemory für SOS-Volumen nutzen (Version without not full implemented)
// #define SaftTextureForBresenhamInterpolated //iSOS-Version --> wird nun ueber Parameter uebergeben
// #define SaftUseFastMath //FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig.
//#define SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert
#define SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
#define SOS_Version2 // korrekte Version mit Definitionen im Mittelpunkt
// #define SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert
#define SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
#define SOS_Version2 // korrekte Version mit Definitionen im Mittelpunkt
// MultiGPU
//================================================
@@ -163,18 +97,16 @@ typedef unsigned long long ullong;
// constant such that 64kB of constant is fully blocked by emitter/receiver combinations
const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340;
//Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred.
// Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred.
#define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__);
//Macro used to see when a particular line of code is executed on the host.
// Macro used to see when a particular line of code is executed on the host.
#define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl
//Convenient typedefs for containers
// Convenient typedefs for containers
typedef std::vector<cudaDeviceProp> DeviceProperties;
typedef std::vector<dim3> Dimensions;
/**
Most important class in the application.
- Haupt-Klasse der Applikation
@@ -183,399 +115,288 @@ typedef std::vector<dim3> Dimensions;
*/
class SAFTHandler
{
public:
SAFTHandler(int deviceId,
int deviceIndex,
float *aScan_ptr, ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
double *output_ptr, ///< Zeiger zu den Outputdaten //std::string const & outputPath,
double *Duration_ptr, ///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
unsigned short *receiver_index_ptr, ///<
unsigned short *emitter_index_ptr, ///<
float *receiver_list_ptr, ///<
int receiver_list_Size, ///<
float *emitter_list_ptr, ///<
int emitter_list_Size, ///<
float *speed_vec_ptr,
int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Aufloesung des SoSGrid
float *att_vec_ptr, //att_vec_ptr
public:
SAFTHandler(int deviceId, int deviceIndex,
float *aScan_ptr, ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
double *output_ptr, ///< Zeiger zu den Outputdaten //std::string const & outputPath,
double *Duration_ptr, ///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
unsigned short *receiver_index_ptr, ///<
unsigned short *emitter_index_ptr, ///<
float *receiver_list_ptr, ///<
int receiver_list_Size, ///<
float *emitter_list_ptr, ///<
int emitter_list_Size, ///<
float *speed_vec_ptr, int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Aufloesung des SoSGrid
float *att_vec_ptr, // att_vec_ptr
int aScanCount,
int aScanLength,
int3 IMAGE_SIZE_XYZ,
float sampleRate,
float3 regionOfInterestOffset,
float IMAGE_RESOLUTION,
dim3 const & fixedBlockDimensions,
float debugMode,
float debugModeParameter,
//bool useFixedPartialOutputWindow,
int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION, dim3 const &fixedBlockDimensions, float debugMode,
float debugModeParameter,
// bool useFixedPartialOutputWindow,
bool SOSMode_3DVolume,
bool ATTMode_3DVolume,
bool SOSMode_3DVolume, bool ATTMode_3DVolume,
int SAFT_MODE,
int *SAFT_VARIANT,
int SAFT_VARIANT_Size,
int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size,
int *Abort_ptr
);
int *Abort_ptr);
void performReconstruction();
private:
private:
int *Abort_ptr; // Ist ein Fehler aufgetreten der zum Abburch geführt hat
// int Abort;
int *Abort_ptr; // Ist ein Fehler aufgetreten der zum Abburch geführt hat
//int Abort;
bool SOSMode_3DVolume, ATTMode_3DVolume;
bool SOSMode_3DVolume,
ATTMode_3DVolume;
int SAFT_MODE;
int *SAFT_VARIANT;
int *deviceSAFT_VARIANT;
int SAFT_VARIANT_Size;
int SAFT_MODE;
int *SAFT_VARIANT;
int *deviceSAFT_VARIANT;
int SAFT_VARIANT_Size;
int deviceId;
int deviceIndex;
float debugMode;
float debugModeParameter;
float debugMode;
float debugModeParameter;
DeviceProperties deviceProperties;
float
*aScan_ptr;
float *aScan_ptr;
// float
// *rec_vec_ptr,
// *send_vec_ptr;
// float
// *rec_vec_ptr,
// *send_vec_ptr;
// Zuordnungslisten in der geschaut wird welcher Emitter/Receiver genutzt wird (65535 = nicht genutzt, alles andere ist dann der Index)
unsigned short* hostLookUpGeometryMemoryListEmitterPtr; // Memory of hostLookUpGeometryMemoryListEmitter
unsigned short* hostLookUpGeometryMemoryListReceiverPtr; // Memory of hostLookUpGeometryMemoryListReceiver
int lookUpGeometryMemoryListEmitterSize; // Size of hostLookUpGeometryMemoryListEmitterPtr
int lookUpGeometryMemoryListReceiverSize;// Size of hostLookUpGeometryMemoryListReceiverPtr
unsigned short *hostLookUpGeometryMemoryListEmitterPtr; // Memory of hostLookUpGeometryMemoryListEmitter
unsigned short *hostLookUpGeometryMemoryListReceiverPtr; // Memory of hostLookUpGeometryMemoryListReceiver
int lookUpGeometryMemoryListEmitterSize; // Size of hostLookUpGeometryMemoryListEmitterPtr
int lookUpGeometryMemoryListReceiverSize; // Size of hostLookUpGeometryMemoryListReceiverPtr
unsigned short
*emitter_index_ptr,
*receiver_index_ptr;
unsigned short *emitter_index_ptr, *receiver_index_ptr;
float
*emitter_list_ptr,
*receiver_list_ptr;
float *emitter_list_ptr, *receiver_list_ptr;
int
receiver_list_Size,
emitter_list_Size;
int receiver_list_Size, emitter_list_Size;
double
*output_ptr;
double *output_ptr;
double
*Duration_ptr;
double *Duration_ptr;
float
Sos,
*speed_vec_ptr,
*att_vec_ptr;
float Sos, *speed_vec_ptr, *att_vec_ptr;
int3
SOSGrid_XYZ;
int3 SOSGrid_XYZ;
float3
sosOffset; ///< Startpoint of SoSGrid
float3 sosOffset; ///< Startpoint of SoSGrid
int
aScanCount,
aScanLength;
int aScanCount, aScanLength;
int3
IMAGE_SIZE_XYZ;
int3 IMAGE_SIZE_XYZ;
float3
regionOfInterestOffset; //imageStartpoint; TODO: umbenennen!
float3 regionOfInterestOffset; // imageStartpoint; TODO: umbenennen!
float
IMAGE_RESOLUTION, ///< Aufl<66>sung im OutputVolumen
IMAGE_RESOLUTION_FACTOR, ///< 1/Aufl<EFBFBD>sung im OutputVolumen
SOS_RESOLUTION, ///< Aufloesung des SoSGrid
SOS_RESOLUTION_FACTOR; ///< 1/Aufl<66>sung im SoS-Grid
float IMAGE_RESOLUTION, ///< Aufl<66>sung im OutputVolumen
IMAGE_RESOLUTION_FACTOR, ///< 1/Aufl<EFBFBD>sung im OutputVolumen
SOS_RESOLUTION, ///< Aufloesung des SoSGrid
SOS_RESOLUTION_FACTOR; ///< 1/Aufl<EFBFBD>sung im SoS-Grid
std::string
emitterGeometryPath,
receiverGeometryPath,
aScanSamplesPath,
outputPath;
std::string emitterGeometryPath, receiverGeometryPath, aScanSamplesPath, outputPath;
float *aScanSamples;
double *output;
//int aScanCount;
// int aScanCount;
int
//aScanSize,
aScan_Bytes,
//batchSize, // --> aScanCount
//aScanBatchSize;
aScanBatch_Bytes;
// aScanSize,
aScan_Bytes,
// batchSize, // --> aScanCount
// aScanBatchSize;
aScanBatch_Bytes;
float voxelSize;
float sampleRate;
//size_t
// size_t
uint64_t
//regionOfInterestVoxelCount,
outputVolume_VoxelCount,
//outputSize;
outputVolume_Bytes;
// regionOfInterestVoxelCount,
outputVolume_VoxelCount,
// outputSize;
outputVolume_Bytes;
float3 outputVolume_size_m; // ROI-Groesse in meter
float3 outputVolume_size_m; // ROI-Groesse in meter
uint64_t
partialOutputZLayerOffset;
uint64_t partialOutputZLayerOffset;
int
partialOutputZLayerOffsetCount,
partialOutputSoSZLayerCount,
currentZLayerCount,
partialSoSZLayerCount;
int partialOutputZLayerOffsetCount, partialOutputSoSZLayerCount, currentZLayerCount, partialSoSZLayerCount;
// Fuer AscanIndexInterpolation
// ------------------------------------------------------
int currentEmIndexUsedForAscanIndexCalculation;
float *deviceTextureAscanIndexFloat; // Texture adresses for precalculated AscanIndex data
//std::size_t deviceTextureAscanIndexFloatSize;
int currentEmIndexUsedForAscanIndexCalculation;
float *deviceTextureAscanIndexFloat; // Texture adresses for precalculated AscanIndex data
// std::size_t deviceTextureAscanIndexFloatSize;
cudaArray **deviceTextureAscanIndexFloatCuArray; // CudaArray for AscanIndex data
int maxSurfaceTexture3DDimension; // max Dimension in 3D --> Max size for Texture
int maxAscanIndexArraysInTexture; // = maxSurfaceTexture3DDimension/2;
int TableAscanIndexAllocationCount; // Anzahl der benoetigten AscanBlocks der Groesse 2048/4096
int maxSupportedTexturesForAscanIndex; // Definiert die maximal unterstuetzen Texturen fuer AscanIndex
int neededAscanBatchCount; // Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen
cudaArray **deviceTextureAscanIndexFloatCuArray; // CudaArray for AscanIndex data
int maxSurfaceTexture3DDimension; // max Dimension in 3D --> Max size for Texture
int maxAscanIndexArraysInTexture; // = maxSurfaceTexture3DDimension/2;
int TableAscanIndexAllocationCount; // Anzahl der benoetigten AscanBlocks der Groesse 2048/4096
int maxSupportedTexturesForAscanIndex; // Definiert die maximal unterstuetzen Texturen fuer AscanIndex
int neededAscanBatchCount; // Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen
// ------------------------------------------------------
double *currentHostOutputAdress;
// Pointer of Inputdata in memory of Ascanblock
float3
*receiver_list, // LookUpTable receiverNr -> coordinates
*emitter_list; // LookUpTable emitterNr -> coordinates
float3 *receiver_list, // LookUpTable receiverNr -> coordinates
*emitter_list; // LookUpTable emitterNr -> coordinates
unsigned short
*receiver_index, // Input Ascanblockdata: corresponding receiverNr
*emitter_index; // Input Ascanblockdata: corresponding emitterNr
unsigned short *receiver_index, // Input Ascanblockdata: corresponding receiverNr
*emitter_index; // Input Ascanblockdata: corresponding emitterNr
//float
// float
// *SoSData; // Input Ascanblockdata: Corresponding SOS value
float *speedOfSoundField; // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen
float *attenuationField; // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen
float *speedOfSoundField; // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen
float *attenuationField; // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen
#ifdef SaftUseSosAttFloat2
float2 *hostSosAttField;
#endif
float2 *hostSosAttField;
// Memorysizes
int
//speedOfSoundFieldVoxelCount, //
SOSVolume_VoxelCount, // Amount of Voxels of SOSVolume
//speedOfSoundFieldBytes, //
SOSVolume_Bytes, // Size of SOSVolume in Byte
speedOfSoundEmitterVoxelPathCountByteSize, // Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen
speedOfSoundEmitterVoxelPathSumByteSize; // Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel
// speedOfSoundFieldVoxelCount, //
SOSVolume_VoxelCount, // Amount of Voxels of SOSVolume
// speedOfSoundFieldBytes, //
SOSVolume_Bytes, // Size of SOSVolume in Byte
speedOfSoundEmitterVoxelPathCountByteSize, // Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen
speedOfSoundEmitterVoxelPathSumByteSize; // Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel
dim3
fixedBlockDimensions, // kann ws durch genericSAFTBlockDimensions ersetzt
genericSAFTBlockDimensions,
genericSAFTGridDimensions,
windowGridDimensions;
dim3 fixedBlockDimensions, // kann ws durch genericSAFTBlockDimensions ersetzt
genericSAFTBlockDimensions, genericSAFTGridDimensions, windowGridDimensions;
cudaArray **deviceAScansCuArray;
cudaArray **deviceAScansCuArray;
#ifdef SaftTextureForBresenhamSosPaths
cudaArray *deviceSosAttFieldCuArray;
#ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
cudaArray *deviceSpeedOfSoundFieldCuArray; // SOS volume
cudaArray *deviceAttenuationFieldCuArray; // ATT volume
#endif
int maxSoSReceiverArrayForTexture;
int TableVoxelToReceiverPathSosAllocationCount;
std::size_t receiver_list_Size_deviceMemory;
#ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
cudaArray *deviceSosAttFieldCuArray;
#endif
#endif
// Für Emitter ----- normal definieren
cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray; // SoSSum
cudaArray *deviceTableVoxelToEmitterPathCountCuArray; // Count
// Für Receiver ----- als Arrays definieren da zwei benoetigt
cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray; // SoSSum
cudaArray **deviceTableVoxelToReceiverPathCountCuArray; // Count
cudaArray *deviceTableVoxelToEmPathSosBothCuArray; // Emitter SoSSum + Count
cudaArray **deviceTableVoxelToRecPathSosBothCuArray; // Receiver SoSSum + Count
int maxSoSReceiverArrayForTexture;
int TableVoxelToReceiverPathSosAllocationCount;
std::size_t receiver_list_Size_deviceMemory;
#ifdef SaftTextureForEmRecSosPathsTables
// Für Emitter ----- normal definieren
cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray; //SoSSum
cudaArray *deviceTableVoxelToEmitterPathCountCuArray; //Count
// Für Receiver ----- als Arrays definieren da zwei benoetigt
cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray; //SoSSum
cudaArray **deviceTableVoxelToReceiverPathCountCuArray; //Count
#endif
#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
cudaArray *deviceTableVoxelToEmPathSosBothCuArray; //Emitter SoSSum + Count
cudaArray **deviceTableVoxelToRecPathSosBothCuArray; //Receiver SoSSum + Count
#endif
// Schallgeschwindigkeitskorrektur-Mode
float *deviceSpeedOfSoundField; // Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU
// Schallgeschwindigkeitskorrektur-Mode
float *deviceSpeedOfSoundField; // Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU
// Block-Mode
unsigned short *deviceEmitterIndex_block; // Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU
unsigned short *deviceEmitterIndex_block; // Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU
unsigned short *deviceReceiverIndex_block;
float3 *deviceListEmitterGeometry; // Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU
float3 *deviceListEmitterGeometry; // Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU
float3 *deviceListReceiverGeometry;
float *deviceSoSData_block; // Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU
// VoxelCountType // Adressen fuer Speicher der SoS-Pfade auf der GPU
// * deviceTableVoxelToEmitterPathCount,
// * deviceTableVoxelToReceiverPathCount;
float
*deviceTableVoxelToEmitterPathCountFloat, // Texture adresses for precalculated SOS data
*deviceTableVoxelToReceiverPathCountFloat,
*deviceTableVoxelToEmitterPathSosSum,
*deviceTableVoxelToReceiverPathSosSum;
float *deviceSoSData_block; // Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU
float *deviceTableVoxelToEmitterPathCountFloat, // Texture adresses for precalculated SOS data
*deviceTableVoxelToReceiverPathCountFloat, *deviceTableVoxelToEmitterPathSosSum, *deviceTableVoxelToReceiverPathSosSum;
bool *deviceValidEmitterReceiverCombinations;
int *deviceTransducerVectorAnalysisDistributionCounters;
// float3
// * deviceEmitterGeometry,
// * deviceReceiverGeometry;
int usedAmountOfEmitter, // amount of used emitter
usedAmountOfReceiver; // amount of used receiver
int usedAmountOfEmitter, // amount of used emitter
usedAmountOfReceiver; // amount of used receiver
// Output volume
double *deviceOutput;
double *deviceOutput;
//Streams used for synchronisation
cudaStream_t
copyStream,
calculationStream;
// Streams used for synchronisation
cudaStream_t copyStream, calculationStream;
//This variable describes the number of allocations used by the current SAFT mode
std::size_t aScanAllocationCount; // Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.
// This variable describes the number of allocations used by the current SAFT mode
std::size_t aScanAllocationCount; // Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.
int
invalidEmitterReceiverCombinationsCount,
validEmitterReceiverCombinationsCount;
int invalidEmitterReceiverCombinationsCount, validEmitterReceiverCombinationsCount;
Dimensions validBlockDimensions;
bool useAutoTuning;
// AutoTuningConfiguration autoTuningConfiguration;
// AutoTuningConfiguration autoTuningConfiguration;
size_t
partialOutputSize,
partialVolumeSize, // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde
partialSosPathSize, // Speicher(SOSATTPaths) , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde
partialAscanIndexSize, // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde
maxFeasibleZLayerCount, // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt.
maxFeasibleSosZLayerCount; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt.
size_t partialOutputSize,
partialVolumeSize, // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde
partialSosPathSize, // Speicher(SOSATTPaths) , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde
partialAscanIndexSize, // Speicher(AscanIndex) , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde
maxFeasibleZLayerCount, // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt.
maxFeasibleSosZLayerCount; // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt.
int
minimumAutoTuningThreadCount,
maximumAutoTuningThreadCount;
int minimumAutoTuningThreadCount, maximumAutoTuningThreadCount;
//New partial reconstruction data
// New partial reconstruction data
std::size_t partialSpeedOfSoundVoxelCount;
std::size_t partialOutputZLayerCount;
std::size_t zLayerVoxelCount;
std::size_t sosZLayerVoxelCount; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp
std::size_t sosZLayerVoxelCount; // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp
std::size_t partialOutputVoxelCount;
double diff_time; // For Time Measurement
float transferRate; // For DataTransferrate Measurement
float performRate; // For PerformSAFTrate Measurement
cudaDeviceProp deviceProp; // Ausgabe der Frequenz
double diff_time; // For Time Measurement
float transferRate; // For DataTransferrate Measurement
float performRate; // For PerformSAFTrate Measurement
cudaDeviceProp deviceProp; // Ausgabe der Frequenz
// Core reconstruction
//Core reconstruction
void processAScans(ullong & duration);
void processAScans(ullong &duration);
void performCoreReconstruction();
//Pre-calculation
// Pre-calculation
//void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount); // TODO: Funktion die nicht mehr benutzt wird?
// void analysisOfTransducerVectors();
// void normalisePerformanceStatisticsOutput();
// void printTransducerVectorStatistics();
//Auto-tuning
bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions);
// Auto-tuning
bool determineGridDimensions(dim3 const &blockDimensions, dim3 &gridDimensions);
void determineValidBlockDimensions();
void reduceKernelDimensions(dim3 const &gridDimensions, dim3 const &blockDimensions, dim3 &reducedGridDimensions, dim3 &reducedBlockDimensions);
void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions);
//Pre-calculation kernels
// Pre-calculation kernels
//------------------------------------------------------------------------
#ifdef SaftUseConstantMemforGeometry
//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
#else
//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput);
void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput);
#endif
void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float *deviceVoxelCountOutputFloat, float *deviceSpeedOfSoundSumOutput);
#ifdef SaftUseAscanIndexInterpolation
void precalculateAscanIndex
(
int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
//int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated
//int emitter_list_Size, ///< Number of emitter_array got from Matlab
//int receiver_list_Size, ///< Number of receiver_array got from Matlab
//float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
);
void precalculateAscanIndex(int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
// int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated
// int emitter_list_Size, ///< Number of emitter_array got from Matlab
// int receiver_list_Size, ///< Number of receiver_array got from Matlab
// float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
);
void precalculateAscanIndex_usePaths
(
int ascanIndex_i, ///< Offset of AscanIndex batch.
int aScanWindowSize, ///< Amount of Ascans in AscanIndex batch to process.
int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
// int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all Combinations-should be Calculated
// float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
);
void precalculateAscanIndex_usePaths(
int ascanIndex_i, ///< Offset of AscanIndex batch.
int aScanWindowSize, ///< Amount of Ascans in AscanIndex batch to process.
int currentSpeedOfSoundZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
int maxFeasibleSosZLayerCount ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
// int currentEmIndexUsedForAscanIndexCalculation, ///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all
// Combinations-should be Calculated float * deviceTextureAscanIndexFloatCuArray ///< Out: AscanIndex for the path from Emitter to voxel to
// Receiver.
);
#endif
// Initialize AScanIndexSurface
void fillCuArray
(
float useValue,
cudaArray **deviceTextureAscanIndexFloatCuArray, ///< CuArray to fill
int TableAscanIndexAllocationCount
);
//SAFT Kernel
void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray); //Ascans in CuArray f<>r Texturmemory
// Initialize AScanIndexSurface
void fillCuArray(float useValue,
cudaArray **deviceTextureAscanIndexFloatCuArray, ///< CuArray to fill
int TableAscanIndexAllocationCount);
// SAFT Kernel
void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer,
int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const &windowGridDimensions, dim3 const &gridDimensions,
dim3 const &blockDimensions, float *deviceSpeedOfSoundField, cudaArray *deviceAScansCuArray); // Ascans in CuArray f<>r Texturmemory
};
//std::string vectorToString(float3 const & vector);
//std::string voxelToString(dim3 const & voxel);
extern void memoryCheck();
extern std::size_t memoryGPUfree();
extern std::size_t memoryGPUtotal();
extern void performCUDAResultCheck(cudaError_t result, std::string const & file, int line);
extern void performCUDAResultCheck(cudaError_t result, std::string const &file, int line);