fix: fix a miss "{" bug

feat: refactor & clean cpp code in SAFT_TOFI
feat: refactor & clean kernerl dir in SAFT_TOFI
2024-11-21 10:25:45 +08:00 · 2024-11-21 09:49:34 +08:00 · 2024-11-19 16:49:21 +08:00
14 changed files with 4693 additions and 10087 deletions
--- a/SAFT_TOFI/CMakeLists.txt
+++ b/SAFT_TOFI/CMakeLists.txt
@@ -3,7 +3,10 @@ project(SaftTofi)
 set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
 enable_language(CUDA)
 find_package (OpenMP REQUIRED)
-add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/saft.cu ./src/processAScans.cpp ./src/saft.cpp )
+file(GLOB_RECURSE  cu_files ./src/*.cu)
+file(GLOB_RECURSE  cuh_files ./src/*.cuh) 
+
+add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/processAScans.cpp ./src/saft.cpp ${cu_files} ${cuh_files})
 target_include_directories(SaftTofi PRIVATE ../SAFT ./src /usr/local/cuda/include  ) 
 set_target_properties(SaftTofi PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 target_compile_options(SaftTofi PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
@@ -11,7 +14,7 @@ target_compile_options(SaftTofi PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
                       --use_fast_math
                       --ptxas-options=-v
                       -arch compute_30 -code compute_30,sm_30 
-                       >)
+                       >) 

 target_link_libraries(SaftTofi PRIVATE ${CUDA_RUNTIME_LIBRARY} )
 target_link_libraries(SaftTofi PRIVATE OpenMP::OpenMP_CXX )
--- a/SAFT_TOFI/src/SAFT_TOFI.cpp
+++ b/SAFT_TOFI/src/SAFT_TOFI.cpp
--- a/SAFT_TOFI/src/kernel/constantMemory.cuh
+++ b/SAFT_TOFI/src/kernel/constantMemory.cuh
@@ -1,33 +0,0 @@
-#include "saft.hpp"
-
-/*!
-	Emitter and receiver geometry held in constant memory, available across all functions in saft.cu because all of it is held in the same compilation unit.
-	- Emitter und Receiver Geometrie werden im Constant Memory gehalten, erreichbar f<>r alle Funktionen in Saft.cu  weil alle von ihnen in der selben Kompilierungs-Einheit gehalten werden.
-*/
-
-#ifdef SaftUseConstantMemforGeometry
-
-	#ifdef SaftUseHarmonicMean
-        
-        __constant__ float3 emitterPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-		__constant__ float3 receiverPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-
-		// __constant__ float3 emitterPOSharmon[157 * 4];
-		// __constant__ float3 receiverPOSharmon[157 * 9];
-
-		float3* constEmitterPtr = &emitterPOSharmon[0];
-		float3* constReceiverPtr = &receiverPOSharmon[0];
-	#endif
-
-
-	// LookUpTable for GeometryList and Memory Position
-    
-    __constant__ unsigned short lookUpGeometryMemoryListEmitter [MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-	__constant__ unsigned short lookUpGeometryMemoryListReceiver[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-
-	// __constant__ unsigned short lookUpGeometryMemoryListEmitter [157 * 4];
-	// __constant__ unsigned short lookUpGeometryMemoryListReceiver[157 * 9];
-
-	unsigned short* constLookUpGeometryMemoryListEmitterPtr = &lookUpGeometryMemoryListEmitter[0];
-	unsigned short* constLookUpGeometryMemoryListReceiverPtr = &lookUpGeometryMemoryListReceiver[0];
-#endif
--- a/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cu
+++ b/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cu
--- a/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cuh
+++ b/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cuh
--- a/SAFT_TOFI/src/kernel/rayTracing.cu
+++ b/SAFT_TOFI/src/kernel/rayTracing.cu
@@ -0,0 +1,3 @@
+#include "rayTracing.cuh"
+texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField;  // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField;       // Schritt 1. Textur anlegen
--- a/SAFT_TOFI/src/kernel/rayTracing.cuh
+++ b/SAFT_TOFI/src/kernel/rayTracing.cuh
@@ -1,676 +1,167 @@
-#include <stdio.h>
 #include "saft.hpp"

-// Structure of File
-//-------------------------------------------------------
-//
-// __device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat 		// Teilpfade  (genutzt)// Bestimme den SOSVoxel der zu einer Position gehoert, mit Nachkommastelle
-//
-// __device__ __forceinline__ void processRayTracedVoxelTexture 				// Für den Fall SaftUseSosAttFloat1 erstmal lassen
-// __device__ __forceinline__ void processRayTracedVoxelTextureSosAtt			// AscanIndex // Addiere lokale SOS&ATT-Werte und # Voxel im Pfad
-//
-// Bresenham
-// __device__ __forceinline__ void performRayTracedSpeedAdditionTexture			// Teilpfade  // Dreidimensionale Version des Bresenham Line Algorithmus im Float-Format
+#define SQR(X) ((X) * (X))

-// -------------------------------------------------------
+extern texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField;  // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
+extern texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField;       // Schritt 1. Textur anlegen

-// printf() is only supported
-// for devices of compute capability 2.0 and above
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
-	#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
-#endif
-
-#define SQR(X) ((X)*(X))
+#define OutputSOSPositionX 20  // Volumen Voxel im SOS-Grid!
+#define OutputSOSPositionY 20
+#define OutputSOSPositionZ 20  // Echte Z-SoS-Layer ohne Offset


-//#ifdef debug_CudaRayTraceKernel
-	#define OutputSOSPositionX  20			// Volumen Voxel im SOS-Grid!
-	#define OutputSOSPositionY  20
-	#define OutputSOSPositionZ  20 		    // Echte Z-SoS-Layer ohne Offset
-
-//	#define ER_PositionX  4				// Emitter Receiver Position im SOS-Grid!
-//	#define ER_PositionY  37
-//	#define ER_PositionZ  9
-//#endif
-
-#ifdef debug_CudaRayTraceKernelLive
-	#define OutputPositionX  250			// Volumen Voxel im Volumen!
-	#define OutputPositionY  250
-	#define OutputPositionZ  0
-
-	#define DebugSosVoxelX  10
-	#define DebugSosVoxelY  10
-	#define DebugSosVoxelZ  10
-
-	#define ER_PositionX  0				// Emitter Receiver Position im SOS-Grid!
-	#define ER_PositionY  32
-	#define ER_PositionZ  0
-#endif
-
-//Textur fuer SoSField anlegen
-
-#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-	texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField; // Schritt 1. Textur anlegen
-#endif
-
-#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-	texture<float , cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField; // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
-	texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField; // Schritt 1. Textur anlegen
-#endif
-
-
-
-/**
-   Determines the voxel within the speed of sound field associated with a position.
-   - Bestimme den Voxel in dem Schallgeschwindigkeitsfeld der zu einer Position gehoert, mit Nachkommastelle
-*/
-__device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat(		// __forceinline__ zwingt den Compiler diesen Code bei jeden Aufruf direkt einzubinden (nicht als Funktion).
-    float3 const & position, 	///< Position within the scanner.
-    float3 & voxel, 				///< This argument is written to. Output voxel.
-    float3 & sosOffset,
-    float & SOS_RESOLUTION_FACTOR
-    )
+__device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat(  // __forceinline__ zwingt den Compiler diesen Code bei jeden Aufruf direkt einzubinden (nicht als Funktion).
+    float3 const &position,                                            ///< Position within the scanner.
+    float3 &voxel,                                                     ///< This argument is written to. Output voxel.
+    float3 &sosOffset, float &SOS_RESOLUTION_FACTOR)

 {
-
-	#ifndef SOS_Version2
-		voxel.x = (position.x - sosOffset.x ) * SOS_RESOLUTION_FACTOR + 0.5f;	// SoSVoxel aus Positionsdaten bestimmen
-		voxel.y = (position.y - sosOffset.y ) * SOS_RESOLUTION_FACTOR + 0.5f;
-		voxel.z = (position.z - sosOffset.z ) * SOS_RESOLUTION_FACTOR + 0.5f;
-	#else
-		voxel.x = (position.x - sosOffset.x ) * SOS_RESOLUTION_FACTOR;	// SoSVoxel aus Positionsdaten bestimmen
-		voxel.y = (position.y - sosOffset.y ) * SOS_RESOLUTION_FACTOR;
-		voxel.z = (position.z - sosOffset.z ) * SOS_RESOLUTION_FACTOR;
-	#endif
-
+    voxel.x = (position.x - sosOffset.x) * SOS_RESOLUTION_FACTOR;  // SoSVoxel aus Positionsdaten bestimmen
+    voxel.y = (position.y - sosOffset.y) * SOS_RESOLUTION_FACTOR;
+    voxel.z = (position.z - sosOffset.z) * SOS_RESOLUTION_FACTOR;
 }

-
-
-
 /**
   Process a voxel in the Bresenham algorithm. Float Format
   Accumulate speed of sound samples and keep track of the number of voxels in the path.
   - Verarbeite einen Voxel in dem Bresenham Algorithmus
   - Addiere Schallgeschwindigkeits-Sample
 */
-__device__ __forceinline__ void processRayTracedVoxelTexture(
-	float const * currentVoxelFloat, 			///< Bresenham speed of sound voxel float.
-    int & voxelCount, 				///< This argument is written to. Number of voxels in the path so far.
-    float & totalSpeed, 			///< This argument is written to. Speed of sound sample accumulator.
-    //float const * speedOfSoundField,///< Speed of sound field data containing samples.
-    cudaArray *deviceSpeedOfSoundFieldCuArray,		///< Pointer to cudaArray for SoSFieldData
-    int3 const & SOSGrid_XYZ		///< Size of SOS-Grid in XYZ
-    )
+__device__ __forceinline__ void processRayTracedVoxelTexture(float const *currentVoxelFloat,  ///< Bresenham speed of sound voxel float.
+                                                             int &voxelCount,                 ///< This argument is written to. Number of voxels in the path so far.
+                                                             float &totalSpeed,               ///< This argument is written to. Speed of sound sample accumulator.
+                                                             // float const * speedOfSoundField,///< Speed of sound field data containing samples.
+                                                             cudaArray *deviceSpeedOfSoundFieldCuArray,  ///< Pointer to cudaArray for SoSFieldData
+                                                             int3 const &SOSGrid_XYZ                     ///< Size of SOS-Grid in XYZ
+)

 {
-	#ifdef SaftUseHarmonicMean
-		totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-	#endif
-
-    #ifdef debug_CudaRayTraceKernel
-    		if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-    		{
-    			printf("pSOS currentVoxel [%d %d %d]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-    			//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-    			printf("  totalSpeed         = %2.10f\n", totalSpeed);
-    			//printf("  totalAttenuation   = %2.10f\n", totalAttenuation);
-    			printf("  voxelCount         = %d\n",     voxelCount);
-    		}
-    #endif
+    totalSpeed += 1 / (float)tex3D(texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
 }



-
-///////////////////////////////////////////////////////////////////////////////////////////
-// AscanIndex - Variante
-// kernel version Z.316
-///////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////
-/**
-   Process a voxel in the Bresenham algorithm. Float Format
-   Accumulate speed of sound samples and keep track of the number of voxels in the path.
-   - Addiere lokale Schallgeschwindigkeits-Werte und Anzahl besuchter Voxel im Pfad
- */
-// Aufruf:  processRayTracedVoxelTextureSosAtt(currentVoxelFloat, voxelCount, totalSpeed, totalAttenuation, deviceSosAttFieldCuArray, SOSGrid_XYZ);
-
 __device__ __forceinline__ void processRayTracedVoxelTextureSosAtt(
-	float const * currentVoxelFloat, 			///< SOS-coordinates of current Voxel for Bresenham in float[3].
-	float & voxelCount, 							///< This argument is written to. Number of voxels in the path so far. // TODO: not necessary here to sum up because it is already known from Bresenham length
-    float & totalSpeed, 						///< This argument is written to. Sum Up total SOS
-    float & totalAttenuation, 					///< This argument is written to. Sum Up total Attenuation
-    //float const * speedOfSoundField,			///< Speed of sound field data containing samples.
-    cudaArray *deviceSosAttFieldCuArray,		///< Pointer to cudaArray for SoSATTFieldData  	// TODO: not  necessary if access is used with texture memory
-    int3 const & SOSGrid_XYZ					///< Size of SOS-Grid in XYZ					// TODO: only necessary for boundary-check.
-    )
+    float const *currentVoxelFloat,  ///< SOS-coordinates of current Voxel for Bresenham in float[3].
+    float &voxelCount,               ///< This argument is written to. Number of voxels in the path so far. // TODO: not necessary here to sum up because it is already known from Bresenham length
+    float &totalSpeed,               ///< This argument is written to. Sum Up total SOS
+    float &totalAttenuation,         ///< This argument is written to. Sum Up total Attenuation
+    cudaArray *deviceSosAttFieldCuArray,  ///< Pointer to cudaArray for SoSATTFieldData  	// TODO: not  necessary if access is used with texture memory
+    int3 const &SOSGrid_XYZ               ///< Size of SOS-Grid in XYZ					// TODO: only necessary for boundary-check.
+)

 {
-	float2 SosAttValue;
-
-	#ifdef SaftUseHarmonicMean
-		//totalSpeed += 1/speedOfSoundSample;		// harmonisches Mittel
-//		#ifdef SaftTextureForBresenhamInterpolated
-//			#ifndef SOS_Version2
-//				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f );
-//			#else
-//				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f );
-//			#endif
-//		#else
-			#ifndef SOS_Version2
-				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2] );
-			#else
-				//totalSpeed       += 1/tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-				//totalAttenuation +=   tex3D( texRefAbsorptionField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-
-
-				#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-					totalSpeed       += 1/tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-					//totalAttenuation +=   tex3D( texRefAbsorptionField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-				#endif
-				#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-//					if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-//					{
-//						//printf("pSOSATT pathPoint [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-//						//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-//						printf(".  totalSpeed         = %2.10f\n", totalSpeed);
-//						printf(".  totalAttenuation   = %2.10f\n", totalAttenuation);
-//						printf(".  voxelCount         = %d\n",     voxelCount);
-//					}
-					SosAttValue = tex3D( texRefSosAttField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-					totalSpeed       += 1/SosAttValue.x;
-					totalAttenuation += SosAttValue.y;
-
-
-					//if (SosAttValue.y > 20)
-					//	printf("!!!!!!!!!!!!!!!!! pSOSATT pathPoint [%f %f %f] = %f\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2], SosAttValue.y);
-
-
-//					if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-//					{
-//						//printf("pSOSATT pathPoint [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-//						//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-//						printf("..  totalSpeed         = %2.10f\n", totalSpeed);
-//						printf("..  totalAttenuation   = %2.10f\n", totalAttenuation);
-//						printf("..  voxelCount         = %d\n",     voxelCount);
-//					}
-				#endif
-
-
-			#endif
-//		#endif
-	#endif
-
-
-	#ifdef debug_CudaRayTraceKernel
-		if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-		{
-			printf("pSOSATT currentVoxel [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-			//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-			printf("  totalSpeed         = %2.10f\n", totalSpeed);
-			printf("  totalAttenuation   = %2.10f\n", totalAttenuation);
-			printf("  voxelCount         = %d\n",     voxelCount);
-		}
-	#endif
-
+    float2 SosAttValue;
+    SosAttValue = tex3D(texRefSosAttField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
+    totalSpeed += 1 / SosAttValue.x;
+    totalAttenuation += SosAttValue.y;
 }


+__device__ __forceinline__ void performRayTracedSpeedAdditionTexture(float &voxelCount,  ///< This argument is written to. Number of voxels within the path traced.

+                                                                     float &totalSpeed,  ///< This argument is written to. Sum of the speed of sound samples in the path traced.
+                                                                     float &totalAttenuation,

+                                                                     float3 const &point1f,  ///< Vector array describing the Voxelcoordinates of emitters or receivers.
+                                                                     dim3 const &point2,     ///< Vector array describing the Voxelcoordinates of Voxels.

-///////////////////////////////////////////////////////////////////////////////////////////
-// aktuell genutze Variante fuer Teilpfade
-///////////////////////////////////////////////////////////////////////////////////////////
+                                                                     cudaArray *deviceSpeedOfSoundFieldCuArray,  ///< CuArray fuer SOSFieldTextur

-/**
-   - Dreidimensionale Version des Bresenham Line Algorithmus im Float-Format
-*/
-__device__ __forceinline__ void performRayTracedSpeedAdditionTexture(
-	float & voxelCount, 					///< This argument is written to. Number of voxels within the path traced.
-
-#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-	float & totalSpeed, 				///< This argument is written to. Sum of the speed of sound samples in the path traced.
-#endif
-#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-	float & totalSpeed, 				///< This argument is written to. Sum of the speed of sound samples in the path traced.
-	float & totalAttenuation,
-#endif
-
-	float3 const & point1f,				///< Vector array describing the Voxelcoordinates of emitters or receivers.
-	dim3 const & point2,				///< Vector array describing the Voxelcoordinates of Voxels.
-
-#ifndef SaftTextureForBresenhamSosPaths
-	float const * deviceSpeedOfSoundField, 		///< Array of speed of sound samples. Dimensions ordered by speed of indices, commencing with the fastest moving one: 1. x 2. y 3. z
-#else
-	cudaArray *deviceSpeedOfSoundFieldCuArray,	///< CuArray fuer SOSFieldTextur
-#endif
-
-	int3 const & SOSGrid_XYZ,			///< Size of SOS-Grid in XYZ
-	float3 & sosOffset,
-	float & SOS_RESOLUTION,
-	float & IMAGE_RESOLUTION,
-	float3 regionOfInterestOffset,
-
-	#ifdef SaftUseConstantMemforGeometry
-		int geometry 							///< emitters=0 or receivers=1.
-	#else
-		float3 const * geometry 				///< Vector array describing the positions of emitters or receivers.
-	#endif
-	)
+                                                                     int3 const &SOSGrid_XYZ,  ///< Size of SOS-Grid in XYZ
+                                                                     float3 &sosOffset, float &SOS_RESOLUTION, float &IMAGE_RESOLUTION, float3 regionOfInterestOffset,
+                                                                     int geometry  ///< emitters=0 or receivers=1.
+)

 {
+    voxelCount = 0.0f;
+    totalSpeed = 0.0f;
+    totalAttenuation = 0.0f;

-	voxelCount = 0.0f;
-	totalSpeed = 0.0f;
-	totalAttenuation = 0.0f;
+    // Voxel-Koordinaten ebenfalls in float umwandeln
+    float voxel1f[3] = {point1f.x, point1f.y, point1f.z};  // Point1 liegt im float3 Format vor enthaellt schon +0.5

-	// Voxel-Koordinaten ebenfalls in float umwandeln
-	float voxel1f[3]= {point1f.x,point1f.y,point1f.z};    					//Point1 liegt im float3 Format vor enthaellt schon +0.5
+    // float VoxelIncrement = IMAGE_RESOLUTION/SOS_RESOLUTION;
+    float voxel2f[3];

-	//float VoxelIncrement = IMAGE_RESOLUTION/SOS_RESOLUTION;
-	float voxel2f[3];
+    voxel2f[0] = (float)point2.x;  // + 0.5f;
+    voxel2f[1] = (float)point2.y;  // + 0.5f;
+    voxel2f[2] = (float)point2.z;  // + 0.5f;

-	#ifndef SOS_Version2
-		voxel2f[0] = (regionOfInterestOffset.x - sosOffset.x ) / SOS_RESOLUTION + 0.5f;	// Start des Bildes im SOS-Grid aus Positionsdaten bestimmen
-		voxel2f[1] = (regionOfInterestOffset.y - sosOffset.y ) / SOS_RESOLUTION + 0.5f;
-		voxel2f[2] = (regionOfInterestOffset.z - sosOffset.z ) / SOS_RESOLUTION + 0.5f;
-	#endif
+    int greatestDistanceDim = 0;       // Gibt die Richtung(X,Y oder Z) an mit der grueueten Entfernung
+    int slowDim1 = 0;                  // Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
+    int slowDim2 = 0;                  // Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
+    float greatestDistance_XYZ = 0.0;  // Grueuete Distanz in die Richtung mit der grueueten Entfernung
+    int fastDirectionSteps = 0;        // Schritte die gegangen werden.

+    float m_XYZ[3];
+    float pathPoint[3];
+    if ((abs(voxel1f[0] - voxel2f[0]) <= abs(voxel1f[2] - voxel2f[2])))      // X<Z
+        if ((abs(voxel1f[1] - voxel2f[1]) <= abs(voxel1f[0] - voxel2f[0])))  // Y<Z
+        {
+            greatestDistanceDim = 2;  // => Z
+            slowDim1 = 0;
+            slowDim2 = 1;
+        }
+        else if ((abs(voxel1f[2] - voxel2f[2]) <= abs(voxel1f[1] - voxel2f[1])))  // Z<Y
+        {
+            greatestDistanceDim = 1;  // => Y
+            slowDim1 = 0;
+            slowDim2 = 2;
+        }
+        else  // Z=Y
+        {
+            greatestDistanceDim = 2;  // => Z
+            slowDim1 = 0;
+            slowDim2 = 1;
+        }
+    else                                                                     // X>Z
+        if ((abs(voxel1f[1] - voxel2f[1]) <= abs(voxel1f[0] - voxel2f[0])))  // Y<X
+        {
+            greatestDistanceDim = 0;  // => X
+            slowDim1 = 1;
+            slowDim2 = 2;
+        }
+        else  // Y>X
+        {
+            greatestDistanceDim = 1;  // => Y
+            slowDim1 = 0;
+            slowDim2 = 2;
+        }

-		#ifdef debug_CudaRayTraceKernel
-			if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-		//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-			{
-//				printf("-> performRayTracedSpeedAdditionTexture\n");
-//				printf("	point2 (Vol Pos im SoS)  [%d %d %d]\n", point2.x , point2.y , point2.z);
-//				printf("	voxel1 (E/R Pos im SoS)  [%13.6f %13.6f %13.6f]\n", voxel1f[0] , voxel1f[1] , voxel1f[2] );	// Voxel 1 = ER Position
-				printf("	voxel1f                = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-				printf("	voxel1f                = [%3i %3i %3i]\n", (int)voxel1f[0],(int)voxel1f[1],(int)voxel1f[2]);
-//				printf("	regionOfInterestOffset   [%f %f %f]\n", regionOfInterestOffset.x , regionOfInterestOffset.y , regionOfInterestOffset.z );	// Voxel 2 = Volumen Position
-//				printf("	sosOffset                [%f %f %f]\n", sosOffset.x , sosOffset.y , sosOffset.z );
-//				printf("	SOS_RESOLUTION  = %12.8f\n", SOS_RESOLUTION );
-//				printf("	VoxelIncrement  = %12.8f\n", VoxelIncrement );
-//				//printf("   -> voxel2Start (Vol Pos SoS)  [%f %f %f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 Start -> wird noch angepasst
-			}
+    // ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
+    if (voxel1f[greatestDistanceDim] >= voxel2f[greatestDistanceDim])  // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
+    {
+        fastDirectionSteps = floor(voxel1f[greatestDistanceDim] + 0.5f) - floor(voxel2f[greatestDistanceDim] + 0.5f) + 1;
+        pathPoint[greatestDistanceDim] = voxel2f[greatestDistanceDim];
+        pathPoint[slowDim1] = voxel2f[slowDim1];
+        pathPoint[slowDim2] = voxel2f[slowDim2];
+    }
+    else  // voxel2f < voxel1f   End point < Start point -> Slope negative
+    {
+        fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f) - floor(voxel1f[greatestDistanceDim] + 0.5f) + 1;
+        pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim];
+        pathPoint[slowDim1] = voxel1f[slowDim1];
+        pathPoint[slowDim2] = voxel1f[slowDim2];
+    }

-		#endif
+    // Steigung bestimmen
+    // Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
+    greatestDistance_XYZ = ((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]);  // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
+    m_XYZ[greatestDistanceDim] = 1.0f;                                                                   // Wegen Rundungsfehler bei der Division, die auftreten koennen.
+    // m_XYZ[greatestDistanceDim] = (voxel2f[greatestDistanceDim] - voxel1f[greatestDistanceDim]) / greatestDistance_XYZ;
+    m_XYZ[slowDim1] = (voxel2f[slowDim1] - voxel1f[slowDim1]) / greatestDistance_XYZ;
+    m_XYZ[slowDim2] = (voxel2f[slowDim2] - voxel1f[slowDim2]) / greatestDistance_XYZ;

-		#ifndef SOS_Version2
-			// Herausfinden wo genau der Voxel anfaengt/die Koordinaten des Voxels
-			voxel2f[0] = (float)point2.x + VoxelIncrement - fmod(((float)point2.x-voxel2f[0]), VoxelIncrement);
-			voxel2f[1] = (float)point2.y + VoxelIncrement - fmod(((float)point2.y-voxel2f[1]), VoxelIncrement);
-			voxel2f[2] = (float)point2.z + VoxelIncrement - fmod(((float)point2.z-voxel2f[2]), VoxelIncrement);
-		#else
-			//Vom Mittelpunkt ausgehen
-			voxel2f[0] = (float)point2.x;// + 0.5f;
-			voxel2f[1] = (float)point2.y;// + 0.5f;
-			voxel2f[2] = (float)point2.z;// + 0.5f;
-		#endif
+    int j = 0;
+    for (j = fastDirectionSteps; j > 0; j--)  //(Alle Punkte innerhalb der Schleife berechnen)
+    {
+        processRayTracedVoxelTextureSosAtt(pathPoint, voxelCount, totalSpeed, totalAttenuation, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
+        pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] + m_XYZ[greatestDistanceDim];
+        pathPoint[slowDim1] = pathPoint[slowDim1] + m_XYZ[slowDim1];
+        pathPoint[slowDim2] = pathPoint[slowDim2] + m_XYZ[slowDim2];
+    }

-
-		#ifdef debug_CudaRayTraceKernel
-
-			if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-		//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-			{
-				printf("   -> voxel2Angepasst (Vol Pos SoS)  [%13.6f %13.6f %13.6f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 nach anpassung
-				//printf("-> performRayTracedSpeedAdditionTexture\n");
-//				printf("	voxel1 (E/R Pos im SoS)  [%f %f %f]\n", voxel1f[0] , voxel1f[1] , voxel1f[2] );	// Voxel 1 = ER Position
-//				printf("	voxel2 (Vol Pos im SoS)  [%f %f %f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 = Volumen Position
-//				printf("	regionOfInterestOffset   [%f %f %f]\n", regionOfInterestOffset.x , regionOfInterestOffset.y , regionOfInterestOffset.z );	// Voxel 2 = Volumen Position
-//				printf("	sosOffset                [%f %f %f]\n", sosOffset.x , sosOffset.y , sosOffset.z );
-//				printf("	SOS_RESOLUTION  = %f\n", SOS_RESOLUTION );
-//				printf("	VoxelIncrement  = %f\n", VoxelIncrement );
-//				printf("	point2 (Vol Pos im SoS)  [%f %f %f]\n", point2.x , point2.y , point2.z);
-
-			}
-
-		#endif
-
-		int greatestDistanceDim = 0;		// Gibt die Richtung(X,Y oder Z) an mit der grueueten Entfernung
-		int slowDim1 = 0;					// Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
-		int slowDim2 = 0;					// Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
-		float greatestDistance_XYZ = 0.0;	// Grueuete Distanz in die Richtung mit der grueueten Entfernung
-		#ifdef debug_CudaRayTraceKernel
-			bool m_positiv = 0;				// Steigung positiv?
-		#endif
-		int fastDirectionSteps = 0;			// Schritte die gegangen werden.
-
-		#if defined(debug_CudaRayTraceKernel) || (! defined(SOS_Version2))
-			float distance_XYZ[3];
-			float mFastDirectionSteps_XYZ[3];
-			float mDistance_XYZ[3];
-		#endif
-		float m_XYZ[3];
-		float pathPoint[3];
-		#ifdef SOS_Version3
-			float endPoint[3];
-		#endif
-
-		 //  Welcher Abstand in den Dimensionen XYZ ist der groesste? Um Richtung zu bestimmen
-			#ifndef SOS_Version2
-				if ( (abs(voxel1f[0]-voxel2f[0]) <= abs(voxel1f[2]-voxel2f[2])))      		// X<Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<Z
-						greatestDistanceDim = 2;                                           				// => Z
-					else if ( (abs(voxel1f[2]-voxel2f[2]) <= abs(voxel1f[1]-voxel2f[1])))		// Z<Y
-						greatestDistanceDim = 1;                                           				// => Y
-					else                                                    					// Z=Y
-						greatestDistanceDim = 2;                                           				// => Z
-				else                                                      				// X>Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<X
-						greatestDistanceDim = 0;                                           				// => X
-					else                                                    					// Y>X
-						greatestDistanceDim = 1;                                           				// => Y
-			#else
-				if ( (abs(voxel1f[0]-voxel2f[0]) <= abs(voxel1f[2]-voxel2f[2])))      		// X<Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<Z
-					{
-						greatestDistanceDim = 2;                                           				// => Z
-						slowDim1 = 0;
-						slowDim2 = 1;
-					}
-					else if ( (abs(voxel1f[2]-voxel2f[2]) <= abs(voxel1f[1]-voxel2f[1])))		// Z<Y
-					{
-						greatestDistanceDim = 1;                                           				// => Y
-						slowDim1 = 0;
-						slowDim2 = 2;
-					}
-					else                                                    					// Z=Y
-					{
-						greatestDistanceDim = 2;                                           				// => Z
-						slowDim1 = 0;
-						slowDim2 = 1;
-					}
-				else                                                      				// X>Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<X
-					{
-						greatestDistanceDim = 0;                                           				// => X
-						slowDim1 = 1;
-						slowDim2 = 2;
-					}
-					else                                                    					// Y>X
-					{
-						greatestDistanceDim = 1;                                           				// => Y
-						slowDim1 = 0;
-						slowDim2 = 2;
-					}
-			#endif
-
-			#ifndef SOS_Version2
-				#ifdef debug_CudaRayTraceKernel
-					greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-
-					distance_XYZ[0] = (voxel2f[0] - voxel1f[0]);    // Abstand in X-Richtung
-					distance_XYZ[1] = (voxel2f[1] - voxel1f[1]);    // Abstand in Y-Richtung
-					distance_XYZ[2] = (voxel2f[2] - voxel1f[2]);    // Abstand in Z-Richtung
-				#endif
-
-				pathPoint[0] = voxel1f[0];	// Voxel1f bzw. Emitter als Startpunkt nutzen
-				pathPoint[1] = voxel1f[1];
-				pathPoint[2] = voxel1f[2];
-
-				#ifdef debug_CudaRayTraceKernel
-
-					if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-					{
-						// Ausgabe der berechneten Werte
-						printf("	\n");
-						printf("	voxel1f                       = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-						printf("	voxel2f                       = [%3f %3f %3f]\n", voxel2f[0],voxel2f[1],voxel2f[2]);
-						printf("	greatestDistanceDim(X0 Y1 Z2) = %i\n", greatestDistanceDim);
-						printf("	greatestDistance_XYZ          = %f\n", greatestDistance_XYZ);
-						printf("	distance_XYZ                  = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-						//printf("	m_positiv                     = %i\n", m_positiv);
-						//printf("	fastDirectionSteps            = %i\n", fastDirectionSteps);	// Fehler wenn negativ!
-						printf("	pathPoint                     = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					}
-
-				#endif
-
-				#ifdef debug_CudaRayTraceKernel
-					// Steigung bestimmen
-					// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-					mDistance_XYZ[0] = distance_XYZ[0] / greatestDistance_XYZ;
-					mDistance_XYZ[1] = distance_XYZ[1] / greatestDistance_XYZ;
-					mDistance_XYZ[2] = distance_XYZ[2] / greatestDistance_XYZ;
-				#endif
-
-				// ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
-				if (voxel2f[greatestDistanceDim] >= pathPoint[greatestDistanceDim])    // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
-				{
-					#ifdef debug_CudaRayTraceKernel
-						m_positiv = 1;
-					#endif
-					fastDirectionSteps = floor(voxel2f[greatestDistanceDim] ) - floor(pathPoint[greatestDistanceDim] ) + 1;
-				}
-				else                                					// voxel2f < voxel1f   Endpukt < Startpkt -> Steigung negativ
-				{
-					#ifdef debug_CudaRayTraceKernel
-						m_positiv = 0;
-					#endif
-					fastDirectionSteps = floor(pathPoint[greatestDistanceDim] ) - floor(voxel2f[greatestDistanceDim] ) + 1;
-				}
-
-				// Steigung bezogen auf die Anzahl der noetigen ganzen Schritte
-				mFastDirectionSteps_XYZ[0] = distance_XYZ[0]/(float)(fastDirectionSteps-1);
-				mFastDirectionSteps_XYZ[1] = distance_XYZ[1]/(float)(fastDirectionSteps-1);
-				mFastDirectionSteps_XYZ[2] = distance_XYZ[2]/(float)(fastDirectionSteps-1);
-
-				m_XYZ[0] = mFastDirectionSteps_XYZ[0] ;  // Steigung bezogen auf die Anzahl der nuetigen ganzen Schritte in jede Richtung nutzen
-				m_XYZ[1] = mFastDirectionSteps_XYZ[1] ;
-				m_XYZ[2] = mFastDirectionSteps_XYZ[2] ;
-			#else
-
-				// ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
-				if (voxel1f[greatestDistanceDim] >= voxel2f[greatestDistanceDim])    // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
-				{
-					fastDirectionSteps = floor(voxel1f[greatestDistanceDim] + 0.5f ) - floor(voxel2f[greatestDistanceDim] + 0.5f ) + 1;
-
-					pathPoint[greatestDistanceDim] = voxel2f[greatestDistanceDim];
-					pathPoint[slowDim1] = voxel2f[slowDim1];
-					pathPoint[slowDim2] = voxel2f[slowDim2];
-
-					#ifdef SOS_Version3
-						endPoint[0] = voxel2f[0];
-						endPoint[1] = voxel1f[1];
-						endPoint[2] = voxel1f[2];
-					#endif
-				}
-				else                                					// voxel2f < voxel1f   Endpukt < Startpkt -> Steigung negativ
-				{
-					fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f ) - floor(voxel1f[greatestDistanceDim] + 0.5f ) + 1;
-
-					pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim];
-					pathPoint[slowDim1] = voxel1f[slowDim1];
-					pathPoint[slowDim2] = voxel1f[slowDim2];
-
-					#ifdef SOS_Version3
-						endPoint[0] = voxel2f[0];
-						endPoint[1] = voxel2f[1];
-						endPoint[2] = voxel2f[2];
-					#endif
-				}
-
-
-				#ifdef debug_CudaRayTraceKernel
-					greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-
-					distance_XYZ[0] = (voxel2f[0] - voxel1f[0]);    // Abstand in X-Richtung
-					distance_XYZ[1] = (voxel2f[1] - voxel1f[1]);    // Abstand in Y-Richtung
-					distance_XYZ[2] = (voxel2f[2] - voxel1f[2]);    // Abstand in Z-Richtung
-
-					if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-					{
-						// Ausgabe der berechneten Werte
-						printf("	\n");
-						printf("	voxel1f                       = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-						printf("	voxel2f                       = [%3f %3f %3f]\n", voxel2f[0],voxel2f[1],voxel2f[2]);
-						printf("	greatestDistanceDim(X0 Y1 Z2) = %i\n", greatestDistanceDim);
-						printf("	greatestDistance_XYZ          = %f\n", greatestDistance_XYZ);
-						printf("	distance_XYZ                  = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-						//printf("	m_positiv                     = %i\n", m_positiv);
-						//printf("	fastDirectionSteps            = %i\n", fastDirectionSteps);	// Fehler wenn negativ!
-						printf("	pathPoint                     = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					}
-
-					// Steigung bestimmen
-					// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-					mDistance_XYZ[0] = distance_XYZ[0] / greatestDistance_XYZ;
-					mDistance_XYZ[1] = distance_XYZ[1] / greatestDistance_XYZ;
-					mDistance_XYZ[2] = distance_XYZ[2] / greatestDistance_XYZ;
-				#endif
-
-				// Steigung bestimmen
-				// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-				greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-				m_XYZ[greatestDistanceDim] = 1.0f;		// Wegen Rundungsfehler bei der Division, die auftreten koennen.
-				//m_XYZ[greatestDistanceDim] = (voxel2f[greatestDistanceDim] - voxel1f[greatestDistanceDim]) / greatestDistance_XYZ;
-				m_XYZ[slowDim1] = (voxel2f[slowDim1] - voxel1f[slowDim1]) / greatestDistance_XYZ;
-				m_XYZ[slowDim2] = (voxel2f[slowDim2] - voxel1f[slowDim2]) / greatestDistance_XYZ;
-			#endif
-
-
-			#ifdef debug_CudaRayTraceKernel
-
-				if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-				{
-
-					// Ausgabe der berechneten Werte
-					//printf("	greatestDistance_XYZ    = %f\n", greatestDistance_XYZ);
-					//printf("	distance_XYZ            = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-					//printf("	m_positiv               = %i\n", m_positiv);
-					printf("	fastDirectionSteps -1   = %i\n", (fastDirectionSteps-1));
-					printf("	pathPoint               = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					printf("	mFastDirectionSteps_XYZ = [%3f %3f %3f]\n", mFastDirectionSteps_XYZ[0],mFastDirectionSteps_XYZ[1],mFastDirectionSteps_XYZ[2]);
-					printf("	m_XYZ                   = [%3f %3f %3f]\n", m_XYZ[0],m_XYZ[1],m_XYZ[2]);
-
-					// Punkte auf Pfad ablaufen
-//					printf("	\n");
-//					printf("	Float Bresenham mit Anfangspkt- und Endpktkorrektur Texture\n");
-//					printf("	=========================================================================================================\n");
-//					printf("	Step    | currentVoxel  |      X      |      Y      |      Z      ||    SoSvalue    |   delta_SoSvalue\n");
-//					printf("	---------------------------------------------------------------------------------------------------------\n");
-				}
-			#endif
-
-
-
-			#ifndef SOS_Version2
-				// Startpunkt (voxel1f) festlegen
-				pathPoint[0] = voxel1f[0];
-				pathPoint[1] = voxel1f[1];
-				pathPoint[2] = voxel1f[2];
-			#endif
-
-			// Innere schleife fuer den Bresenham ohne Endpunkt
-			#ifndef SOS_Version2
-				int j = 1;
-				for  (j=1; j<=(fastDirectionSteps-1); j++) //(Start und Endpunkt werden ausserhalb der Schleife festgelegt)
-			#else
-				#ifndef SOS_Version3
-					int j=0;
-					for  (j=fastDirectionSteps; j>0; j--) //(Alle Punkte innerhalb der Schleife berechnen)
-				#else
-					int j = 1;
-					for  (j=1; j<=(fastDirectionSteps-1); j++) //(Endpunkt wird ausserhalb der Schleife festgelegt)
-				#endif
-			#endif
-				{
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-					#ifndef SaftTextureForBresenhamSosPaths
-						// AuskommentiertprocessRayTracedVoxelFloat(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundField, SOSGrid_XYZ);
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, SOSGrid_XYZ);
-					#else
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-								#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-									processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-								#endif
-
-								#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-									processRayTracedVoxelTextureSosAtt(pathPoint, voxelCount, totalSpeed, totalAttenuation, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-										#ifdef debug_CudaRayTraceKernel
-											if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhaengig
-											//if ((pathPoint[0] == DebugSosVoxelX) && (pathPoint[1] == DebugSosVoxelY) && (pathPoint[2] == DebugSosVoxelZ) && ((int)voxel1f[0] == ER_PositionX) && ((int)voxel1f[1] == ER_PositionY) && ((int)voxel1f[2] == ER_PositionZ)) // Auch von ER-Position
-											//if ((pathPoint[0] == DebugSosVoxelX) && (pathPoint[1] == DebugSosVoxelY) && (pathPoint[2] == DebugSosVoxelZ)) // Nur von SOS_XYZ-Position
-											//if ( ((int)voxel1f[0] == ER_PositionX) && ((int)voxel1f[1] == ER_PositionY) && ((int)voxel1f[2] == ER_PositionZ) ) // Nur von ER-Position
-											{
-												printf("pSOSATT currentVoxel [%f %f %f] - j(%i)\n", pathPoint[0], pathPoint[1], pathPoint[2], j);
-												//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-
-												float2 SosAttValue = tex3D( texRefSosAttField,   pathPoint[0] + 0.5f, pathPoint[1] + 0.5f, pathPoint[2] + 0.5f);
-												//totalSpeed       += SosAttValue.x;
-												//totalAttenuation += SosAttValue.y;
-
-												printf("  totalSpeed         = %2.10f\n", SosAttValue.x, totalSpeed);
-												printf("  totalAttenuation + %2.10f  = %2.10f\n", SosAttValue.y, totalAttenuation);
-												printf("  voxelCount         = %d\n",     voxelCount);
-											}
-										#endif
-
-//											SosAttValue = tex3D( texRefSosAttField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-//											totalSpeed       += SosAttValue.x;
-//											totalAttenuation += SosAttValue.y;
-								#endif
-
-					#endif
-
-					#ifdef debug_CudaRayTraceKernel
-						if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhaengig
-						//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-						{
-							//printf("	Step %3i| [%3i %3i %3i] | % 11.6f | % 11.6f | % 11.6f ||  % 13.6f | % 13.6f \n", j, (int)floor(pathPoint[0]), (int)floor(pathPoint[1]), (int)floor(pathPoint[2]), pathPoint[0], pathPoint[1], pathPoint[2], totalSpeed, speedOfSoundField[((int)floor(pathPoint[2]) * SOSGrid_XYZ.y + (int)floor(pathPoint[1])) * SOSGrid_XYZ.x + (int)floor(pathPoint[0])]);
-							printf("	Step %3i| [%3i %3i %3i] | % 17.12f  % 17.12f  % 17.12f ||  % 13.6f | % 13.6f \n", j, (int)floor(pathPoint[0]+0.5f), (int)floor(pathPoint[1]+0.5f), (int)floor(pathPoint[2]+0.5f), pathPoint[0], pathPoint[1], pathPoint[2], voxelCount/totalSpeed, (float)tex3D( texRefSpeedOfSoundField, pathPoint[0] + 0.5f, pathPoint[1] + 0.5f, pathPoint[2] + 0.5f ) );
-						}
-					#endif
-
-					#ifndef SOS_Version2
-						pathPoint[0] = pathPoint[0] + m_XYZ[0];
-						pathPoint[1] = pathPoint[1] + m_XYZ[1];
-						pathPoint[2] = pathPoint[2] + m_XYZ[2];
-					#else
-						pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] + m_XYZ[greatestDistanceDim];
-						pathPoint[slowDim1] = pathPoint[slowDim1] + m_XYZ[slowDim1];
-						pathPoint[slowDim2] = pathPoint[slowDim2] + m_XYZ[slowDim2];
-					#endif
-				}
-
-			#if (!defined (SOS_Version2)) || defined(SOS_Version3)
-	    	    // Endpunkt
-
-					#ifndef SaftTextureForBresenhamSosPaths
-					// AuskommentiertprocessRayTracedVoxelFloat(endPoint, voxelCount, totalSpeed, deviceSpeedOfSoundField, SOSGrid_XYZ);
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, SOSGrid_XYZ);
-					#else
-						processRayTracedVoxelTexture(endPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-						// TODO: Wieder einen ganzen Schritt entfernen und nur Distanz des Voxels dazufuegen --> dazu erst voxelCount auf float umstellen
-						voxelCount--;
-						pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] - m_XYZ[greatestDistanceDim];
-						pathPoint[slowDim1] = pathPoint[slowDim1] - m_XYZ[slowDim1];
-						pathPoint[slowDim2] = pathPoint[slowDim2] - m_XYZ[slowDim2];
-						voxelCount += sqrtf( SQR(pathPoint[1]-endPoint[1]) + SQR(pathPoint[2]-endPoint[2]) + SQR(pathPoint[3]-endPoint[3]) );
-					#endif
-
-					#ifdef debug_CudaRayTraceKernel
-						if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-							//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-						{
-							//printf("	Step %3i| [%3i %3i %3i] | % 11.6f | % 11.6f | % 11.6f  ||  % 13.6f | % 13.6f \n", j, (int)floor(voxel2f[0]), (int)floor(voxel2f[1]), (int)floor(voxel2f[2]), voxel2f[0], voxel2f[1], voxel2f[2], totalSpeed, speedOfSoundField[((int)floor(voxel2f[2]) * SOSGrid_XYZ.y + (int)floor(voxel2f[1])) * SOSGrid_XYZ.x + (int)floor(voxel2f[0])]);
-							printf("endPkt	Step %3i| [%3i %3i %3i] | % 17.12f  % 17.12f  % 17.12f ||  % 13.6f | % 13.6f \n", j, (int)floor(endPoint[0]+0.5f), (int)floor(endPoint[1]+0.5f), (int)floor(endPoint[2]+0.5f), endPoint[0], endPoint[1], endPoint[2], voxelCount/totalSpeed, (float)tex3D( texRefSpeedOfSoundField, endPoint[0] + 0.5f, endPoint[1] + 0.5f, endPoint[2] + 0.5f ) );
-	//						printf("<- performRayTracedSpeedAdditionTexture\n");
-						}
-					#endif
-			#else
-						// keinen Endpunkt extra dazurechnen.
-			#endif
-
-			// Anzahl der Besuchten Voxel liegt von Anfang an fest, daher nicht noetig einzeln aufzuaddieren!
-			voxelCount=fastDirectionSteps;
+    // Anzahl der Besuchten Voxel liegt von Anfang an fest, daher nicht noetig einzeln aufzuaddieren!
+    voxelCount = fastDirectionSteps;
 }
--- a/SAFT_TOFI/src/kernel/saftKernel.cu
+++ b/SAFT_TOFI/src/kernel/saftKernel.cu
@@ -0,0 +1,561 @@
+#include "saftKernel.cuh"
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> texRefAscans;  // Schritt 1. Textur anlegen
+
+// Texture for loading AscanIndexes without ATT-Correction (float)
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_0;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_1;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_2;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_3;
+
+// Texture for loading AscanIndexes with ATT-Correction (float2)
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_0;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_1;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_2;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_3;
+
+__global__ void saftKernelAscanIndex_SOS_ATT(  // Version SoSATT-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset,    ///<
+    int const speedOfSoundZLayer,  ///<
+    dim3 const gridDimensions,     ///<
+    dim3 const blockDimensions,    ///<
+    float const debugMode,           ///<
+    float const debugModeParameter,  ///<
+    int const *deviceSAFT_VARIANT,   ///<
+    double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    unsigned long long int blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    unsigned long long int memoryIndex =
+        (((unsigned long long int)IMAGE_SIZE_XYZ.y * ((unsigned long long int)regionOfInterestVoxel.z - (unsigned long long int)blockIndexOffset) + (unsigned long long int)regionOfInterestVoxel.y) *
+             (unsigned long long int)IMAGE_SIZE_XYZ.x +
+         (unsigned long long int)regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+
+    float voxelValue = 0.0;  // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    __syncthreads();
+
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues;
+
+        // #pragma unroll 3
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            // 1ten Teil mit selben Index laden
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y *
+                          tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, (float)(ascanIndexBatchOffset) + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y *
+                          tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, (float)(ascanIndexBatchOffset) + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+
+        float2 currentSOSVoxel_AscanIndexAttValues_0;
+        float2 currentSOSVoxel_AscanIndexAttValues_1;
+        float2 currentSOSVoxel_AscanIndexAttValues_2;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexAttValues_0 = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_1 = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_2 = tex3D(texTableAscanIndexFloat2_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_0.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_0.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_1.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_1.x - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_2.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_2.x - 0.5f, Offset_2 + ascanIndex_i);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues_0;
+        float2 currentSOSVoxel_AscanIndexAttValues_1;
+        float2 currentSOSVoxel_AscanIndexAttValues_2;
+        float2 currentSOSVoxel_AscanIndexAttValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // syncthreads();
+            //  load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexAttValues_0 = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_1 = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_2 = tex3D(texTableAscanIndexFloat2_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_3 = tex3D(texTableAscanIndexFloat2_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_0.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_0.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_1.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_1.x - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_2.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_2.x - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_3.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_3.x - 0.5f, Offset_3 + ascanIndex_i);
+        }
+    }
+
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
+
+__global__ void saftKernelAscanIndex_SOS(  // Version SoS-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset, int const speedOfSoundZLayer, dim3 const gridDimensions, dim3 const blockDimensions, double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    long blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    long memoryIndex = ((IMAGE_SIZE_XYZ.y * (regionOfInterestVoxel.z - blockIndexOffset) + regionOfInterestVoxel.y) * IMAGE_SIZE_XYZ.x + regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+    float voxelValue = 0.0;                                                           // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    float currentSOSVoxel_AscanIndexValues;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 3. SAFT-Algorithmus
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //  - Index aus Textur lesen und fuer zugriff auf A-Scan nutzen
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Vorgehen Ascanindexvariante
+    // 1. Bestimme Koordinaten in SOS-Koordinaten für festen Emitter und 1413 Receiver
+    //		Index = X+Xmax*Y+Xmax*Ymax*(Zmax*RecNr+Z).
+    //		Xmax = 128
+    //		Ymax = 128
+    //		Zmax = 2
+    //		=> X = x
+    //		=> Y = y
+    //		=> Z = Zmax*RecNr+z
+    // 2. Lade an dieser Stelle den Interpolierten Index.
+    //		2.a Über Texturmemory
+    //		2.b über alle 8 benachbarten Voxel oder 64 bei tricubic-Interpolation in 3D
+    // 3. Lade Ascan-Sample an diesem Index und summiere auf
+
+    __syncthreads();
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0f;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+    // float VoxelAscanIndex2 = 0.0f;
+    // float voxelValue2 = 0.0f;
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < 1; ascanIndex_i+=1.0f)	// bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // nutze immer nur 1tes Surface
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan a
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+
+            // 3ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 2 * maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float currentSOSVoxel_AscanIndexValues_0;
+        float currentSOSVoxel_AscanIndexValues_1;
+        float currentSOSVoxel_AscanIndexValues_2;
+        float currentSOSVoxel_AscanIndexValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 4
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // syncthreads();
+            //  load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexValues_0 = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_1 = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_2 = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_3 = tex3D(texTableAscanIndexFloat1_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_0 - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_1 - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_2 - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_3 - 0.5f, Offset_3 + ascanIndex_i);
+
+            // if (((int)ascanIndex_i & 31) == 0) __syncthreads();
+        }
+    }
+    else
+    {
+        // Do nothing due to only 4 are defined
+    }
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
+
+__global__ void saftKernelAscanIndex(  // Version ohne-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset, int const speedOfSoundZLayer, dim3 const gridDimensions, dim3 const blockDimensions, double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    long blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    long memoryIndex = ((IMAGE_SIZE_XYZ.y * (regionOfInterestVoxel.z - blockIndexOffset) + regionOfInterestVoxel.y) * IMAGE_SIZE_XYZ.x + regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+    float voxelValue = 0.0;                                                           // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    float currentSOSVoxel_AscanIndexValues;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 3. SAFT-Algorithmus
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //  - Index aus Textur lesen und fuer zugriff auf A-Scan nutzen
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Vorgehen Ascanindexvariante
+    // 1. Bestimme Koordinaten in SOS-Koordinaten für festen Emitter und 1413 Receiver
+    //		Index = X+Xmax*Y+Xmax*Ymax*(Zmax*RecNr+Z).
+    //		Xmax = 128
+    //		Ymax = 128
+    //		Zmax = 2
+    //		=> X = x
+    //		=> Y = y
+    //		=> Z = Zmax*RecNr+z
+    // 2. Lade an dieser Stelle den Interpolierten Index.
+    //		2.a Über Texturmemory
+    //		2.b über alle 8 benachbarten Voxel oder 64 bei tricubic-Interpolation in 3D
+    // 3. Lade Ascan-Sample an diesem Index und summiere auf
+
+    __syncthreads();
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0f;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+    // float VoxelAscanIndex2 = 0.0f;
+    // float voxelValue2 = 0.0f;
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // nutze immer nur 1tes Surface
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+
+            // 3ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 2 * maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float currentSOSVoxel_AscanIndexValues_0;
+        float currentSOSVoxel_AscanIndexValues_1;
+        float currentSOSVoxel_AscanIndexValues_2;
+        float currentSOSVoxel_AscanIndexValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            //__syncthreads();
+            // load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexValues_0 = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_1 = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_2 = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_3 = tex3D(texTableAscanIndexFloat1_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_0 - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_1 - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_2 - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_3 - 0.5f, Offset_3 + ascanIndex_i);
+        }
+    }
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
--- a/SAFT_TOFI/src/kernel/saftKernel.cuh
+++ b/SAFT_TOFI/src/kernel/saftKernel.cuh
--- a/SAFT_TOFI/src/kernel/saftPrivate.cu
+++ b/SAFT_TOFI/src/kernel/saftPrivate.cu
@@ -0,0 +1,624 @@
+#include "precalculateSpeedOfSoundKernel.cuh"
+#include "rayTracing.cuh"
+#include "saftKernel.cuh"
+#include "saft.hpp"
+
+void SAFTHandler::precalculateAscanIndex_usePaths(int ascanIndex_i, int aScanWindowSize, int currentSpeedOfSoundZLayer, int maxFeasibleSosZLayerCount)
+{
+    cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 32, 32,
+                                                                                             cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal anlegen und
+                                                                                                                           // beschreiben - Float4
+                                                                                                                           // Both Emitter Path Tables
+    // --------------------------------------------------------
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[1] = cudaAddressModeClamp;
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[2] = cudaAddressModeClamp;
+    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+    {
+    case 0:
+        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModePoint;
+        break;
+    case 1:
+        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModeLinear;
+        break;
+    }
+    texTableVoxelToEmitterPathSosBoth_preprocess.normalized = 0;
+    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToEmitterPathSosBoth_preprocess, deviceTableVoxelToEmPathSosBothCuArray, &texChannelDescTableVoxelToEmRecPathSosBoth));
+
+    // Texturmemory fuer Receiver - SosPathsTables
+    // ===================================================================================================================
+    // Both Receiver Path Tables
+    // ------------------------------------------------------
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[1] = cudaAddressModeClamp;
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[2] = cudaAddressModeClamp;
+    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+    {
+    case 0:
+        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModePoint;
+        break;
+    case 1:
+        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModeLinear;
+        break;
+    }
+    texTableVoxelToReceiverPathSosBoth0_preprocess.normalized = 0;
+
+    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth0_preprocess, deviceTableVoxelToRecPathSosBothCuArray[0], &texChannelDescTableVoxelToEmRecPathSosBoth));
+
+    if (TableVoxelToReceiverPathSosAllocationCount > 1)
+    {                                                                                          // TODO: mit Arrays flexibel programmieren, wenn moeglich!!!
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[1] = cudaAddressModeClamp;
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableVoxelToReceiverPathSosBoth1_preprocess.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth1_preprocess, deviceTableVoxelToRecPathSosBothCuArray[1], &texChannelDescTableVoxelToEmRecPathSosBoth));
+    }
+
+    if (TableVoxelToReceiverPathSosAllocationCount > 2)
+    {
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[1] = cudaAddressModeClamp;
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableVoxelToReceiverPathSosBoth2_preprocess.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth2_preprocess, deviceTableVoxelToRecPathSosBothCuArray[2], &texChannelDescTableVoxelToEmRecPathSosBoth));
+    }
+
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1, 1);
+    dim3 blocksPerGrid(1, 1, 1);  
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = maxFeasibleSosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    // Step 2. Bereite Output-Textur fuer AscanIndex vor
+
+    if (TableAscanIndexAllocationCount > 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
+    }
+    if (TableAscanIndexAllocationCount > 1)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
+    }
+    if (TableAscanIndexAllocationCount > 2)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
+    }
+    if (TableAscanIndexAllocationCount > 3)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
+    }
+
+    // Step 3. Fuehre Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden
+    // immer 1024/2048 A-Scans durchlaufen und in AscanIndex-Textur geschrieben
+
+    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
+    {  // ====================================================
+       // Blockmode with SOS-value per Ascan
+
+        precalculateAscanIndex_usePathsKernel<<<blocksPerGrid, threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                                  aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                                    // vorberechnet werden können
+                                                                                  deviceSosAttFieldCuArray,
+                                                                                  currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                              ///< grid the pre-calculation is performed
+                                                                                                              ///< for.
+                                                                                  maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                              ///< sound grid the pre-calculation is
+                                                                                                              ///< performed for.
+                                                                                  // currentEmIndexUsedForAscanIndexCalculation,	///< current Index of Em
+                                                                                  // for which the AscanIndex is calculated
+
+                                                                                  maxSoSReceiverArrayForTexture,
+
+                                                                                  deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
+                                                                                  deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten
+
+                                                                                  TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
+                                                                                                                   ///< der Groesse 2048/4096
+                                                                                  maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
+                                                                                                                   ///< CUDA Array (fest definiert fuer
+                                                                                                                   ///< bestimmung welche Textur genutzt
+                                                                                                                   ///< wird)
+
+                                                                                  deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                                        ///< the path from transducer to
+                                                                                                                        ///< voxel.
+
+                                                                                  SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                                  deviceSAFT_VARIANT);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
+    {  // ====================================================
+       // 3DVolume Mode with SOS-Correction no ATT-Correction
+
+        precalculateAscanIndex_usePathsKernel_SOS<<<blocksPerGrid,
+                                                    threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                       aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                         // vorberechnet werden können
+                                                                       deviceSosAttFieldCuArray,
+                                                                       currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                   ///< grid the pre-calculation is performed
+                                                                                                   ///< for.
+                                                                       maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                   ///< sound grid the pre-calculation is
+                                                                                                   ///< performed for.
+                                                                       // currentEmIndexUsedForAscanIndexCalculation,		///< current
+                                                                       // Index of Em for which the AscanIndex is calculated
+
+                                                                       maxSoSReceiverArrayForTexture,
+
+                                                                       deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
+                                                                       deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten
+
+                                                                       TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
+                                                                                                        ///< der Groesse 2048/4096
+                                                                       maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
+                                                                                                        ///< CUDA Array (fest definiert fuer
+                                                                                                        ///< bestimmung welche Textur genutzt
+                                                                                                        ///< wird)
+
+                                                                       deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                             ///< the path from transducer to
+                                                                                                             ///< voxel.
+
+                                                                       SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                       deviceSAFT_VARIANT);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
+    {  // ====================================================
+       // 3DVolume Mode with SOS- and ATT-Correction
+
+        precalculateAscanIndex_usePathsKernel_SOS_ATT<<<blocksPerGrid,
+                                                        threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                           aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                             // vorberechnet werden können
+                                                                           deviceSosAttFieldCuArray,
+                                                                           currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                       ///< grid the pre-calculation is performed
+                                                                                                       ///< for.
+                                                                           maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                       ///< sound grid the pre-calculation is
+                                                                                                       ///< performed for.
+                                                                           // currentEmIndexUsedForAscanIndexCalculation,		///< current
+                                                                           // Index of Em for which the AscanIndex is calculated
+                                                                           maxSoSReceiverArrayForTexture,
+                                                                           deviceEmitterIndex_block,             // Speicheradresse fuer EmitterIndexdaten
+                                                                           deviceReceiverIndex_block,            // Speicheradresse fuer ReceiverIndexdaten
+                                                                           TableAscanIndexAllocationCount,       ///< Anzahl der benoetigten AscanBlocks
+                                                                                                                 ///< der Groesse 2048/4096
+                                                                           maxAscanIndexArraysInTexture,         ///< maximale Anzahl an Em/Rec in einem
+                                                                                                                 ///< CUDA Array (fest definiert fuer
+                                                                                                                 ///< bestimmung welche Textur genutzt
+                                                                                                                 ///< wird)
+                                                                           deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                                 ///< the path from transducer to
+                                                                                                                 ///< voxel.
+                                                                           SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                           deviceSAFT_VARIANT);
+    }
+
+    CUDA_CHECK(cudaGetLastError());
+
+    // ==================================================== cudaUnbindTexture
+    // Texturmemory fuer Emitter - SosPathsTables entbinden
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToEmitterPathSosBoth_preprocess));
+    // Texturmemory fuer Receiver - SosPathsTables entbinden
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth0_preprocess));
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth1_preprocess));
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth2_preprocess));
+}
+
+void SAFTHandler::precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceListGeometry, int geometryElementCount, float *deviceVoxelCountOutputFloat,
+                                                  float *deviceSpeedOfSoundSumOutput)
+{
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
+                         1);  // max. 512 oder 1024	Threads werden vorgegeben und
+    // dim3 threadsPerBlock (SOSGrid_XYZ.x,SOSGrid_XYZ.y,1);	// max. 512 oder
+    // 1024	Threads werden vorgegeben und
+    dim3 blocksPerGrid(1, 1, 1);  // max. 65.535			Bloecke im Grid
+                                  // berechnet.	Initialisierung
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = sosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0,
+                                                                            cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal
+                                                                                                          // anlegen und beschreiben
+
+    texRefSosAttField.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texRefSosAttField.addressMode[1] = cudaAddressModeClamp;
+    texRefSosAttField.addressMode[2] = cudaAddressModeClamp;
+
+    if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1)
+    {
+        texRefSosAttField.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
+    }
+    else
+    {
+        texRefSosAttField.filterMode = cudaFilterModePoint;  // Nearest Neighbor
+    }
+    texRefSosAttField.normalized = 0;
+
+    CUDA_CHECK(cudaBindTextureToArray(&texRefSosAttField, deviceSosAttFieldCuArray,
+                                      &texChannelDescSosAttField));  // Schritt 4.1 3DArray an Texturmemory
+                                                                     // binden
+
+    if (deviceListGeometry == 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray);
+    }
+
+    if (deviceListGeometry == 1)
+    {
+        if (TableVoxelToReceiverPathSosAllocationCount > 0)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]);
+        }
+        if (TableVoxelToReceiverPathSosAllocationCount > 1)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]);
+        }
+        if (TableVoxelToReceiverPathSosAllocationCount > 2)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]);
+        }
+    }
+
+    precalculateAverageSpeedOfSoundKernel<<<blocksPerGrid, threadsPerBlock>>>(deviceSosAttFieldCuArray, firstZLayer, sosZLayerCount, deviceListGeometry, geometryElementCount,
+                                                                              maxSoSReceiverArrayForTexture,  // maximale Anzahl an Receivern in einem
+                                                                                                              // CUDA Array
+
+                                                                              // deviceVoxelCountOutput,
+                                                                              deviceVoxelCountOutputFloat, deviceSpeedOfSoundSumOutput,
+                                                                              //            regionOfInterestOffset,
+                                                                              SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter);
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaUnbindTexture(&texRefSosAttField));
+}
+
+void SAFTHandler::fillCuArray(float useValue,
+                              cudaArray **deviceTextureAscanIndexFloatCuArray,  ///< CuArray to fill
+                              int TableAscanIndexAllocationCount)
+{
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
+                         1);  // determine neccessary amount of threads
+                              // // max. 512 oder 1024
+    dim3 blocksPerGrid(1, 1,
+                       1);  // determine neccessary amount of blocks in grid	// max. 65.535
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = maxFeasibleSosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    // Step 1. Bereite Output-Textur fuer AscanIndex vor
+    if (TableAscanIndexAllocationCount > 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
+    }
+    if (TableAscanIndexAllocationCount > 1)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
+    }
+    if (TableAscanIndexAllocationCount > 2)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
+    }
+    if (TableAscanIndexAllocationCount > 3)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
+    }
+
+    // Step 2. Fuere Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden immer
+    // 1024/2048 A-Scans durchgegangen und in AscanIndex-Textur geschrieben
+    fillCuArrayKernel<<<blocksPerGrid, threadsPerBlock>>>(useValue,
+                                                          deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS
+                                                                                                ///< samples in the path
+                                                                                                ///< from transducer to
+                                                                                                ///< voxel.
+                                                          maxAscanIndexArraysInTexture,
+                                                          TableAscanIndexAllocationCount,  ///< Amount of Surfaces in
+                                                                                           ///< the Array of cuArrays
+                                                          maxFeasibleSosZLayerCount, ATTMode_3DVolume, debugMode, debugModeParameter);
+
+    CUDA_CHECK(cudaGetLastError());
+}
+
+void SAFTHandler::performSAFT(
+    int aScanIndex,              ///< The A-scan index is increased by the A-scan batch size in every iteration. It describes the offset into the A-scan samples the SAFT kernel is operating with.
+    size_t aScanWindowSize,      ///< A-scan batch size in terms of number of samples within one window.
+    int3 IMAGE_SIZE_XYZ,         ///< Bildbereichsgroesse/ROI in Voxel
+    int3 SOSGrid_XYZ,            ///< SoSGridgroesse in Voxel
+    int blockIndexOffset,        ///< Additional offset added to the z component of the block index, required because of the adjustments for partial reconstruction in different z-layers.
+    int outputWindowVoxelCount,  ///< Number of Voxels in the output window.
+    int speedOfSoundZLayer,      ///< current SoS z-layer Offset in the speed of sound grid.
+    int speedOfSoundVoxelsWithinZLayers,  ///< Number of z-layers in the speed of sound grid touched by the z-layers of the active zone of reconstruction in the region of interest.
+    int maxFeasibleSosZLayerCount,
+    int currentEmIndexUsedForAscanIndexCalculation,  ///< current Index of Em for which the AscanIndex is calculated
+    dim3 const &windowGridDimensions,  ///< Grid dimensions to be used to launch the SAFT kernel. It is smaller than the full grid dimensions and only represents the current reconstruction window.
+    dim3 const &gridDimensions,        ///< Full grid dimensions of the reconstruction.
+    dim3 const &blockDimensions,       ///< Block dimensions to be used with the SAFT kernel.
+    float *deviceSpeedOfSoundField,    ///< Pointer to SoSGrid.
+    cudaArray *deviceAScansCuArray
+    //    cudaStream_t stream ///< Stream to execute the SAFT kernel on.
+)
+{
+
+    dim3 reducedGridDimensions, reducedBlockDimensions;
+
+    reduceKernelDimensions(windowGridDimensions, blockDimensions, reducedGridDimensions, reducedBlockDimensions);
+
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS_ATT, cudaFuncCachePreferL1));
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS, cudaFuncCachePreferL1));
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex, cudaFuncCachePreferL1));
+
+    //	Texture Memory Adressing-mode 			// http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf  ->  3.2.11.1. Texture Memory S. 42
+    // cudaAddressModeClamp 	- Return values at the boarders if out-of range - default
+    // cudaAddressModeBorder	- Return 0 if out-of range
+    // cudaAddressModeMirror	- Mirror the values    		- For normalized coordinates
+    // cudaAddressModeWrap      - Repeating the values    	- For normalized coordinates
+
+    // Texturmemory fuer Ascans
+    cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);  // Beschreibung des RueckgabeFormats der Textur
+    texRefAscans.addressMode[0] = cudaAddressModeBorder;                                                    // Texturreferenz beschreiben
+    texRefAscans.addressMode[1] = cudaAddressModeBorder;
+
+    if (SAFT_VARIANT[SAFT_VARIANT_AscanInterpolation] == 1)
+    {
+        texRefAscans.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
+    }
+    else
+    {
+        texRefAscans.filterMode = cudaFilterModePoint;  // Nearest Neighbor
+    }
+
+    texRefAscans.normalized = 0;
+    CUDA_CHECK(cudaBindTextureToArray(&texRefAscans, deviceAScansCuArray, &texChannelDesc));
+
+    if (ATTMode_3DVolume == false)
+    {  // ========= 3DVolume Mode without ATT-Correction
+
+        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float>();  // Should do the same
+
+        // AscanIndex Path Tables ------------------------------------------------------
+        texTableAscanIndexFloat1_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableAscanIndexFloat1_0.addressMode[1] = cudaAddressModeClamp;
+        texTableAscanIndexFloat1_0.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableAscanIndexFloat1_0.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableAscanIndexFloat1_0.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableAscanIndexFloat1_0.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));
+
+        if (TableAscanIndexAllocationCount > 1)
+        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
+            texTableAscanIndexFloat1_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_1.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_1.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_1.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_1.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_1.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 2)
+        {
+            texTableAscanIndexFloat1_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_2.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_2.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_2.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_2.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_2.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 3)
+        {
+            texTableAscanIndexFloat1_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_3.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_3.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_3.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_3.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_3.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
+        }
+    }
+    else if (ATTMode_3DVolume == true)
+    {  // ========= 3DVolume Mode with ATT-Correction
+
+        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float2>();  // Should do the same
+
+        // AscanIndex Path Tables ------------------------------------------------------
+        texTableAscanIndexFloat2_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableAscanIndexFloat2_0.addressMode[1] = cudaAddressModeClamp;
+        texTableAscanIndexFloat2_0.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableAscanIndexFloat2_0.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableAscanIndexFloat2_0.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableAscanIndexFloat2_0.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));
+
+        if (TableAscanIndexAllocationCount > 1)
+        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
+            texTableAscanIndexFloat2_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_1.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_1.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_1.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_1.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_1.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 2)
+        {
+            texTableAscanIndexFloat2_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_2.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_2.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_2.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_2.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_2.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 3)
+        {
+            texTableAscanIndexFloat2_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_3.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_3.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_3.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_3.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_3.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
+        }
+    }
+    // Vorberechnung der Koordinaten --> schnelleres Bestimmen der Voxelposition
+    float VoxelIncrement = IMAGE_RESOLUTION / SOS_RESOLUTION;
+    float3 SosVoxelStartPosition;
+    SosVoxelStartPosition.x = (regionOfInterestOffset.x - sosOffset.x) / SOS_RESOLUTION;  // Start des Bildes im SOS-Grid aus Positionsdaten bestimmen
+    SosVoxelStartPosition.y = (regionOfInterestOffset.y - sosOffset.y) / SOS_RESOLUTION;
+    SosVoxelStartPosition.z = (regionOfInterestOffset.z - sosOffset.z) / SOS_RESOLUTION;
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // Anzahl der Teiltabellen, bereits hier berechnen oder übergeben
+    int TableAscanIndexAllocationCount = ceil((float)aScanWindowSize / (float)maxAscanIndexArraysInTexture);  // float is important due to ceiling
+    // printf( "TableAscanIndexAllocationCount = (%i/%i) = %i = %i\n", aScanWindowSize, maxAscanIndexArraysInTexture, TableAscanIndexAllocationCount,
+    // ceil(aScanWindowSize/maxAscanIndexArraysInTexture));
+
+    // Call of 3 SAFT Versions - AscanIndex-Varainten
+    // #####################################################################################################################################################################
+    // ==================================================== Blockmode with SOS-value per Ascan
+    // ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
+    // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
+
+    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
+    {  // ==================================================== Blockmode with SOS-value per Ascan
+        printf("\n\n --- SAFT (AscanIndex) without SOS currently not implemented --- \n");
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
+    {  
+		// ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
+        saftKernelAscanIndex_SOS<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(
+
+            aScanIndex,
+
+            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
+            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
+            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
+            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement, blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions, deviceOutput);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
+    {  // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
+
+        saftKernelAscanIndex_SOS_ATT<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(
+            aScanIndex,
+            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
+            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
+            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
+            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement,
+            blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions,
+            debugMode, debugModeParameter, deviceSAFT_VARIANT,
+            deviceOutput
+
+        );
+    }
+
+    // Unbind Textures
+    CUDA_CHECK(cudaUnbindTexture(&texRefAscans));
+
+    if (ATTMode_3DVolume == false)
+    {  // ========= 3DVolume Mode without ATT-Correction
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_0));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_1));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_2));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_3));
+    }
+    else if (ATTMode_3DVolume == true)
+    {  // ========= 3DVolume Mode with ATT-Correction
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_0));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_1));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_2));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_3));
+    }
+    CUDA_CHECK(cudaGetLastError());
+}
--- a/SAFT_TOFI/src/processAScans.cpp
+++ b/SAFT_TOFI/src/processAScans.cpp
--- a/SAFT_TOFI/src/saft.cpp
+++ b/SAFT_TOFI/src/saft.cpp
@@ -1,126 +1,91 @@
+#include <string.h>
+
+#include <cmath>
+#include <cstdlib>
+#include <ctime>
 #include <iostream>
 #include <vector>

-#include <cstdlib>
-#include <ctime>
-#include <cmath>
-#include <string.h>
-
-//#include <ail/file.hpp>
-//#include <ail/string.hpp>
-//#include <ail/time.hpp>
+// #include <ail/file.hpp>
+// #include <ail/string.hpp>
+// #include <ail/time.hpp>

 #include "saft.hpp"

-
-
-
 /**
   Clumsy constructor of the core reconstruction class.
   - Unbeholfener Konstruktor der Kern Rekonstuktionsklasse
 */
-SAFTHandler::SAFTHandler(
-    int deviceId, 			///< CUDA ID of the device to be used.
-    int deviceIndex,		///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
-    float *aScan_ptr,    	///< Zeiger zu den AScandaten
-    double *output_ptr,   	///< Zeiger zu den Volumen-Daten
-    double *Duration_ptr, 	///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels
-	unsigned short *receiver_index_ptr, ///<
-	unsigned short *emitter_index_ptr,  ///<
-	float *receiver_list_ptr, 			///<
-	int receiver_list_Size,
-	float *emitter_list_ptr,			///<
-	int emitter_list_Size,
-    float *speed_vec_ptr,				///< Zeiger auf die SoS-Daten in Block-/Gridmode
-    int3 SOSGrid_XYZ,
-	float3 sosOffset, 					///< Startpoint of SoSGrid
-	float SOS_RESOLUTION,				///< Aufloesung des SoSGrid
-	float *att_vec_ptr,					///< Zeiger auf die Att-Daten inm Gridmode
+SAFTHandler::SAFTHandler(int deviceId,                        ///< CUDA ID of the device to be used.
+                         int deviceIndex,                     ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
+                         float *aScan_ptr,                    ///< Zeiger zu den AScandaten
+                         double *output_ptr,                  ///< Zeiger zu den Volumen-Daten
+                         double *Duration_ptr,                ///< Zeiger auf Rueckgabewert fuer Matlab fuer Laufzeit des Kernels
+                         unsigned short *receiver_index_ptr,  ///<
+                         unsigned short *emitter_index_ptr,   ///<
+                         float *receiver_list_ptr,            ///<
+                         int receiver_list_Size,
+                         float *emitter_list_ptr,  ///<
+                         int emitter_list_Size,
+                         float *speed_vec_ptr,  ///< Zeiger auf die SoS-Daten in Block-/Gridmode
+                         int3 SOSGrid_XYZ,
+                         float3 sosOffset,      ///< Startpoint of SoSGrid
+                         float SOS_RESOLUTION,  ///< Aufloesung des SoSGrid
+                         float *att_vec_ptr,    ///< Zeiger auf die Att-Daten inm Gridmode

-    int aScanCount,
-    int aScanLength,
-    int3 IMAGE_SIZE_XYZ,
-    float sampleRate,
-    float3 regionOfInterestOffset,
-    float IMAGE_RESOLUTION,
-    dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
-    float debugMode,
-    float debugModeParameter,
-    bool SOSMode_3DVolume,
-    bool ATTMode_3DVolume,
+                         int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION,
+                         dim3 const &fixedBlockDimensions,  ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
+                         float debugMode, float debugModeParameter, bool SOSMode_3DVolume, bool ATTMode_3DVolume,

-    int SAFT_MODE,
-	int *SAFT_VARIANT,
-	int SAFT_VARIANT_Size,
+                         int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size,

-	int *Abort_ptr							///<  If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch;
-    ):
-    // Initialisation der Klassenvariablen mit den uebergebenen Werten (aehnlich Konstruktor)
-    // Initializer list of class variables
-    deviceId(deviceId),
-    deviceIndex(deviceIndex),
+                         int *Abort_ptr  ///<  If there is not enough memory abort reconstruction. Wenn Fehler --> Abbruch;
+                         )
+    : deviceId(deviceId),
+      deviceIndex(deviceIndex),

-	aScan_ptr(aScan_ptr),			//aScanSamplesPath(aScanSamplesPath),
+      aScan_ptr(aScan_ptr),  // aScanSamplesPath(aScanSamplesPath),

-	output_ptr(output_ptr),			//Path(Path),
-	Duration_ptr(Duration_ptr),
+      output_ptr(output_ptr),  // Path(Path),
+      Duration_ptr(Duration_ptr),

-	receiver_index_ptr(receiver_index_ptr), 	//
-	emitter_index_ptr(emitter_index_ptr),  		//
-	receiver_list_ptr(receiver_list_ptr), 		//
-	receiver_list_Size(receiver_list_Size),
-	emitter_list_ptr(emitter_list_ptr),		//
-	emitter_list_Size(emitter_list_Size),
-	speed_vec_ptr(speed_vec_ptr), 			///< SoS-Daten im Blockmode oder SoSGrid
-	SOSGrid_XYZ(SOSGrid_XYZ),	  			// Groesse des SoSGrids
-	sosOffset(sosOffset), 					///< Startpoint of SoSGrid
-	SOS_RESOLUTION(SOS_RESOLUTION),			///< Aufloesung des SoSGrid
+      receiver_index_ptr(receiver_index_ptr),  //
+      emitter_index_ptr(emitter_index_ptr),    //
+      receiver_list_ptr(receiver_list_ptr),    //
+      receiver_list_Size(receiver_list_Size),
+      emitter_list_ptr(emitter_list_ptr),  //
+      emitter_list_Size(emitter_list_Size),
+      speed_vec_ptr(speed_vec_ptr),    ///< SoS-Daten im Blockmode oder SoSGrid
+      SOSGrid_XYZ(SOSGrid_XYZ),        // Groesse des SoSGrids
+      sosOffset(sosOffset),            ///< Startpoint of SoSGrid
+      SOS_RESOLUTION(SOS_RESOLUTION),  ///< Aufloesung des SoSGrid

-	att_vec_ptr(att_vec_ptr),				///< Att-Daten als ATTGrid
+      att_vec_ptr(att_vec_ptr),  ///< Att-Daten als ATTGrid

-	aScanCount(aScanCount),
-	aScanLength(aScanLength),
-	IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
-	sampleRate(sampleRate),
-	regionOfInterestOffset(regionOfInterestOffset),
-	IMAGE_RESOLUTION(IMAGE_RESOLUTION),
+      aScanCount(aScanCount),
+      aScanLength(aScanLength),
+      IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
+      sampleRate(sampleRate),
+      regionOfInterestOffset(regionOfInterestOffset),
+      IMAGE_RESOLUTION(IMAGE_RESOLUTION),

-    fixedBlockDimensions(fixedBlockDimensions),
-	debugMode(debugMode),
-	debugModeParameter(debugModeParameter),
-	SOSMode_3DVolume(SOSMode_3DVolume),
-	ATTMode_3DVolume(ATTMode_3DVolume),
+      fixedBlockDimensions(fixedBlockDimensions),
+      debugMode(debugMode),
+      debugModeParameter(debugModeParameter),
+      SOSMode_3DVolume(SOSMode_3DVolume),
+      ATTMode_3DVolume(ATTMode_3DVolume),

-	SAFT_MODE(SAFT_MODE),
-	SAFT_VARIANT(SAFT_VARIANT),
-	SAFT_VARIANT_Size(SAFT_VARIANT_Size),
+      SAFT_MODE(SAFT_MODE),
+      SAFT_VARIANT(SAFT_VARIANT),
+      SAFT_VARIANT_Size(SAFT_VARIANT_Size),

-	Abort_ptr(Abort_ptr)
+      Abort_ptr(Abort_ptr)
 {
-	#ifdef debug_OutputFunctions
-		printf( "==> SAFTHandler::SAFTHandler - Start\n");
-	#endif
-
-	#ifdef debug_OutputInfo
-		printf( "SAFTHandler Constructor\n");
-	#endif
-
-    aScanAllocationCount = USED_ASCANSMEMORYREGIONS;	// Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen.
-	maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX;		// Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
-
-	IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION;   // Auflösung im OutputVolumen
-	SOS_RESOLUTION_FACTOR   = 1 / SOS_RESOLUTION;	  // Auflösung im SoS-Volumen
-
-	#ifdef debug_OutputVariables
-		printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
-		printf( "SOS_RESOLUTION_FACTOR 	= %e\n", SOS_RESOLUTION_FACTOR);
-		printf( "Samplerate = %e\n", sampleRate);
-	#endif
-
-	#ifdef debug_OutputFunctions
-		printf( "<== SAFTHandler::SAFTHandler - End\n");
-	#endif
+    aScanAllocationCount = USED_ASCANSMEMORYREGIONS;  // Anzahl der A-Scan-Speicherbereiche die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer A-ScanCopy genutzt werden sollen.
+    maxSupportedTexturesForAscanIndex = MAX_SUPPORTEDTEXTURES_FORASCANINDEX;  // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;

+    IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION;  // Auflösung im OutputVolumen
+    SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION;      // Auflösung im SoS-Volumen
 }

 /**
@@ -129,271 +94,100 @@ SAFTHandler::SAFTHandler(
 */
 void SAFTHandler::performReconstruction()
 {
-	#ifdef debug_OutputFunctions
-		printf( "==> SAFTHandler::performReconstruction - Start\n");
-	#endif
+    aScanSamples = (float *)aScan_ptr;                      // Ascan-Data
+    emitter_index = (unsigned short *)emitter_index_ptr;    // Index for associating emitter to corresponding coordinates
+    receiver_index = (unsigned short *)receiver_index_ptr;  // Index for associating receiver to corresponding coordinates
+    emitter_list = (float3 *)emitter_list_ptr;              // Lookuptable for emitter coordinates
+    receiver_list = (float3 *)receiver_list_ptr;            // Lookuptable for receiver coordinates
+    output = (double *)output_ptr;                          // Output-Data

+    speedOfSoundField = (float *)speed_vec_ptr;  // For SOS Correction
+    // SoSData 		  	= (float*) speed_vec_ptr;	// Fuer Blockmode
+    attenuationField = (float *)att_vec_ptr;  // For Attenuation Correction

-	//Pointeruebergabe der AScan-Daten, Geometrie-Daten und Output-Daten von Matlab
-	#ifdef debug_OutputInfo
-		printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
-	#endif
-	aScanSamples   		= (float*) aScan_ptr;								// Ascan-Data
-	emitter_index  		= (unsigned short*) emitter_index_ptr;			// Index for associating emitter to corresponding coordinates
-	receiver_index 		= (unsigned short*) receiver_index_ptr;			// Index for associating receiver to corresponding coordinates
-	emitter_list   		= (float3*) emitter_list_ptr;					// Lookuptable for emitter coordinates
-	receiver_list  		= (float3*) receiver_list_ptr;					// Lookuptable for receiver coordinates
-	output 		   		= (double*) output_ptr;							// Output-Data
+    // Read out GPU-Device Properties
+    // ----------------------------------------------------------
+    // List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html

-	speedOfSoundField 	= (float*) speed_vec_ptr;	// For SOS Correction
-	//SoSData 		  	= (float*) speed_vec_ptr;	// Fuer Blockmode
-	attenuationField  	= (float*) att_vec_ptr;    	// For Attenuation Correction
+    // Determine the number of GPU-Devices in System
+    int deviceCount;
+    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+    deviceProperties.reserve(static_cast<std::size_t>(deviceCount));  // Request Vector for all GPU Devices with the size deviceCount

+    // Determine the number of GPU-Devices in System
+    cudaDeviceProp &device = deviceProperties[deviceId];
+    CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));  // Read out Properties of current used GPU-Device in this thread

-	#ifdef debug_OutputInfo													// Name des Device mit ID ausgeben
-		printf( "Device ID: %i\n", deviceId);
-	#endif
+    CUDA_CHECK(cudaSetDevice(deviceId));

-	#ifdef debug_OutputFunctions
-	printf( "==> loadDevices - Start\n");
-	#endif
+    deviceProperties.push_back(device);  // Add element at the end of the vector outputProb
+    // Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2]
+    maxSurfaceTexture3DDimension = (device.maxTexture3D[2] < device.maxSurface3D[2]) ? device.maxTexture3D[2] : device.maxSurface3D[2];
+    // printf("DEVICE  =>  maxSurfaceTexture3DDimension = %d  (device.maxTexture3D[2] = %d  -  device.maxSurface3D[2] = %d)\n",  maxSurfaceTexture3DDimension, device.maxTexture3D[2],
+    // device.maxSurface3D[2]);	// Set maximum Size of Texture

-	// Read out GPU-Device Properties
-	// ----------------------------------------------------------
-	// List of all deviceProperties: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__DEVICE_g5aa4f47938af8276f08074d09b7d520c.html
-
-	// Determine the number of GPU-Devices in System
-	int deviceCount;
-	CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
-	deviceProperties.reserve(static_cast<std::size_t>(deviceCount));	// Request Vector for all GPU Devices with the size deviceCount
-
-	// Determine the number of GPU-Devices in System
-	cudaDeviceProp & device = deviceProperties[deviceId];
-	CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));		// Read out Properties of current used GPU-Device in this thread
-	//printf("%i. %s\n", deviceId, device.name);
-	//#ifdef debug_OutputInfo
-		//printf( "Device used: %18s  (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex);	// Name des Device mit ID ausgeben
-	//#endif
-	CUDA_CHECK(cudaSetDevice(deviceId));
-
-	#ifdef debug_OutputInfo
-		printf("Reset Device\n");	// Reset Device
-	#endif
-	
-    // CUDA_CHECK(cudaDeviceReset()); 2019: commented to remove re-initialization when called, avoids blocked threads later on
-
-		//printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);
-		//printf("DEVICE  =>  Maximum 3D texture dimensions:  [%d %d %d]\n",  device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
-		//printf("DEVICE  =>  Maximum width, height, and depth for a 3D surface reference bound to a CUDA array:  [%d %d %d]\n",  device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]);
-
-	#ifdef debug_OutputInfo
-			printf("%i. %s\n", deviceId, device.name);
-			printf("   Byte Total Global Mem:  %lld \n",  device.totalGlobalMem);
-			printf("   Compute Capability:     %i.%i\n",  device.major,device.minor);
-
-			printf("   Name:                            %s\n",  device.name);
-			printf("   Major revision number:           %d\n",  device.major);
-			printf("   Minor revision number:           %d\n",  device.minor);
-			printf("   Total global memory:             %lld\n",  device.totalGlobalMem);
-			printf("   Total shared memory per block:   %u\n",  device.sharedMemPerBlock);
-			printf("   Total registers per block:       %d\n",  device.regsPerBlock);
-			printf("   Warp size:                       %d\n",  device.warpSize);
-			printf("   Maximum memory pitch:            %lld\n",  device.memPitch);
-			printf("   Maximum threads per block:       %d\n",  device.maxThreadsPerBlock);
-			printf("   Maximum 3D texture dimensions:  [%d %d %d]\n",  device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
-			for (int i = 0; i < 3; ++i)
-				printf("        Maximum dimension %d of block:  %lld\n", i, device.maxThreadsDim[i]);
-			for (int i = 0; i < 3; ++i)
-				printf("        Maximum dimension %d of grid:   %lld\n", i, device.maxGridSize[i]);
-			printf("   Clock rate:                      %d\n",  device.clockRate);
-			printf("   Total constant memory:           %u\n",  device.totalConstMem);
-			printf("   Texture alignment:               %u\n",  device.textureAlignment);
-			printf("   Concurrent copy and execution:   %s\n",  (device.deviceOverlap ? "Yes" : "No"));
-			printf("   Number of multiprocessors:       %d\n",  device.multiProcessorCount);
-			printf("   Kernel execution timeout:        %s\n\n",  (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
-			printf("   Maximum 3D texture dimensions:  [%d %d %d]\n",  device.maxTexture3D[0], device.maxTexture3D[1], device.maxTexture3D[2]);
-			printf("   Maximum width, height, and depth for a 3D surface reference bound to a CUDA array:  [%d %d %d]\n",  device.maxSurface3D[0], device.maxSurface3D[1], device.maxSurface3D[2]);
-	#endif
-
-	deviceProperties.push_back(device);		// Add element at the end of the vector outputProb
-
-//	printf("   Maximum memory pitch:            %lld\n",  device.memPitch);
-//	printf("   Texture alignment:               %u\n",  device.textureAlignment);
-//	printf("   Texture Pitch alignment:         %u\n",  device.texturePitchAlignment);
-
-	// Determine minimum supported Surface size. Dependent on device.maxTexture3D[2] and device.maxSurface3D[2]
-	maxSurfaceTexture3DDimension = (device.maxTexture3D[2]<device.maxSurface3D[2]) ? device.maxTexture3D[2]:device.maxSurface3D[2];
-		//printf("DEVICE  =>  maxSurfaceTexture3DDimension = %d  (device.maxTexture3D[2] = %d  -  device.maxSurface3D[2] = %d)\n",  maxSurfaceTexture3DDimension, device.maxTexture3D[2], device.maxSurface3D[2]);	// Set maximum Size of Texture
-
-	//Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3)
+    // Set the maximal used number of SOS-ZLayers, dependend on SAFT_VARIANT-Parameter 3DVolumeInterpolationAtReconstruction (=3)
+    
+	//按照目前配置必定 maxFeasibleSosZLayerCount = 2;！！！
 	switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
-		 {
-			case  0:	// Mit Textur -> 1ne SOS-ZLayer
-				maxFeasibleSosZLayerCount = 1;
-				break;
-			case  1:	// Mit Textur & Interpolation -> 2 SOS-ZLayer
-				maxFeasibleSosZLayerCount = 2;
-				break;
-		 }
-		#ifdef debug_OutputVariables
-			printf( "Set maxFeasibleSosZLayerCount = %u\n", maxFeasibleSosZLayerCount);
-		#endif
+    {
+    case 0:  // Mit Textur -> 1ne SOS-ZLayer
+        maxFeasibleSosZLayerCount = 1;
+        break;
+    case 1:  // Mit Textur & Interpolation -> 2 SOS-ZLayer
+        maxFeasibleSosZLayerCount = 2;
+        break;
+    }

-			//printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
-			//printf( "maxSurfaceTexture3DDimension= %i\n", maxSurfaceTexture3DDimension);
+    maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount;  // Max Anzahl der Ascans in einer Teiltabelle (1024)

-	// Fuer Ascan-Index-Varainte von SAFT werden mehrere Texturen benoetigt, da die Anzahl der Z_layer limitiert ist.
-	// Um 3D-Interpolation zu ermoeglichen muessen jeweils 2 Z-Layer pro A-Scan vorhanden sein.
-	// --> 2*nAscans < maxSurfaceTexture3DDimension(Fermi & Kepler: 2048) ==> maximal 1024 Em/Rec - Kombinationen koennen in einem Surface/Textur gespeichert werden
-	// maxSurfaceTexture3DDimension					=  maximale Groesse die erlaubt ist (2048)
-	// TableAscanIndexAllocationCount  				=  Anzahl der TeilSurfaces ==> auch Anzahl der benoetigten Durchlaeufe (aktuell 4 Texturen)
-	// maxFeasibleSosZLayerCount				   	=  Anzahl der SoS-Zlayer die gleichzeitig im Speicher pro EM/REC-Kombi vorgehalten werden (1 oder 2 bei Interpolierten Variante)
-	// maxAscanIndexArraysInTexture			   		=  Anzahl der Ascans in einer Teiltabelle (1024)
-	// maxSupportedTexturesForAscanIndex			=  MAX_SUPPORTEDTEXTURES_FORASCANINDEX (=4) // Definiert die aktuell maximal unterstuetzen Texturen im Code fuer AscanIndex
-	// neededAscanBatchCount						=  Anzahl an benoetigten Durchlaeufe des SAFTs um alle Ascans abarbeiten zu koennen
+    // memoryCheck();		// Freier Speicher am Anfang ausgeben

-	maxAscanIndexArraysInTexture = maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount;	// Max Anzahl der Ascans in einer Teiltabelle (1024)
+    // if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){
+    if (memoryGPUfree() <= 2500000000)
+    {  // IF GPUMemory < 2.5GB only 1 Surface can be used
+        maxSupportedTexturesForAscanIndex = 1;
+    }

+    neededAscanBatchCount = ceil((float)aScanCount / (maxSurfaceTexture3DDimension / maxFeasibleSosZLayerCount) / maxSupportedTexturesForAscanIndex);
+    // Determine amount of PartSurfaces
+    if (neededAscanBatchCount > 1)
+    {
+        TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex;  // Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen
+    }
+    else
+    {
+        TableAscanIndexAllocationCount = (int)ceil((float)aScanCount / (maxAscanIndexArraysInTexture));  // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen
+    }

-	#ifdef debug_OutputAScanIndexMemoryDivision
-		printf("%s :\n", device.name);
-		printf("    Total memory %lld Bytes\n", memoryGPUtotal() );
-		printf("    Free  memory %lld Bytes\n", memoryGPUfree() );
-		printf(" => Used memory %lld Bytes\n", (memoryGPUtotal()-memoryGPUfree()));
-	#endif
-	//memoryCheck();		// Freier Speicher am Anfang ausgeben
+    // Set Block and Grid-Dimensions for GPU Threads
+    genericSAFTBlockDimensions = fixedBlockDimensions;  // fixedBlockDimensions = Parameter BlockDim_XYZ
+    genericSAFTGridDimensions =
+        dim3((IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x - 1) / genericSAFTBlockDimensions.x,  // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
+             (IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y - 1) / genericSAFTBlockDimensions.y,  // in Blockgroesse geteilt werden kann, muss ein weiterer
+             (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z - 1) / genericSAFTBlockDimensions.z   // Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert.
+        );

-	// if ((strcmp(device.name, "GeForce GTX 690") == 0)||(strcmp(device.name, "GeForce GTX 590") == 0)){
-	if (memoryGPUfree() <= 2500000000){	// IF GPUMemory < 2.5GB only 1 Surface can be used
-		maxSupportedTexturesForAscanIndex = 1;
-		#ifdef debug_OutputAScanIndexMemoryDivision
-			printf("Free GPU Memory: %lld < 2.5GB\n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", memoryGPUfree(), maxSupportedTexturesForAscanIndex );
-			//printf("GeForce GTX 690/590 \n --> reduce maxSupportedTexturesForAscanIndex 4 -> %i \n", maxSupportedTexturesForAscanIndex );
-		#endif
-	}
-
-	#ifdef debug_OutputAScanIndexMemoryDivision
-		printf( "--> maxSupportedTexturesForAscanIndex %i \n", maxSupportedTexturesForAscanIndex);
-		//printf( "--> TableAscanIndexAllocationCount %i    \n", TableAscanIndexAllocationCount);
-	#endif
-
-	neededAscanBatchCount = ceil((float)aScanCount/(maxSurfaceTexture3DDimension/maxFeasibleSosZLayerCount)/maxSupportedTexturesForAscanIndex);
-	#ifdef debug_OutputAScanIndexMemoryDivision
-		printf("aScanCount %i -> neededAscanBatchCount = %i!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", aScanCount, neededAscanBatchCount);
-
-		//printf("totalGlobalMem = %lld Byte\n", device.totalGlobalMem);
-		//printf("multiProcessorCount = %i MultiProcessors\n", device.multiProcessorCount);
-	#endif
-
-	// Determine amount of PartSurfaces
-	if (neededAscanBatchCount > 1){
-		TableAscanIndexAllocationCount = maxSupportedTexturesForAscanIndex;								// Wenn mehr als ein Durlauf nötig --> so viele wie möglich nutzen
-	} else {
-		TableAscanIndexAllocationCount = (int)ceil((float)aScanCount/(maxAscanIndexArraysInTexture));   // Wenn nur ein Durlauf nötig --> so wenige wie nötig nutzen
-	}
-
-
-
-	// Set Block and Grid-Dimensions for GPU Threads
-	genericSAFTBlockDimensions = fixedBlockDimensions;	// fixedBlockDimensions = Parameter BlockDim_XYZ
-	genericSAFTGridDimensions = dim3(
-		(IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x,	// hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
-		(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y,	// in Blockgroesse geteilt werden kann, muss ein weiterer
-		(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z	// Block berechnet werden. Wenn insgesamt zu viele werden sie im Kernel aussortiert.
-		);
-	#ifdef debug_OutputVariables
-		printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
-		printf( "genericSAFTGridDimensions  X,Y,Z = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
-	#endif
-
-
-
-	// Outputsize of SOS Volume
+    // Outputsize of SOS Volume
    SOSVolume_VoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
    SOSVolume_Bytes = SOSVolume_VoxelCount * sizeof(float);
-	#ifdef debug_OutputVariables
-		printf("  SOSVolume_VoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, SOSVolume_VoxelCount);
-		printf("  SOSVolume_Bytes = SOSVolumeVoxelCount(%i) x sizeof(float = 4) = %i\n", SOSVolume_VoxelCount, SOSVolume_Bytes);
-	#endif
-
-

    // Warn if Outputsize of Volume is too big for 32Bit Sytems
-	outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
-	outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double);	// Speicherbedarf fuer alle Voxel im Volumen
+    outputVolume_VoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z;  // Anzahl der Voxel im Volumen
+    outputVolume_Bytes = outputVolume_VoxelCount * sizeof(double);                                                   // Speicherbedarf fuer alle Voxel im Volumen

-	#ifdef debug_OutputVariables
-		printf("  outputVolume_VoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, outputVolume_VoxelCount);
-		printf("  outputVolume_Bytes [%lld x sizeof(double = 8)] = %lld\n", outputVolume_VoxelCount, outputVolume_Bytes);
-	#endif
+    aScan_Bytes = aScanLength * sizeof(float);
+    aScanBatch_Bytes = aScanCount * aScan_Bytes;

-	//Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren
-	//if (outputVolume_VoxelCount > 536870912) // 536870912 = 2^32 / sizeof(double)
-	//	std::cout << "outputVolume_Bytes > 2^32 the upper limit of unsigned integer!!!\n => Reconstruction only in 64-Bit Systems";
-
-	//Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben
-	#ifdef debug_OutputVariables
-		printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
-	#endif
-	aScan_Bytes = aScanLength * sizeof(float);
-	aScanBatch_Bytes = aScanCount * aScan_Bytes;
-	#ifdef debug_OutputVariables
-		printf( "aScan_Bytes = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScan_Bytes);
-		printf( "aScanCount = %i\n", aScanCount);
-		printf( "aScanBatch_Bytes = aScanCount * aScan_Bytes ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatch_Bytes);
-	#endif
-
-	#ifdef debug_OutputInfo
-    	printf("\nParameter for Image Reconstruction\n");
-    	printf(  "========================================================================\n");
-    	printf(  "IMAGE_SIZE_XYZ:               [%i x %i x %i]\n", IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z);
-    	printf(  "outputVolume_VoxelCount:       %lld\n", outputVolume_VoxelCount);
-    	printf(  "Increment vector/Resolution:   %f\n", IMAGE_RESOLUTION);
-    	printf(  "IMAGE_STARTPOINT in m:        [%f x %f x %f]\n", regionOfInterestOffset.x, regionOfInterestOffset.y, regionOfInterestOffset.z);
-       	outputVolume_size_m.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
-        outputVolume_size_m.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
-        outputVolume_size_m.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
-    	printf(  "Volume Size in m:             [%f x %f x %f]\n", outputVolume_size_m.x, outputVolume_size_m.y, outputVolume_size_m.z);
-		printf(  "aScanCount:                    %i\n", aScanCount);
-		printf(  "========================================================================\n\n");
-	#endif
-
-
-	#ifdef debug_OutputPerformance
-		struct timeval startProcessAscans, stopProcessAscans;
-		gettimeofday(&startProcessAscans, NULL);
-	#endif
-
-		//perform processing with AScan-Data
-		//===========================================================================================================
-		//===========================================================================================================
-		ullong duration;
-		processAScans(duration);
-		//===========================================================================================================
-		//===========================================================================================================
-
-	#ifdef debug_OutputPerformance
-		diff_time = (double)((stopProcessAscans.tv_sec * 1000000.0 + stopProcessAscans.tv_usec) - (startProcessAscans.tv_sec * 1000000.0 + startProcessAscans.tv_usec));
-		printf ("########################################################################\n");
-		printf ("### GPU (%18s: HW-ID %i, Idx %i) ### Free Memory =                    	%4.0f µs\n", deviceProperties[deviceId].name, deviceId, deviceIndex, diff_time);
-		printf ("########################################################################\n");
-	#endif
-
-	Duration_ptr[(deviceIndex+1)] = (double)duration;	// Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
-
-	#ifdef debug_OutputVariables
-		printf( "  GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
-	#endif
-
-	#ifdef debug_OutputInfo
-		printf("Reset Device\n"); // Reset Device
-	#endif
-	// CUDA_CHECK(cudaDeviceReset()); // news 2019 commented, see above reason.
-
-	#ifdef debug_OutputFunctions
-		printf( "<== SAFTHandler::performReconstruction - End\n");
-	#endif
+    // perform processing with AScan-Data
+    //===========================================================================================================
+    //===========================================================================================================
+    ullong duration;
+    processAScans(duration);
+    //===========================================================================================================
+    //===========================================================================================================

+    Duration_ptr[(deviceIndex + 1)] = (double)duration;  // Für jede GPU einen Laufzeitwert in µs übermitteln, Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
 }

 /**
@@ -402,96 +196,42 @@ void SAFTHandler::performReconstruction()
   - Der SAFT Kernel erwartet Argumente in den die Grid Dimension auf drei Dimensionen reduziert wurde und die Block-Dimensionen auf nur eine Dimension reduziert ist.
   - Das haengt auch von den Eigenschaften der verfuegbaren HW ab (shader model)
 */
-void SAFTHandler::reduceKernelDimensions(
-    dim3 const & gridDimensions, ///< Input grid dimensions.
-    dim3 const & blockDimensions, ///< Input block dimensions.
-    dim3 & reducedGridDimensions, ///< Reduced output grid dimensions.
-    dim3 & reducedBlockDimensions ///< Reduced output block dimensions.
-    )
+void SAFTHandler::reduceKernelDimensions(dim3 const &gridDimensions,   ///< Input grid dimensions.
+                                         dim3 const &blockDimensions,  ///< Input block dimensions.
+                                         dim3 &reducedGridDimensions,  ///< Reduced output grid dimensions.
+                                         dim3 &reducedBlockDimensions  ///< Reduced output block dimensions.
+)
 {
-
-	#ifdef debug_OutputFunctions
-		printf( "==> SAFTHandler::reduceKernelDimensions - Start\n");
-	#endif
-
-
-	if(deviceProperties[deviceId].maxGridSize[2] > 1)
+    if (deviceProperties[deviceId].maxGridSize[2] > 1)
    {
        reducedGridDimensions = gridDimensions;
-		#ifdef debug_OutputParameter
-        	printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
-		#endif
    }
    else
    {
-        reducedGridDimensions = dim3(
-            gridDimensions.x * gridDimensions.y,
-            gridDimensions.z,
-            1
-            );
-		#ifdef debug_OutputParameter
-        	printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
-		#endif
+        reducedGridDimensions = dim3(gridDimensions.x * gridDimensions.y, gridDimensions.z, 1);
    }

    reducedBlockDimensions = dim3(blockDimensions.x * blockDimensions.y * blockDimensions.z);
-	#ifdef debug_OutputParameter
-    	printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z);
-	#endif
-
-	#ifdef debug_OutputFunctions
-		printf( "<== SAFTHandler::reduceKernelDimensions - End\n");
-	#endif
-
 }

-
 /**
   Determine free memory available on the current device.
 */
 std::size_t memoryGPUfree()
 {
-	#ifdef debug_OutputFunctions
-		printf( "==> memoryGPUfree - Start\n");
-	#endif
-
-	std::size_t
-        totalMemory,
-        freeMemory;
+    std::size_t totalMemory, freeMemory;
    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
-
-//		printf("   Total memory %lld Bytes\n", totalMemory);
-//		printf("   Free  memory %lld Bytes\n", freeMemory);
-//		printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
-
-    #ifdef debug_OutputFunctions
-    	printf( "<== memoryGPUfree - End\n");
-	#endif
    return freeMemory;
 }

-
 /**
   Determine free memory available on the current device.
 */
 std::size_t memoryGPUtotal()
 {
-	#ifdef debug_OutputFunctions
-		printf( "==> current - Start\n");
-	#endif
-
-	std::size_t
-        totalMemory,
-        freeMemory;
+    std::size_t totalMemory, freeMemory;
    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));

-//		printf("   Total memory %lld Bytes\n", totalMemory);
-//		printf("   Free  memory %lld Bytes\n", freeMemory);
-//		printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
-
-    #ifdef debug_OutputFunctions
-    	printf( "<== current - End\n");
-	#endif
    return totalMemory;
 }

@@ -501,111 +241,125 @@ std::size_t memoryGPUtotal()
 */
 void memoryCheck()
 {
-	#ifdef debug_OutputFunctions
-		printf( "==> memoryCheck - Start\n");
-	#endif
+#ifdef debug_OutputFunctions
+    printf("==> memoryCheck - Start\n");
+#endif

-	std::size_t
-        totalMemory,
-        freeMemory;
-	float check;
+    std::size_t totalMemory, freeMemory;
+    float check;
    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
+    // totalMemory
+    check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
+    if (totalMemory >= check)
+    {
+        printf("   Total memory %.3f TB\n", totalMemory / check);
+    }
+    else
+    {
+        check /= 1024.0f;
+        if (totalMemory >= check)
+        {
+            printf("   Total memory %.3f GB\n", totalMemory / check);
+        }
+        else
+        {
+            check /= 1024.0f;
+            if (totalMemory >= check)
+            {
+                printf("   Total memory %.3f MB\n", totalMemory / check);
+            }
+            else
+            {
+                check /= 1024.0f;
+                if (totalMemory >= check)
+                {
+                    printf("   Total memory %.3f kB\n", totalMemory / check);
+                }
+                else
+                {
+                    check /= 1024.0f;
+                    if (totalMemory >= check)
+                        printf("   Total memory %.3f Bytes\n", totalMemory / check);
+                }
+            }
+        }
+    }

-    //#if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory)
-//		printf("   Total memory %lld Bytes\n", totalMemory);
-//		printf("   Free  memory %lld Bytes\n", freeMemory);
-//		printf(" => Used memory %lld Bytes\n", (totalMemory-freeMemory));
-
-
-    	// totalMemory
-		check = 1024.0f*1024.0f*1024.0f*1024.0f;
-	    if (totalMemory >= check){
-	    	printf("   Total memory %.3f TB\n", totalMemory/check);
-	    } else {
-			check /= 1024.0f;
-			if (totalMemory >= check){
-				printf("   Total memory %.3f GB\n", totalMemory/check);
-			} else {
-				check /= 1024.0f;
-				if (totalMemory >= check){
-					printf("   Total memory %.3f MB\n", totalMemory/check);
-				} else {
-					check /= 1024.0f;
-					if (totalMemory >= check){
-						printf("   Total memory %.3f kB\n", totalMemory/check);
-					} else {
-						check /= 1024.0f;
-						if (totalMemory >= check)
-							printf("   Total memory %.3f Bytes\n", totalMemory/check);
-					}
-				}
-			}
-	    }
-
-		// freeMemory
-		check = 1024.0f*1024.0f*1024.0f*1024.0f;
-	    if (freeMemory >= check){
-	    	printf("   Free memory %.3f TB\n", freeMemory/check);
-	    } else {
-			check /= 1024.0f;
-			if (freeMemory >= check){
-				printf("   Free memory %.3f GB\n", freeMemory/check);
-			} else {
-				check /= 1024.0f;
-				if (freeMemory >= check){
-					printf("   Free memory %.3f MB\n", freeMemory/check);
-				} else {
-					check /= 1024.0f;
-					if (freeMemory >= check){
-						printf("   Free memory %.3f kB\n", freeMemory/check);
-					} else {
-						check /= 1024.0f;
-						if (freeMemory >= check)
-							printf("   Free memory %.3f Bytes\n", freeMemory/check);
-					}
-				}
-			}
-	    }
-
-		// Used Memory
-		check = 1024.0f*1024.0f*1024.0f*1024.0f;
-	    if ((totalMemory-freeMemory) >= check){
-	    	printf("   Used memory %.3f TB\n", (totalMemory-freeMemory)/check);
-	    } else {
-			check /= 1024.0f;
-			if ((totalMemory-freeMemory) >= check){
-				printf("   Used memory %.3f GB\n", (totalMemory-freeMemory)/check);
-			} else {
-				check /= 1024.0f;
-				if ((totalMemory-freeMemory) >= check){
-					printf("   Used memory %.3f MB\n", (totalMemory-freeMemory)/check);
-				} else {
-					check /= 1024.0f;
-					if ((totalMemory-freeMemory) >= check){
-						printf("   Used memory %.3f kB\n", (totalMemory-freeMemory)/check);
-					} else {
-						check /= 1024.0f;
-						if ((totalMemory-freeMemory) >= check)
-							printf("   Used memory %.3f Bytes\n", (totalMemory-freeMemory)/check);
-					}
-				}
-			}
-	    }
-
-
-	//#endif
-
-    #ifdef debug_OutputFunctions
-    	printf( "<== memoryCheck - End\n");
-	#endif
+    // freeMemory
+    check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
+    if (freeMemory >= check)
+    {
+        printf("   Free memory %.3f TB\n", freeMemory / check);
+    }
+    else
+    {
+        check /= 1024.0f;
+        if (freeMemory >= check)
+        {
+            printf("   Free memory %.3f GB\n", freeMemory / check);
+        }
+        else
+        {
+            check /= 1024.0f;
+            if (freeMemory >= check)
+            {
+                printf("   Free memory %.3f MB\n", freeMemory / check);
+            }
+            else
+            {
+                check /= 1024.0f;
+                if (freeMemory >= check)
+                {
+                    printf("   Free memory %.3f kB\n", freeMemory / check);
+                }
+                else
+                {
+                    check /= 1024.0f;
+                    if (freeMemory >= check)
+                        printf("   Free memory %.3f Bytes\n", freeMemory / check);
+                }
+            }
+        }
+    }

+    // Used Memory
+    check = 1024.0f * 1024.0f * 1024.0f * 1024.0f;
+    if ((totalMemory - freeMemory) >= check)
+    {
+        printf("   Used memory %.3f TB\n", (totalMemory - freeMemory) / check);
+    }
+    else
+    {
+        check /= 1024.0f;
+        if ((totalMemory - freeMemory) >= check)
+        {
+            printf("   Used memory %.3f GB\n", (totalMemory - freeMemory) / check);
+        }
+        else
+        {
+            check /= 1024.0f;
+            if ((totalMemory - freeMemory) >= check)
+            {
+                printf("   Used memory %.3f MB\n", (totalMemory - freeMemory) / check);
+            }
+            else
+            {
+                check /= 1024.0f;
+                if ((totalMemory - freeMemory) >= check)
+                {
+                    printf("   Used memory %.3f kB\n", (totalMemory - freeMemory) / check);
+                }
+                else
+                {
+                    check /= 1024.0f;
+                    if ((totalMemory - freeMemory) >= check)
+                        printf("   Used memory %.3f Bytes\n", (totalMemory - freeMemory) / check);
+                }
+            }
+        }
+    }
 }

-
-
-
-
-
 /**
   Generic CUDA call wrapper.
   Check the result of a CUDA operation and throw an exception if an error occurred.
@@ -614,21 +368,19 @@ void memoryCheck()
   - <20>berpr<70>ft die Ergebnisse einer CUDA Operation und wirft eine Exception wenn ein Fehler auftritt
   - Das wird wird mit einer Kombination mit einem Makro in saft.hpp genutzt.
 */
-//inline	// Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
-void performCUDAResultCheck(
-    cudaError_t result, ///< Result of the CUDA operation.
-    std::string const & file, ///< Path to the source code file.
-    int line ///< Line within the source code
-    )
+// inline	// Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
+void performCUDAResultCheck(cudaError_t result,       ///< Result of the CUDA operation.
+                            std::string const &file,  ///< Path to the source code file.
+                            int line                  ///< Line within the source code
+)
 {
-    if(result != cudaSuccess)
+    if (result != cudaSuccess)
    {
-    	//printf("A CUDA operation failed in file \"%s\" (line %i):  %s \n", file, line, cudaGetErrorString(result).c_str() );
-    	printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );
+        // printf("A CUDA operation failed in file \"%s\" (line %i):  %s \n", file, line, cudaGetErrorString(result).c_str() );
+        printf("%s\n", cudaGetErrorString(cudaGetLastError()));

-    	//std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
-        //std::cout << errorMessage << std::endl;
+        // std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
+        // std::cout << errorMessage << std::endl;
        printf("-> Error occurred");
    }
 }
-
--- a/SAFT_TOFI/src/saft.cu
+++ b/SAFT_TOFI/src/saft.cu
@@ -1,16 +0,0 @@
-#include <iostream>
-
-#include "saft.hpp"
-
-/*!
-	This is the central CUDA file which really just includes the other modules.
-	This is done because CUDA does not support external references for referencing data from other compilation units.
-	- Dies ist das zentrale CUDA-File welches nur die anderen Module einbindet
-	- Das wird gemacht, weil CUDA keine externen Referenzen unterst<73>tzt, um Daten von anderen Compilierungs Einheiten zu referenzieren.
-*/
-
-#include "kernel/constantMemory.cuh"	// Deklaration der Daten, die sich im Constant-Memory befinden Geometriedaten
-#include "kernel/rayTracing.cuh"		// GPU-Code für Bresenham
-#include "kernel/precalculateSpeedOfSoundKernel.cuh"	// GPU-Code Partitionierung für Bresenham. Ruft den Bresenham auf.
-#include "kernel/saftKernel.cuh"		// GPU-Kernel für SAFT
-
--- a/SAFT_TOFI/src/saft.hpp
+++ b/SAFT_TOFI/src/saft.hpp
@@ -1,154 +1,88 @@
 #pragma once

-#include <string>
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
-
 #include <stdint.h>
-#include <stdio.h>      // standard input/output
-#include <vector>       // stl vector header
+#include <stdio.h>  // standard input/output

+#include <string>
+#include <vector>  // stl vector header

-typedef unsigned char  uchar;
+typedef unsigned char uchar;
 typedef unsigned short ushort;
-typedef unsigned long  ulong;
-typedef unsigned long  long ullong;
-
-
-//Define Outputs for Debugmode
-//============================
-	//#define debug_OutputFormat_German					// German Format , instead . for numbers
-	//#define debug_OutputFunctions						// Funktionenaufrufe ausgeben
-	//#define debug_OutputVariables						// Werte der Variablen ausgeben
-	//#define debug_OutputParameter						// Uebersicht der Eingabedaten anzeigen sowie Infobloeke in den einzelnen Schritten
-	//#define debug_OutputMemory						// Speicherverwaltung, Malloc, Free, Groessen
-	//#define debug_OutputMaxMemory						// Gibt aktuellen Speicherverbrauch an, wenn memoryCheck aufgerufen wird
-	//#define debug_OutputInfo							// Gibt Infos zu Schritten, Variablen,... aus
-	//#define debug_OutputPerformance					// Gibt die Laufzeiten und die eizelnen Multi-GPU Performanzwerte von ProcessAscans aus
-	//#define debug_OutputStepsPerformance				// Gibt die Laufzeiten und für die einzelnen Schritte in performCoreReconstruction aus (Copy Ascans, Precalc, PerfCoreReconstruction, copy back)
-	//#define debug_OutputStepsPrecalculation			// Gibt Infos ueber die einzelnen Schritte der Precalculation Steps an
-	//#define debug_OutputHostStepsPerformance			// Gibt die Laufzeiten für die eizelnen Schritte auf dem HOST aus (Preintegrated Ascans)
-	//#define debug_OutputSAFTHandlerThreadPerformance 	// Gibt die Gesamt-Laufzeiten der einzelnen Multi-GPU Threads aus
-	//#define debug_OutputMultiGpu						// Einteilung des Volumens auf mehrerer GPUs ausgeben
-	//#define debug_OutputStreams						// Gibt die Schritte der Berechnung der Streams aus
-	//#define debug_OutputSOSPaths						// Gibt die Schritte und Werte der SOSPfadberechnung aus
-	//#define debug_OutputSOSStepsParameter				// Einteilung der ZLayer in SOSZlayer
-	//#define debug_OutputLookUpGeometryMemoryList		// Debugausgabe fuer die LookUpGeometryMemoryList (Constant Memory)
-	//#define debug_OutputAScanIndexMemoryDivision		// Debugausgabe für die Einteilung in Surfaces da mehrere Surfaces benoetigt werden
-
-	//#define OutputVolume						// Ausgabe des  Volumens
-
-// Debugging CUDA Kernels
-//================================================
-	//#define debug_CudaSAFTKernelModes			// Use variable debugMode for different calulations methods and output
-		//#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation	// Fuer Fehlerberchnungen
-	//#define debug_CudaSAFTKernel
-	//#define debug_CudaPrecalculateKernel
-	//#define debug_CudaPrecalculateAscanIndexKernel		// Kernel function for PrecalculateAscanIndex
-	//#define debug_CudaPrecalculateAscanIndexKernelProxy	// Proxy function for PrecalculateAscanIndex
-	//#define debug_CudaFillCuArrayKernelProxy
-	//#define debug_CudaSAFTAscanIndexKernel				    // Kernel function for SAFTAscanIndex
-	//#define debug_CudaSAFTAscanIndexKernelDataAccess      // Access and sum up values from Ascans
-	//#define debug_CudaRayTraceKernel
-	//#define debug_CudaRayTraceKernelLive
-
-
-	//#define DebugSetMemoryToZero  // Set SOSPathMemory to Zero as Initialisation
-
-
-// Define specific Hardware-Versions
-	#define GTX_590
-	//#define GTX_690
-	//#define GTX_TITAN
-
-	#if defined(GTX_590)
-		#define GTX_Fermi
-	#endif
-	#if defined(GTX_690) || defined(GTX_TITAN)
-		#define GTX_Kepler
-	#endif
+typedef unsigned long ulong;
+typedef unsigned long long ullong;

 // Memory management of GPU and Errordetection
 //================================================
-	//#define SaftNoTexture					// Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date
-	//#define SaftCorrectSumAllAscan	    // Recalculation if too big values occur
-
-
-// SAFT-Implementierung 2-stufig mit AscanIndexInterpolation
-//============================================================
-	#define SaftUseAscanIndexInterpolation
-		#define noGeometryLoading
-		//#define SaftUseAscanIndexInterpolation_PartWise	// Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen
-		#define AscanTextureUse1Float	// Textur mit nur Float1 für SOS berechnen
+// #define SaftNoTexture					// Instead of TextureMemory use Memory access on GPU Device Memory // out-of-date
+// #define SaftCorrectSumAllAscan	    // Recalculation if too big values occur

+// #define SaftUseAscanIndexInterpolation_PartWise	// Kernel mit nur einem Durchlauf durchfuehren, sonst ueber Ascans im Kernel laufen
+#define AscanTextureUse1Float  // Textur mit nur Float1 für SOS berechnen

 // Integration der A-scans im Vornherein durchfuehren um Samplebreite an zu rekonstruierende Aufloesung anzupassen
 //=======================================================================================================================
-	#define preAscanIntegrationToMatchSamplerateToResolution  // Integration der Ascans ueber Fensterbreite durchfuehren
-		//#define debug_preAscanIntegration
-			#define DebugSammleMin  2990		// Grenzen feeru Degbugausgabe der Werte
-			#define DebugSammleMax  3000
-		//#define preAscanIntegrationVersion1Michael	// direkt übernommene Version von Michael
-		#define preAscanIntegrationVersion2Ernst	// korrigierte Variante mit genauerer Fensterbreite
-
+#define preAscanIntegrationToMatchSamplerateToResolution  // Integration der Ascans ueber Fensterbreite durchfuehren
+                                                          // #define debug_preAscanIntegration
+#define DebugSammleMin 2990                               // Grenzen feeru Degbugausgabe der Werte
+#define DebugSammleMax 3000
+// #define preAscanIntegrationVersion1Michael	// direkt übernommene Version von Michael
+#define preAscanIntegrationVersion2Ernst  // korrigierte Variante mit genauerer Fensterbreite

 // Parameter fuer SAFT-Kernel
 //=======================================================================================================================
-	#define USED_ASCANSMEMORYREGIONS  		     1	// Anzahl der A-Scan-Speicherbereiche die alloziert werden
-													// Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen.
-	#define MAX_SUPPORTEDTEXTURES_FORASCANINDEX  4	// Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
-	#define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE  710 // Definiert maximale #Receiver in einem SOSPATH-Textur
-	#define MATLABSAVETY_MB  25 						  // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten
+#define USED_ASCANSMEMORYREGIONS \
+    1                                                // Anzahl der A-Scan-Speicherbereiche die alloziert werden
+                                                     // Hier reicht einer statt zwei! 2 nur nötig wenn A-scans Spückweise mit Streams fuer kopiert werden sollen.
+#define MAX_SUPPORTEDTEXTURES_FORASCANINDEX 4        // Definiert die im Code maximal unterstuetzen Texturen fuer AscanIndex;
+#define MAX_SUPPORTEDRECEIVER_FORSOSPATHTEXTURE 710  // Definiert maximale #Receiver in einem SOSPATH-Textur
+#define MATLABSAVETY_MB 25                           // in MB. Matlab belegt zusaetzlich GPU-Speicher, der bei Grenzfällen zum absturz fuehren kann, daher zur Sicherheit Speicher freihalten

+#define SaftLinearInterpolation  // Lineare Interpolation beim Zugriff auf A-scans durchführen

-	#define SaftLinearInterpolation  // Lineare Interpolation beim Zugriff auf A-scans durchführen
+#define SaftUseConstantMemforGeometry  // Geometriedaten im Constantmemory nutzen

-	#define SaftUseConstantMemforGeometry				// Geometriedaten im Constantmemory nutzen
+// #define SaftTextureForEmRecSosPathsTablesFloat1	// Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
+// #define SaftTextureForEmRecSosPathsTablesFloat2		// Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
+#define SaftTextureForEmRecSosPathsTablesFloat4  // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position

-	//#define SaftTextureForEmRecSosPathsTablesFloat1	// Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
-	//#define SaftTextureForEmRecSosPathsTablesFloat2		// Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
-	#define SaftTextureForEmRecSosPathsTablesFloat4		// Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position
-
-	#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
-		#define SaftTextureForEmRecSosPathsTables		// Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
-	#endif
-
-	// Several SAFT_VARIANTs
-	#define SAFT_VARIANT_AscanPreintegration					0
-	#define SAFT_VARIANT_AscanInterpolation						1
-	#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing	2	// Use interpolation while Preprocessing
-	#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction	3	// Use interpolation while Reconstruction
-	#define SAFT_VARIANT_CalcStandardDeviation					4	// Not yet implemented
-	#define SAFT_VARIANT_SumUpOverBoarderIndices				5	// Not yet implemented
+#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
+#define SaftTextureForEmRecSosPathsTables  // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
+#endif

+// Several SAFT_VARIANTs
+#define SAFT_VARIANT_AscanPreintegration 0
+#define SAFT_VARIANT_AscanInterpolation 1
+#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2   // Use interpolation while Preprocessing
+#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3  // Use interpolation while Reconstruction
+#define SAFT_VARIANT_CalcStandardDeviation 4                  // Not yet implemented
+#define SAFT_VARIANT_SumUpOverBoarderIndices 5                // Not yet implemented

 // Cache <-> shared Memory
-    //#define SaftPreferSharedMem 	// cudaFuncCachePreferShared: shared memory is 48 KB
-    #define SaftPreferL1SharedMem 	// cudaFuncCachePreferL1: shared memory is 16
-	//#define SaftPreferNone			// cudaFuncCachePreferNone: no preference
+// #define SaftPreferSharedMem 	// cudaFuncCachePreferShared: shared memory is 48 KB
+#define SaftPreferL1SharedMem  // cudaFuncCachePreferL1: shared memory is 16
+// #define SaftPreferNone			// cudaFuncCachePreferNone: no preference

 // Receiver Cache mit shared Memory (nur bei kleinen Blockgroeßen)
-    //#define SaftReceiverSharedMemCacheReceiverDistance
-	//#define SaftCacheReceiverSOS
-		//#define SaftReceiverSharedMemCacheReceiverSOS	// Use Shared Memory for Caching
-		//#define SaftRegisterCacheReceiverSOS			// Use Register for Caching
-
+// #define SaftReceiverSharedMemCacheReceiverDistance
+// #define SaftCacheReceiverSOS
+// #define SaftReceiverSharedMemCacheReceiverSOS	// Use Shared Memory for Caching
+// #define SaftRegisterCacheReceiverSOS			// Use Register for Caching

 // Berechnung der mittleren Schallgeschwindigkeit
 //================================================
-	#define SaftUseHarmonicMean       	// harmonic Mean
+#define SaftUseHarmonicMean  // harmonic Mean

-	#define SaftTextureForBresenhamSosPaths	 // Texturmemory für SOS-Volumen nutzen (Version without not full implemented)
-			//#define SaftTextureForBresenhamInterpolated	//iSOS-Version --> wird nun ueber Parameter uebergeben
-			//#define SaftUseFastMath				//FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig.
+#define SaftTextureForBresenhamSosPaths  // Texturmemory für SOS-Volumen nutzen (Version without not full implemented)
+                                         // #define SaftTextureForBresenhamInterpolated	//iSOS-Version --> wird nun ueber Parameter uebergeben
+// #define SaftUseFastMath				//FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig.

-	//#define SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert
-	#define SaftUseSosAttFloat2		// Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-
-	#define SOS_Version2	// korrekte Version mit Definitionen im Mittelpunkt
+// #define SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert
+#define SaftUseSosAttFloat2  // Nutze nur eine Textur fuer beide Volumen (Sos+Att)

+#define SOS_Version2  // korrekte Version mit Definitionen im Mittelpunkt

 // MultiGPU
 //================================================
@@ -163,18 +97,16 @@ typedef unsigned long  long ullong;
 // constant such that 64kB of constant is fully blocked by emitter/receiver combinations
 const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340;

-//Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred.
+// Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred.
 #define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__);

-//Macro used to see when a particular line of code is executed on the host.
+// Macro used to see when a particular line of code is executed on the host.
 #define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl

-
-//Convenient typedefs for containers
+// Convenient typedefs for containers
 typedef std::vector<cudaDeviceProp> DeviceProperties;
 typedef std::vector<dim3> Dimensions;

-
 /**
   Most important class in the application.
   - Haupt-Klasse der Applikation
@@ -183,399 +115,288 @@ typedef std::vector<dim3> Dimensions;
 */
 class SAFTHandler
 {
-public:
-    SAFTHandler(int deviceId,
-    			int deviceIndex,
-    			float *aScan_ptr,    				///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
-    			double *output_ptr,  				///< Zeiger zu den Outputdaten //std::string const & outputPath,
-    			double *Duration_ptr,  				///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
-    			unsigned short *receiver_index_ptr, ///<
-    			unsigned short *emitter_index_ptr,  ///<
-    			float *receiver_list_ptr, 			///<
-    			int receiver_list_Size, 			///<
-    			float *emitter_list_ptr,			///<
-    			int emitter_list_Size, 				///<
-    			float *speed_vec_ptr,
-    			int3 SOSGrid_XYZ,
-    			float3 sosOffset, 					///< Startpoint of SoSGrid
-    			float SOS_RESOLUTION,				///< Aufloesung des SoSGrid
-    			float *att_vec_ptr,    				//att_vec_ptr
+   public:
+    SAFTHandler(int deviceId, int deviceIndex,
+                float *aScan_ptr,                    ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
+                double *output_ptr,                  ///< Zeiger zu den Outputdaten //std::string const & outputPath,
+                double *Duration_ptr,                ///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
+                unsigned short *receiver_index_ptr,  ///<
+                unsigned short *emitter_index_ptr,   ///<
+                float *receiver_list_ptr,            ///<
+                int receiver_list_Size,              ///<
+                float *emitter_list_ptr,             ///<
+                int emitter_list_Size,               ///<
+                float *speed_vec_ptr, int3 SOSGrid_XYZ,
+                float3 sosOffset,      ///< Startpoint of SoSGrid
+                float SOS_RESOLUTION,  ///< Aufloesung des SoSGrid
+                float *att_vec_ptr,    // att_vec_ptr

-    			int aScanCount,
-    			int aScanLength,
-    			int3 IMAGE_SIZE_XYZ,
-    		    float sampleRate,
-    			float3 regionOfInterestOffset,
-    			float IMAGE_RESOLUTION,
-    			dim3 const & fixedBlockDimensions,
-    			float debugMode,
-    			float debugModeParameter,
-    			//bool useFixedPartialOutputWindow,
+                int aScanCount, int aScanLength, int3 IMAGE_SIZE_XYZ, float sampleRate, float3 regionOfInterestOffset, float IMAGE_RESOLUTION, dim3 const &fixedBlockDimensions, float debugMode,
+                float debugModeParameter,
+                // bool useFixedPartialOutputWindow,

-    			bool SOSMode_3DVolume,
-    			bool ATTMode_3DVolume,
+                bool SOSMode_3DVolume, bool ATTMode_3DVolume,

-    			int SAFT_MODE,
-    			int *SAFT_VARIANT,
-    			int SAFT_VARIANT_Size,
+                int SAFT_MODE, int *SAFT_VARIANT, int SAFT_VARIANT_Size,

-    			int *Abort_ptr
-    			);
+                int *Abort_ptr);

    void performReconstruction();

-private:
+   private:
+    int *Abort_ptr;  // Ist ein Fehler aufgetreten der zum Abburch geführt hat
+    // int Abort;

-    int *Abort_ptr;	// Ist ein Fehler aufgetreten der zum Abburch geführt hat
-    //int Abort;
+    bool SOSMode_3DVolume, ATTMode_3DVolume;

-    bool SOSMode_3DVolume,
-    	 ATTMode_3DVolume;
-
-	int SAFT_MODE;
-	int *SAFT_VARIANT;
-	int *deviceSAFT_VARIANT;
-	int SAFT_VARIANT_Size;
+    int SAFT_MODE;
+    int *SAFT_VARIANT;
+    int *deviceSAFT_VARIANT;
+    int SAFT_VARIANT_Size;

    int deviceId;
    int deviceIndex;
-    float  debugMode;
-    float  debugModeParameter;
+    float debugMode;
+    float debugModeParameter;

    DeviceProperties deviceProperties;

-    float
-    	*aScan_ptr;
+    float *aScan_ptr;

-//    float
-//    	*rec_vec_ptr,
-//    	*send_vec_ptr;
+    //    float
+    //    	*rec_vec_ptr,
+    //    	*send_vec_ptr;

    // Zuordnungslisten in der geschaut wird welcher Emitter/Receiver genutzt wird (65535 = nicht genutzt, alles andere ist dann der Index)
-	unsigned short* hostLookUpGeometryMemoryListEmitterPtr;  // Memory of hostLookUpGeometryMemoryListEmitter
-	unsigned short* hostLookUpGeometryMemoryListReceiverPtr; // Memory of hostLookUpGeometryMemoryListReceiver
-	int lookUpGeometryMemoryListEmitterSize; // Size of hostLookUpGeometryMemoryListEmitterPtr
-	int lookUpGeometryMemoryListReceiverSize;// Size of hostLookUpGeometryMemoryListReceiverPtr
+    unsigned short *hostLookUpGeometryMemoryListEmitterPtr;   // Memory of hostLookUpGeometryMemoryListEmitter
+    unsigned short *hostLookUpGeometryMemoryListReceiverPtr;  // Memory of hostLookUpGeometryMemoryListReceiver
+    int lookUpGeometryMemoryListEmitterSize;                  // Size of hostLookUpGeometryMemoryListEmitterPtr
+    int lookUpGeometryMemoryListReceiverSize;                 // Size of hostLookUpGeometryMemoryListReceiverPtr

-    unsigned short
-    	*emitter_index_ptr,
-    	*receiver_index_ptr;
+    unsigned short *emitter_index_ptr, *receiver_index_ptr;

-    float
-    	*emitter_list_ptr,
-    	*receiver_list_ptr;
+    float *emitter_list_ptr, *receiver_list_ptr;

-    int
-    	receiver_list_Size,
-    	emitter_list_Size;
+    int receiver_list_Size, emitter_list_Size;

-    double
-       	*output_ptr;
+    double *output_ptr;

-    double
-    	*Duration_ptr;
+    double *Duration_ptr;

-    float
-    	Sos,
-    	*speed_vec_ptr,
-    	*att_vec_ptr;
+    float Sos, *speed_vec_ptr, *att_vec_ptr;

-    int3
-    	SOSGrid_XYZ;
+    int3 SOSGrid_XYZ;

-	float3
-		sosOffset; 					///< Startpoint of SoSGrid
+    float3 sosOffset;  ///< Startpoint of SoSGrid

-    int
-    	aScanCount,
-    	aScanLength;
+    int aScanCount, aScanLength;

-    int3
-    	IMAGE_SIZE_XYZ;
+    int3 IMAGE_SIZE_XYZ;

-    float3
-    	regionOfInterestOffset; //imageStartpoint; TODO: umbenennen!
+    float3 regionOfInterestOffset;  // imageStartpoint; TODO: umbenennen!

-    float
-    	IMAGE_RESOLUTION,		  ///< Aufl<66>sung im OutputVolumen
-	    IMAGE_RESOLUTION_FACTOR,  ///< 1/Aufl<EFBFBD>sung im OutputVolumen
-	    SOS_RESOLUTION,			  ///< Aufloesung des SoSGrid
-	    SOS_RESOLUTION_FACTOR;	  ///< 1/Aufl<66>sung im SoS-Grid
+    float IMAGE_RESOLUTION,       ///< Aufl<66>sung im OutputVolumen
+        IMAGE_RESOLUTION_FACTOR,  ///< 1/Aufl<EFBFBD>sung im OutputVolumen
+        SOS_RESOLUTION,           ///< Aufloesung des SoSGrid
+        SOS_RESOLUTION_FACTOR;    ///< 1/Aufl<EFBFBD>sung im SoS-Grid

-    std::string
-        emitterGeometryPath,
-        receiverGeometryPath,
-        aScanSamplesPath,
-        outputPath;
+    std::string emitterGeometryPath, receiverGeometryPath, aScanSamplesPath, outputPath;

    float *aScanSamples;
    double *output;
-    //int aScanCount;
+    // int aScanCount;
    int
-    	//aScanSize,
-    	aScan_Bytes,
-    	//batchSize,	// --> aScanCount
-    	//aScanBatchSize;
-    	aScanBatch_Bytes;
+        // aScanSize,
+        aScan_Bytes,
+        // batchSize,	// --> aScanCount
+        // aScanBatchSize;
+        aScanBatch_Bytes;

    float voxelSize;

    float sampleRate;

-    //size_t
+    // size_t
    uint64_t
-    	//regionOfInterestVoxelCount,
-    	outputVolume_VoxelCount,
-        //outputSize;
-   		outputVolume_Bytes;
+        // regionOfInterestVoxelCount,
+        outputVolume_VoxelCount,
+        // outputSize;
+        outputVolume_Bytes;

-    float3 outputVolume_size_m; // ROI-Groesse in meter
+    float3 outputVolume_size_m;  // ROI-Groesse in meter

-    uint64_t
-    	partialOutputZLayerOffset;
+    uint64_t partialOutputZLayerOffset;

-    int
-    	partialOutputZLayerOffsetCount,
-    	partialOutputSoSZLayerCount,
-    	currentZLayerCount,
-    	partialSoSZLayerCount;
+    int partialOutputZLayerOffsetCount, partialOutputSoSZLayerCount, currentZLayerCount, partialSoSZLayerCount;

    // Fuer AscanIndexInterpolation
    // ------------------------------------------------------
-    int 		currentEmIndexUsedForAscanIndexCalculation;
-    float 		*deviceTextureAscanIndexFloat;					// Texture adresses for precalculated AscanIndex data
-    //std::size_t deviceTextureAscanIndexFloatSize;
+    int currentEmIndexUsedForAscanIndexCalculation;
+    float *deviceTextureAscanIndexFloat;  // Texture adresses for precalculated AscanIndex data
+    // std::size_t deviceTextureAscanIndexFloatSize;

-    cudaArray **deviceTextureAscanIndexFloatCuArray;  			// CudaArray for AscanIndex data
-    int 		maxSurfaceTexture3DDimension;		// max Dimension in 3D --> Max size for Texture
-    int 		maxAscanIndexArraysInTexture;		// = maxSurfaceTexture3DDimension/2;
-    int 		TableAscanIndexAllocationCount;		// Anzahl der benoetigten AscanBlocks der Groesse 2048/4096
-    int 		maxSupportedTexturesForAscanIndex;		// Definiert die maximal unterstuetzen Texturen fuer AscanIndex
-    int 		neededAscanBatchCount; 				// Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen
+    cudaArray **deviceTextureAscanIndexFloatCuArray;  // CudaArray for AscanIndex data
+    int maxSurfaceTexture3DDimension;                 // max Dimension in 3D --> Max size for Texture
+    int maxAscanIndexArraysInTexture;                 // = maxSurfaceTexture3DDimension/2;
+    int TableAscanIndexAllocationCount;               // Anzahl der benoetigten AscanBlocks der Groesse 2048/4096
+    int maxSupportedTexturesForAscanIndex;            // Definiert die maximal unterstuetzen Texturen fuer AscanIndex
+    int neededAscanBatchCount;                        // Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen
    // ------------------------------------------------------

    double *currentHostOutputAdress;

    // Pointer of Inputdata in memory of Ascanblock
-    float3
-        *receiver_list,			// LookUpTable receiverNr -> coordinates
-        *emitter_list;				// LookUpTable emitterNr -> coordinates
+    float3 *receiver_list,  // LookUpTable receiverNr -> coordinates
+        *emitter_list;      // LookUpTable emitterNr -> coordinates

-    unsigned short
-    	*receiver_index,			// Input Ascanblockdata: corresponding receiverNr
-    	*emitter_index;			// Input Ascanblockdata: corresponding emitterNr
+    unsigned short *receiver_index,  // Input Ascanblockdata: corresponding receiverNr
+        *emitter_index;              // Input Ascanblockdata: corresponding emitterNr

-    //float
+    // float
    //	*SoSData;					// Input Ascanblockdata: Corresponding SOS value

-    float *speedOfSoundField;		// Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen
-    float *attenuationField;		// Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen
+    float *speedOfSoundField;  // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen
+    float *attenuationField;   // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen

-	#ifdef SaftUseSosAttFloat2
-    	float2 *hostSosAttField;
-	#endif
+    float2 *hostSosAttField;

    // Memorysizes
    int
-    	//speedOfSoundFieldVoxelCount,				//
-    	SOSVolume_VoxelCount,				// Amount of Voxels of SOSVolume
-    	//speedOfSoundFieldBytes,						//
-    	SOSVolume_Bytes,					// Size of SOSVolume in Byte
-    	speedOfSoundEmitterVoxelPathCountByteSize,	// Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen
-    	speedOfSoundEmitterVoxelPathSumByteSize;	// Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel
+        // speedOfSoundFieldVoxelCount,				//
+        SOSVolume_VoxelCount,  // Amount of Voxels of SOSVolume
+        // speedOfSoundFieldBytes,						//
+        SOSVolume_Bytes,                            // Size of SOSVolume in Byte
+        speedOfSoundEmitterVoxelPathCountByteSize,  // Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen
+        speedOfSoundEmitterVoxelPathSumByteSize;    // Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel

-    dim3
-        fixedBlockDimensions,			// kann ws durch genericSAFTBlockDimensions ersetzt
-        genericSAFTBlockDimensions,
-        genericSAFTGridDimensions,
-        windowGridDimensions;
+    dim3 fixedBlockDimensions,  // kann ws durch genericSAFTBlockDimensions ersetzt
+        genericSAFTBlockDimensions, genericSAFTGridDimensions, windowGridDimensions;

-	cudaArray **deviceAScansCuArray;
+    cudaArray **deviceAScansCuArray;

-#ifdef SaftTextureForBresenhamSosPaths
+    cudaArray *deviceSosAttFieldCuArray;

-	#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-		cudaArray *deviceSpeedOfSoundFieldCuArray;			// SOS volume
-		cudaArray *deviceAttenuationFieldCuArray;			// ATT volume
-	#endif
+    int maxSoSReceiverArrayForTexture;
+    int TableVoxelToReceiverPathSosAllocationCount;
+    std::size_t receiver_list_Size_deviceMemory;

-	#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-		cudaArray *deviceSosAttFieldCuArray;
-	#endif
-#endif
+    // Für Emitter ----- normal definieren
+    cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray;  // SoSSum
+    cudaArray *deviceTableVoxelToEmitterPathCountCuArray;   // Count

+    // Für Receiver ----- als Arrays definieren da zwei benoetigt
+    cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray;  // SoSSum
+    cudaArray **deviceTableVoxelToReceiverPathCountCuArray;   // Count

+    cudaArray *deviceTableVoxelToEmPathSosBothCuArray;    // Emitter SoSSum + Count
+    cudaArray **deviceTableVoxelToRecPathSosBothCuArray;  // Receiver SoSSum + Count

-	int maxSoSReceiverArrayForTexture;
-	int TableVoxelToReceiverPathSosAllocationCount;
-	std::size_t receiver_list_Size_deviceMemory;
-
-#ifdef SaftTextureForEmRecSosPathsTables
-	// Für Emitter ----- normal definieren
-	cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray;			//SoSSum
-	cudaArray *deviceTableVoxelToEmitterPathCountCuArray;			//Count
-
-	// Für Receiver ----- als Arrays definieren da zwei benoetigt
-	cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray;		//SoSSum
-	cudaArray **deviceTableVoxelToReceiverPathCountCuArray;		//Count
-#endif
-
-#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
-	cudaArray *deviceTableVoxelToEmPathSosBothCuArray;  			//Emitter SoSSum + Count
-	cudaArray **deviceTableVoxelToRecPathSosBothCuArray;  			//Receiver SoSSum + Count
-#endif
-
-	// Schallgeschwindigkeitskorrektur-Mode
-    float *deviceSpeedOfSoundField;			// Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU
+    // Schallgeschwindigkeitskorrektur-Mode
+    float *deviceSpeedOfSoundField;  // Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU

    // Block-Mode
-    unsigned short *deviceEmitterIndex_block;		// Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU
+    unsigned short *deviceEmitterIndex_block;  // Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU
    unsigned short *deviceReceiverIndex_block;
-    float3 *deviceListEmitterGeometry;			// Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU
+    float3 *deviceListEmitterGeometry;  // Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU
    float3 *deviceListReceiverGeometry;

-    float  *deviceSoSData_block;				// Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU
-
-//    VoxelCountType								// Adressen fuer Speicher der SoS-Pfade auf der GPU
-//        * deviceTableVoxelToEmitterPathCount,
-//        * deviceTableVoxelToReceiverPathCount;
-    float
-    	*deviceTableVoxelToEmitterPathCountFloat,	// Texture adresses for precalculated SOS data
-    	*deviceTableVoxelToReceiverPathCountFloat,
-        *deviceTableVoxelToEmitterPathSosSum,
-        *deviceTableVoxelToReceiverPathSosSum;
+    float *deviceSoSData_block;  // Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU

+    float *deviceTableVoxelToEmitterPathCountFloat,  // Texture adresses for precalculated SOS data
+        *deviceTableVoxelToReceiverPathCountFloat, *deviceTableVoxelToEmitterPathSosSum, *deviceTableVoxelToReceiverPathSosSum;

    bool *deviceValidEmitterReceiverCombinations;

    int *deviceTransducerVectorAnalysisDistributionCounters;

-//    float3
-//        * deviceEmitterGeometry,
-//        * deviceReceiverGeometry;
-
-    int   usedAmountOfEmitter, 	// amount of used emitter
-    	  usedAmountOfReceiver; // amount of used receiver
+    int usedAmountOfEmitter,   // amount of used emitter
+        usedAmountOfReceiver;  // amount of used receiver

    // Output volume
-	double *deviceOutput;
+    double *deviceOutput;

-    //Streams used for synchronisation
-    cudaStream_t
-        copyStream,
-        calculationStream;
+    // Streams used for synchronisation
+    cudaStream_t copyStream, calculationStream;

-    //This variable describes the number of allocations used by the current SAFT mode
-    std::size_t aScanAllocationCount;	// Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.
+    // This variable describes the number of allocations used by the current SAFT mode
+    std::size_t aScanAllocationCount;  // Anzahl der Speicher die alloziert werden, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.

-    int
-        invalidEmitterReceiverCombinationsCount,
-        validEmitterReceiverCombinationsCount;
+    int invalidEmitterReceiverCombinationsCount, validEmitterReceiverCombinationsCount;

    Dimensions validBlockDimensions;
    bool useAutoTuning;
-//    AutoTuningConfiguration autoTuningConfiguration;
+    //    AutoTuningConfiguration autoTuningConfiguration;

-    size_t
-    	partialOutputSize,
-    	partialVolumeSize,				// Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde
-    	partialSosPathSize,				// Speicher(SOSATTPaths)  , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde
-    	partialAscanIndexSize,			// Speicher(AscanIndex)   , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde
-    	maxFeasibleZLayerCount,			// Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt.
-    	maxFeasibleSosZLayerCount;		// Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt.
+    size_t partialOutputSize,
+        partialVolumeSize,          // Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benoetigt wuerde
+        partialSosPathSize,         // Speicher(SOSATTPaths)  , der fuer die entsprechende Anzahl an SoS-Z-Layer benoetigt wuerde
+        partialAscanIndexSize,      // Speicher(AscanIndex)   , der fuer die entsprechende Anzahl an SoS-Z-Layer & Ascans benoetigt wuerde
+        maxFeasibleZLayerCount,     // Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt.
+        maxFeasibleSosZLayerCount;  // Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt.

-    int
-        minimumAutoTuningThreadCount,
-        maximumAutoTuningThreadCount;
+    int minimumAutoTuningThreadCount, maximumAutoTuningThreadCount;

-
-    //New partial reconstruction data
+    // New partial reconstruction data

    std::size_t partialSpeedOfSoundVoxelCount;
    std::size_t partialOutputZLayerCount;
    std::size_t zLayerVoxelCount;
-    std::size_t sosZLayerVoxelCount;           // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp
+    std::size_t sosZLayerVoxelCount;  // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp
    std::size_t partialOutputVoxelCount;

+    double diff_time;           // For Time Measurement
+    float transferRate;         // For DataTransferrate Measurement
+    float performRate;          // For PerformSAFTrate Measurement
+    cudaDeviceProp deviceProp;  // Ausgabe der Frequenz

-	double diff_time;	// For Time Measurement
-	float transferRate; // For DataTransferrate Measurement
-	float performRate;  // For PerformSAFTrate Measurement
-	cudaDeviceProp deviceProp;		// Ausgabe der Frequenz
+    // Core reconstruction

-
-    //Core reconstruction
-
-    void processAScans(ullong & duration);
+    void processAScans(ullong &duration);
    void performCoreReconstruction();

-    //Pre-calculation
+    // Pre-calculation

-    //void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount);  // TODO: Funktion die nicht mehr benutzt wird?
-//    void analysisOfTransducerVectors();
-
-//    void normalisePerformanceStatisticsOutput();
-//    void printTransducerVectorStatistics();
-
-    //Auto-tuning
-    bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions);
+    // Auto-tuning
+    bool determineGridDimensions(dim3 const &blockDimensions, dim3 &gridDimensions);
    void determineValidBlockDimensions();

+    void reduceKernelDimensions(dim3 const &gridDimensions, dim3 const &blockDimensions, dim3 &reducedGridDimensions, dim3 &reducedBlockDimensions);

-    void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions);
-
-    //Pre-calculation kernels
+    // Pre-calculation kernels
    //------------------------------------------------------------------------
-	#ifdef SaftUseConstantMemforGeometry
-		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
-		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
-	#else
-		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput);
-		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput);
-	#endif
+    void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float *deviceVoxelCountOutputFloat, float *deviceSpeedOfSoundSumOutput);

-	#ifdef SaftUseAscanIndexInterpolation
-		void precalculateAscanIndex
-		(
-				int currentSpeedOfSoundZLayer, 						///< First z-layer in the speed of sound grid the pre-calculation is performed for.
-				int maxFeasibleSosZLayerCount 					 	///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
-				//int currentEmIndexUsedForAscanIndexCalculation,		///< current Index of Em for which the AscanIndex is calculated
-				//int emitter_list_Size, 		///< Number of emitter_array got from Matlab
-				//int receiver_list_Size, 	///< Number of receiver_array got from Matlab
-				//float * deviceTextureAscanIndexFloatCuArray 	    ///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
-		);
+    void precalculateAscanIndex(int currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
+                                int maxFeasibleSosZLayerCount   ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
+                                                                // int currentEmIndexUsedForAscanIndexCalculation,		///< current Index of Em for which the AscanIndex is calculated
+                                                                // int emitter_list_Size, 		///< Number of emitter_array got from Matlab
+                                                                // int receiver_list_Size, 	///< Number of receiver_array got from Matlab
+                                                                // float * deviceTextureAscanIndexFloatCuArray 	    ///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
+    );

-		void precalculateAscanIndex_usePaths
-		(
-				int ascanIndex_i,									///< Offset of AscanIndex batch.
-				int aScanWindowSize,								///< Amount of Ascans in AscanIndex batch to process.
-				int currentSpeedOfSoundZLayer, 						///< First z-layer in the speed of sound grid the pre-calculation is performed for.
-				int maxFeasibleSosZLayerCount 					 	///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
-		//		int currentEmIndexUsedForAscanIndexCalculation,		///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all Combinations-should be Calculated
-		//		float * deviceTextureAscanIndexFloatCuArray			///< Out: AscanIndex for the path from Emitter to voxel to Receiver.
-		);
+    void precalculateAscanIndex_usePaths(
+        int ascanIndex_i,               ///< Offset of AscanIndex batch.
+        int aScanWindowSize,            ///< Amount of Ascans in AscanIndex batch to process.
+        int currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
+        int maxFeasibleSosZLayerCount   ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
+                                        //		int currentEmIndexUsedForAscanIndexCalculation,		///< current Index of Em for which the AscanIndex is calculated -> No more necessary due to all
+                                        // Combinations-should be Calculated 		float * deviceTextureAscanIndexFloatCuArray			///< Out: AscanIndex for the path from Emitter to voxel to
+                                        // Receiver.
+    );

-	#endif
-
-	// Initialize AScanIndexSurface
-	void fillCuArray
-		(
-				float useValue,
-				cudaArray **deviceTextureAscanIndexFloatCuArray, ///< CuArray to fill
-				int TableAscanIndexAllocationCount
-		);
-
-    //SAFT Kernel
-	void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray);   //Ascans in CuArray f<>r Texturmemory
+    // Initialize AScanIndexSurface
+    void fillCuArray(float useValue,
+                     cudaArray **deviceTextureAscanIndexFloatCuArray,  ///< CuArray to fill
+                     int TableAscanIndexAllocationCount);

+    // SAFT Kernel
+    void performSAFT(int aScanIndex, size_t aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer,
+                     int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, int currentEmIndexUsedForAscanIndexCalculation, dim3 const &windowGridDimensions, dim3 const &gridDimensions,
+                     dim3 const &blockDimensions, float *deviceSpeedOfSoundField, cudaArray *deviceAScansCuArray);  // Ascans in CuArray f<>r Texturmemory
 };

-//std::string vectorToString(float3 const & vector);
-//std::string voxelToString(dim3 const & voxel);
 extern void memoryCheck();
 extern std::size_t memoryGPUfree();
 extern std::size_t memoryGPUtotal();

-extern void performCUDAResultCheck(cudaError_t result, std::string const & file, int line);
-
-
+extern void performCUDAResultCheck(cudaError_t result, std::string const &file, int line);
Author	SHA1	Message	Date
kradchen	b599153837	fix: fix a miss "{" bug	2024-11-21 10:25:45 +08:00
kradchen	82a2a9e132	feat: refactor & clean cpp code in SAFT_TOFI	2024-11-21 09:49:34 +08:00
kradchen	0e29f139af	feat: refactor & clean kernerl dir in SAFT_TOFI	2024-11-19 16:49:21 +08:00