feat: refactor & clean kernerl dir in SAFT_TOFI

2024-11-19 16:49:21 +08:00
parent d470f12dc1
commit 0e29f139af
10 changed files with 2487 additions and 5499 deletions
--- a/SAFT_TOFI/CMakeLists.txt
+++ b/SAFT_TOFI/CMakeLists.txt
@@ -3,7 +3,9 @@ project(SaftTofi)
 set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
 enable_language(CUDA)
 find_package (OpenMP REQUIRED)
-add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/saft.cu ./src/processAScans.cpp ./src/saft.cpp )
+file(GLOB_RECURSE  cu_files ./src/*.cu)
+file(GLOB_RECURSE  cuh_files ./src/*.cuh)
+add_library(SaftTofi SHARED ./src/SAFT_TOFI.cpp ./src/processAScans.cpp ./src/saft.cpp ${cu_files} ${cuh_files})
 target_include_directories(SaftTofi PRIVATE ../SAFT ./src /usr/local/cuda/include  ) 
 set_target_properties(SaftTofi PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 target_compile_options(SaftTofi PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--- a/SAFT_TOFI/src/kernel/constantMemory.cuh
+++ b/SAFT_TOFI/src/kernel/constantMemory.cuh
@@ -1,33 +0,0 @@
-#include "saft.hpp"
-
-/*!
-	Emitter and receiver geometry held in constant memory, available across all functions in saft.cu because all of it is held in the same compilation unit.
-	- Emitter und Receiver Geometrie werden im Constant Memory gehalten, erreichbar f<>r alle Funktionen in Saft.cu  weil alle von ihnen in der selben Kompilierungs-Einheit gehalten werden.
-*/
-
-#ifdef SaftUseConstantMemforGeometry
-
-	#ifdef SaftUseHarmonicMean
-        
-        __constant__ float3 emitterPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-		__constant__ float3 receiverPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-
-		// __constant__ float3 emitterPOSharmon[157 * 4];
-		// __constant__ float3 receiverPOSharmon[157 * 9];
-
-		float3* constEmitterPtr = &emitterPOSharmon[0];
-		float3* constReceiverPtr = &receiverPOSharmon[0];
-	#endif
-
-
-	// LookUpTable for GeometryList and Memory Position
-    
-    __constant__ unsigned short lookUpGeometryMemoryListEmitter [MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-	__constant__ unsigned short lookUpGeometryMemoryListReceiver[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
-
-	// __constant__ unsigned short lookUpGeometryMemoryListEmitter [157 * 4];
-	// __constant__ unsigned short lookUpGeometryMemoryListReceiver[157 * 9];
-
-	unsigned short* constLookUpGeometryMemoryListEmitterPtr = &lookUpGeometryMemoryListEmitter[0];
-	unsigned short* constLookUpGeometryMemoryListReceiverPtr = &lookUpGeometryMemoryListReceiver[0];
-#endif
--- a/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cu
+++ b/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cu
--- a/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cuh
+++ b/SAFT_TOFI/src/kernel/precalculateSpeedOfSoundKernel.cuh
--- a/SAFT_TOFI/src/kernel/rayTracing.cu
+++ b/SAFT_TOFI/src/kernel/rayTracing.cu
@@ -0,0 +1,3 @@
+#include "rayTracing.cuh"
+texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField;  // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField;       // Schritt 1. Textur anlegen
--- a/SAFT_TOFI/src/kernel/rayTracing.cuh
+++ b/SAFT_TOFI/src/kernel/rayTracing.cuh
@@ -1,676 +1,167 @@
-#include <stdio.h>
 #include "saft.hpp"

-// Structure of File
-//-------------------------------------------------------
-//
-// __device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat 		// Teilpfade  (genutzt)// Bestimme den SOSVoxel der zu einer Position gehoert, mit Nachkommastelle
-//
-// __device__ __forceinline__ void processRayTracedVoxelTexture 				// Für den Fall SaftUseSosAttFloat1 erstmal lassen
-// __device__ __forceinline__ void processRayTracedVoxelTextureSosAtt			// AscanIndex // Addiere lokale SOS&ATT-Werte und # Voxel im Pfad
-//
-// Bresenham
-// __device__ __forceinline__ void performRayTracedSpeedAdditionTexture			// Teilpfade  // Dreidimensionale Version des Bresenham Line Algorithmus im Float-Format
+#define SQR(X) ((X) * (X))

-// -------------------------------------------------------
+extern texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField;  // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
+extern texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField;       // Schritt 1. Textur anlegen

-// printf() is only supported
-// for devices of compute capability 2.0 and above
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
-	#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
-#endif
-
-#define SQR(X) ((X)*(X))
+#define OutputSOSPositionX 20  // Volumen Voxel im SOS-Grid!
+#define OutputSOSPositionY 20
+#define OutputSOSPositionZ 20  // Echte Z-SoS-Layer ohne Offset


-//#ifdef debug_CudaRayTraceKernel
-	#define OutputSOSPositionX  20			// Volumen Voxel im SOS-Grid!
-	#define OutputSOSPositionY  20
-	#define OutputSOSPositionZ  20 		    // Echte Z-SoS-Layer ohne Offset
-
-//	#define ER_PositionX  4				// Emitter Receiver Position im SOS-Grid!
-//	#define ER_PositionY  37
-//	#define ER_PositionZ  9
-//#endif
-
-#ifdef debug_CudaRayTraceKernelLive
-	#define OutputPositionX  250			// Volumen Voxel im Volumen!
-	#define OutputPositionY  250
-	#define OutputPositionZ  0
-
-	#define DebugSosVoxelX  10
-	#define DebugSosVoxelY  10
-	#define DebugSosVoxelZ  10
-
-	#define ER_PositionX  0				// Emitter Receiver Position im SOS-Grid!
-	#define ER_PositionY  32
-	#define ER_PositionZ  0
-#endif
-
-//Textur fuer SoSField anlegen
-
-#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-	texture<float, cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField; // Schritt 1. Textur anlegen
-#endif
-
-#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-	texture<float , cudaTextureType3D, cudaReadModeElementType> texRefSpeedOfSoundField; // Schritt 1. Textur anlegen	//TODO: fuer Float2 fall rausnehmen
-	texture<float2, cudaTextureType3D, cudaReadModeElementType> texRefSosAttField; // Schritt 1. Textur anlegen
-#endif
-
-
-
-/**
-   Determines the voxel within the speed of sound field associated with a position.
-   - Bestimme den Voxel in dem Schallgeschwindigkeitsfeld der zu einer Position gehoert, mit Nachkommastelle
-*/
-__device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat(		// __forceinline__ zwingt den Compiler diesen Code bei jeden Aufruf direkt einzubinden (nicht als Funktion).
-    float3 const & position, 	///< Position within the scanner.
-    float3 & voxel, 				///< This argument is written to. Output voxel.
-    float3 & sosOffset,
-    float & SOS_RESOLUTION_FACTOR
-    )
+__device__ __forceinline__ void determineSpeedOfSoundFieldVoxelFloat(  // __forceinline__ zwingt den Compiler diesen Code bei jeden Aufruf direkt einzubinden (nicht als Funktion).
+    float3 const &position,                                            ///< Position within the scanner.
+    float3 &voxel,                                                     ///< This argument is written to. Output voxel.
+    float3 &sosOffset, float &SOS_RESOLUTION_FACTOR)

 {
-
-	#ifndef SOS_Version2
-		voxel.x = (position.x - sosOffset.x ) * SOS_RESOLUTION_FACTOR + 0.5f;	// SoSVoxel aus Positionsdaten bestimmen
-		voxel.y = (position.y - sosOffset.y ) * SOS_RESOLUTION_FACTOR + 0.5f;
-		voxel.z = (position.z - sosOffset.z ) * SOS_RESOLUTION_FACTOR + 0.5f;
-	#else
-		voxel.x = (position.x - sosOffset.x ) * SOS_RESOLUTION_FACTOR;	// SoSVoxel aus Positionsdaten bestimmen
-		voxel.y = (position.y - sosOffset.y ) * SOS_RESOLUTION_FACTOR;
-		voxel.z = (position.z - sosOffset.z ) * SOS_RESOLUTION_FACTOR;
-	#endif
-
+    voxel.x = (position.x - sosOffset.x) * SOS_RESOLUTION_FACTOR;  // SoSVoxel aus Positionsdaten bestimmen
+    voxel.y = (position.y - sosOffset.y) * SOS_RESOLUTION_FACTOR;
+    voxel.z = (position.z - sosOffset.z) * SOS_RESOLUTION_FACTOR;
 }

-
-
-
 /**
   Process a voxel in the Bresenham algorithm. Float Format
   Accumulate speed of sound samples and keep track of the number of voxels in the path.
   - Verarbeite einen Voxel in dem Bresenham Algorithmus
   - Addiere Schallgeschwindigkeits-Sample
 */
-__device__ __forceinline__ void processRayTracedVoxelTexture(
-	float const * currentVoxelFloat, 			///< Bresenham speed of sound voxel float.
-    int & voxelCount, 				///< This argument is written to. Number of voxels in the path so far.
-    float & totalSpeed, 			///< This argument is written to. Speed of sound sample accumulator.
-    //float const * speedOfSoundField,///< Speed of sound field data containing samples.
-    cudaArray *deviceSpeedOfSoundFieldCuArray,		///< Pointer to cudaArray for SoSFieldData
-    int3 const & SOSGrid_XYZ		///< Size of SOS-Grid in XYZ
-    )
+__device__ __forceinline__ void processRayTracedVoxelTexture(float const *currentVoxelFloat,  ///< Bresenham speed of sound voxel float.
+                                                             int &voxelCount,                 ///< This argument is written to. Number of voxels in the path so far.
+                                                             float &totalSpeed,               ///< This argument is written to. Speed of sound sample accumulator.
+                                                             // float const * speedOfSoundField,///< Speed of sound field data containing samples.
+                                                             cudaArray *deviceSpeedOfSoundFieldCuArray,  ///< Pointer to cudaArray for SoSFieldData
+                                                             int3 const &SOSGrid_XYZ                     ///< Size of SOS-Grid in XYZ
+)

 {
-	#ifdef SaftUseHarmonicMean
-		totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-	#endif
-
-    #ifdef debug_CudaRayTraceKernel
-    		if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-    		{
-    			printf("pSOS currentVoxel [%d %d %d]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-    			//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-    			printf("  totalSpeed         = %2.10f\n", totalSpeed);
-    			//printf("  totalAttenuation   = %2.10f\n", totalAttenuation);
-    			printf("  voxelCount         = %d\n",     voxelCount);
-    		}
-    #endif
+    totalSpeed += 1 / (float)tex3D(texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
 }



-
-///////////////////////////////////////////////////////////////////////////////////////////
-// AscanIndex - Variante
-// kernel version Z.316
-///////////////////////////////////////////////////////////////////////////////////////////
-///////////////////////////////////////////////////////////////////////////////////////////
-/**
-   Process a voxel in the Bresenham algorithm. Float Format
-   Accumulate speed of sound samples and keep track of the number of voxels in the path.
-   - Addiere lokale Schallgeschwindigkeits-Werte und Anzahl besuchter Voxel im Pfad
- */
-// Aufruf:  processRayTracedVoxelTextureSosAtt(currentVoxelFloat, voxelCount, totalSpeed, totalAttenuation, deviceSosAttFieldCuArray, SOSGrid_XYZ);
-
 __device__ __forceinline__ void processRayTracedVoxelTextureSosAtt(
-	float const * currentVoxelFloat, 			///< SOS-coordinates of current Voxel for Bresenham in float[3].
-	float & voxelCount, 							///< This argument is written to. Number of voxels in the path so far. // TODO: not necessary here to sum up because it is already known from Bresenham length
-    float & totalSpeed, 						///< This argument is written to. Sum Up total SOS
-    float & totalAttenuation, 					///< This argument is written to. Sum Up total Attenuation
-    //float const * speedOfSoundField,			///< Speed of sound field data containing samples.
-    cudaArray *deviceSosAttFieldCuArray,		///< Pointer to cudaArray for SoSATTFieldData  	// TODO: not  necessary if access is used with texture memory
-    int3 const & SOSGrid_XYZ					///< Size of SOS-Grid in XYZ					// TODO: only necessary for boundary-check.
-    )
+    float const *currentVoxelFloat,  ///< SOS-coordinates of current Voxel for Bresenham in float[3].
+    float &voxelCount,               ///< This argument is written to. Number of voxels in the path so far. // TODO: not necessary here to sum up because it is already known from Bresenham length
+    float &totalSpeed,               ///< This argument is written to. Sum Up total SOS
+    float &totalAttenuation,         ///< This argument is written to. Sum Up total Attenuation
+    cudaArray *deviceSosAttFieldCuArray,  ///< Pointer to cudaArray for SoSATTFieldData  	// TODO: not  necessary if access is used with texture memory
+    int3 const &SOSGrid_XYZ               ///< Size of SOS-Grid in XYZ					// TODO: only necessary for boundary-check.
+)

 {
-	float2 SosAttValue;
-
-	#ifdef SaftUseHarmonicMean
-		//totalSpeed += 1/speedOfSoundSample;		// harmonisches Mittel
-//		#ifdef SaftTextureForBresenhamInterpolated
-//			#ifndef SOS_Version2
-//				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f );
-//			#else
-//				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f );
-//			#endif
-//		#else
-			#ifndef SOS_Version2
-				totalSpeed += 1/(float)tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2] );
-			#else
-				//totalSpeed       += 1/tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-				//totalAttenuation +=   tex3D( texRefAbsorptionField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-
-
-				#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-					totalSpeed       += 1/tex3D( texRefSpeedOfSoundField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-					//totalAttenuation +=   tex3D( texRefAbsorptionField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-				#endif
-				#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-//					if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-//					{
-//						//printf("pSOSATT pathPoint [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-//						//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-//						printf(".  totalSpeed         = %2.10f\n", totalSpeed);
-//						printf(".  totalAttenuation   = %2.10f\n", totalAttenuation);
-//						printf(".  voxelCount         = %d\n",     voxelCount);
-//					}
-					SosAttValue = tex3D( texRefSosAttField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-					totalSpeed       += 1/SosAttValue.x;
-					totalAttenuation += SosAttValue.y;
-
-
-					//if (SosAttValue.y > 20)
-					//	printf("!!!!!!!!!!!!!!!!! pSOSATT pathPoint [%f %f %f] = %f\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2], SosAttValue.y);
-
-
-//					if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-//					{
-//						//printf("pSOSATT pathPoint [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-//						//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-//						printf("..  totalSpeed         = %2.10f\n", totalSpeed);
-//						printf("..  totalAttenuation   = %2.10f\n", totalAttenuation);
-//						printf("..  voxelCount         = %d\n",     voxelCount);
-//					}
-				#endif
-
-
-			#endif
-//		#endif
-	#endif
-
-
-	#ifdef debug_CudaRayTraceKernel
-		if ((currentVoxelFloat[0] == DebugSosVoxelX) && (currentVoxelFloat[1] == DebugSosVoxelY) && (currentVoxelFloat[2] == DebugSosVoxelZ))
-		{
-			printf("pSOSATT currentVoxel [%f %f %f]\n", currentVoxelFloat[0], currentVoxelFloat[1], currentVoxelFloat[2]);
-			//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-			printf("  totalSpeed         = %2.10f\n", totalSpeed);
-			printf("  totalAttenuation   = %2.10f\n", totalAttenuation);
-			printf("  voxelCount         = %d\n",     voxelCount);
-		}
-	#endif
-
+    float2 SosAttValue;
+    SosAttValue = tex3D(texRefSosAttField, currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
+    totalSpeed += 1 / SosAttValue.x;
+    totalAttenuation += SosAttValue.y;
 }


+__device__ __forceinline__ void performRayTracedSpeedAdditionTexture(float &voxelCount,  ///< This argument is written to. Number of voxels within the path traced.

+                                                                     float &totalSpeed,  ///< This argument is written to. Sum of the speed of sound samples in the path traced.
+                                                                     float &totalAttenuation,

+                                                                     float3 const &point1f,  ///< Vector array describing the Voxelcoordinates of emitters or receivers.
+                                                                     dim3 const &point2,     ///< Vector array describing the Voxelcoordinates of Voxels.

-///////////////////////////////////////////////////////////////////////////////////////////
-// aktuell genutze Variante fuer Teilpfade
-///////////////////////////////////////////////////////////////////////////////////////////
+                                                                     cudaArray *deviceSpeedOfSoundFieldCuArray,  ///< CuArray fuer SOSFieldTextur

-/**
-   - Dreidimensionale Version des Bresenham Line Algorithmus im Float-Format
-*/
-__device__ __forceinline__ void performRayTracedSpeedAdditionTexture(
-	float & voxelCount, 					///< This argument is written to. Number of voxels within the path traced.
-
-#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-	float & totalSpeed, 				///< This argument is written to. Sum of the speed of sound samples in the path traced.
-#endif
-#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-	float & totalSpeed, 				///< This argument is written to. Sum of the speed of sound samples in the path traced.
-	float & totalAttenuation,
-#endif
-
-	float3 const & point1f,				///< Vector array describing the Voxelcoordinates of emitters or receivers.
-	dim3 const & point2,				///< Vector array describing the Voxelcoordinates of Voxels.
-
-#ifndef SaftTextureForBresenhamSosPaths
-	float const * deviceSpeedOfSoundField, 		///< Array of speed of sound samples. Dimensions ordered by speed of indices, commencing with the fastest moving one: 1. x 2. y 3. z
-#else
-	cudaArray *deviceSpeedOfSoundFieldCuArray,	///< CuArray fuer SOSFieldTextur
-#endif
-
-	int3 const & SOSGrid_XYZ,			///< Size of SOS-Grid in XYZ
-	float3 & sosOffset,
-	float & SOS_RESOLUTION,
-	float & IMAGE_RESOLUTION,
-	float3 regionOfInterestOffset,
-
-	#ifdef SaftUseConstantMemforGeometry
-		int geometry 							///< emitters=0 or receivers=1.
-	#else
-		float3 const * geometry 				///< Vector array describing the positions of emitters or receivers.
-	#endif
-	)
+                                                                     int3 const &SOSGrid_XYZ,  ///< Size of SOS-Grid in XYZ
+                                                                     float3 &sosOffset, float &SOS_RESOLUTION, float &IMAGE_RESOLUTION, float3 regionOfInterestOffset,
+                                                                     int geometry  ///< emitters=0 or receivers=1.
+)

 {
+    voxelCount = 0.0f;
+    totalSpeed = 0.0f;
+    totalAttenuation = 0.0f;

-	voxelCount = 0.0f;
-	totalSpeed = 0.0f;
-	totalAttenuation = 0.0f;
+    // Voxel-Koordinaten ebenfalls in float umwandeln
+    float voxel1f[3] = {point1f.x, point1f.y, point1f.z};  // Point1 liegt im float3 Format vor enthaellt schon +0.5

-	// Voxel-Koordinaten ebenfalls in float umwandeln
-	float voxel1f[3]= {point1f.x,point1f.y,point1f.z};    					//Point1 liegt im float3 Format vor enthaellt schon +0.5
+    // float VoxelIncrement = IMAGE_RESOLUTION/SOS_RESOLUTION;
+    float voxel2f[3];

-	//float VoxelIncrement = IMAGE_RESOLUTION/SOS_RESOLUTION;
-	float voxel2f[3];
+    voxel2f[0] = (float)point2.x;  // + 0.5f;
+    voxel2f[1] = (float)point2.y;  // + 0.5f;
+    voxel2f[2] = (float)point2.z;  // + 0.5f;

-	#ifndef SOS_Version2
-		voxel2f[0] = (regionOfInterestOffset.x - sosOffset.x ) / SOS_RESOLUTION + 0.5f;	// Start des Bildes im SOS-Grid aus Positionsdaten bestimmen
-		voxel2f[1] = (regionOfInterestOffset.y - sosOffset.y ) / SOS_RESOLUTION + 0.5f;
-		voxel2f[2] = (regionOfInterestOffset.z - sosOffset.z ) / SOS_RESOLUTION + 0.5f;
-	#endif
+    int greatestDistanceDim = 0;       // Gibt die Richtung(X,Y oder Z) an mit der grueueten Entfernung
+    int slowDim1 = 0;                  // Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
+    int slowDim2 = 0;                  // Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
+    float greatestDistance_XYZ = 0.0;  // Grueuete Distanz in die Richtung mit der grueueten Entfernung
+    int fastDirectionSteps = 0;        // Schritte die gegangen werden.

+    float m_XYZ[3];
+    float pathPoint[3];
+    if ((abs(voxel1f[0] - voxel2f[0]) <= abs(voxel1f[2] - voxel2f[2])))      // X<Z
+        if ((abs(voxel1f[1] - voxel2f[1]) <= abs(voxel1f[0] - voxel2f[0])))  // Y<Z
+        {
+            greatestDistanceDim = 2;  // => Z
+            slowDim1 = 0;
+            slowDim2 = 1;
+        }
+        else if ((abs(voxel1f[2] - voxel2f[2]) <= abs(voxel1f[1] - voxel2f[1])))  // Z<Y
+        {
+            greatestDistanceDim = 1;  // => Y
+            slowDim1 = 0;
+            slowDim2 = 2;
+        }
+        else  // Z=Y
+        {
+            greatestDistanceDim = 2;  // => Z
+            slowDim1 = 0;
+            slowDim2 = 1;
+        }
+    else                                                                     // X>Z
+        if ((abs(voxel1f[1] - voxel2f[1]) <= abs(voxel1f[0] - voxel2f[0])))  // Y<X
+        {
+            greatestDistanceDim = 0;  // => X
+            slowDim1 = 1;
+            slowDim2 = 2;
+        }
+        else  // Y>X
+        {
+            greatestDistanceDim = 1;  // => Y
+            slowDim1 = 0;
+            slowDim2 = 2;
+        }

-		#ifdef debug_CudaRayTraceKernel
-			if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-		//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-			{
-//				printf("-> performRayTracedSpeedAdditionTexture\n");
-//				printf("	point2 (Vol Pos im SoS)  [%d %d %d]\n", point2.x , point2.y , point2.z);
-//				printf("	voxel1 (E/R Pos im SoS)  [%13.6f %13.6f %13.6f]\n", voxel1f[0] , voxel1f[1] , voxel1f[2] );	// Voxel 1 = ER Position
-				printf("	voxel1f                = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-				printf("	voxel1f                = [%3i %3i %3i]\n", (int)voxel1f[0],(int)voxel1f[1],(int)voxel1f[2]);
-//				printf("	regionOfInterestOffset   [%f %f %f]\n", regionOfInterestOffset.x , regionOfInterestOffset.y , regionOfInterestOffset.z );	// Voxel 2 = Volumen Position
-//				printf("	sosOffset                [%f %f %f]\n", sosOffset.x , sosOffset.y , sosOffset.z );
-//				printf("	SOS_RESOLUTION  = %12.8f\n", SOS_RESOLUTION );
-//				printf("	VoxelIncrement  = %12.8f\n", VoxelIncrement );
-//				//printf("   -> voxel2Start (Vol Pos SoS)  [%f %f %f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 Start -> wird noch angepasst
-			}
+    // ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
+    if (voxel1f[greatestDistanceDim] >= voxel2f[greatestDistanceDim])  // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
+    {
+        fastDirectionSteps = floor(voxel1f[greatestDistanceDim] + 0.5f) - floor(voxel2f[greatestDistanceDim] + 0.5f) + 1;
+        pathPoint[greatestDistanceDim] = voxel2f[greatestDistanceDim];
+        pathPoint[slowDim1] = voxel2f[slowDim1];
+        pathPoint[slowDim2] = voxel2f[slowDim2];
+    }
+    else  // voxel2f < voxel1f   Endpukt < Startpkt -> Steigung negativ
+    {
+        fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f) - floor(voxel1f[greatestDistanceDim] + 0.5f) + 1;
+        pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim];
+        pathPoint[slowDim1] = voxel1f[slowDim1];
+        pathPoint[slowDim2] = voxel1f[slowDim2];
+    }

-		#endif
+    // Steigung bestimmen
+    // Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
+    greatestDistance_XYZ = ((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]);  // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
+    m_XYZ[greatestDistanceDim] = 1.0f;                                                                   // Wegen Rundungsfehler bei der Division, die auftreten koennen.
+    // m_XYZ[greatestDistanceDim] = (voxel2f[greatestDistanceDim] - voxel1f[greatestDistanceDim]) / greatestDistance_XYZ;
+    m_XYZ[slowDim1] = (voxel2f[slowDim1] - voxel1f[slowDim1]) / greatestDistance_XYZ;
+    m_XYZ[slowDim2] = (voxel2f[slowDim2] - voxel1f[slowDim2]) / greatestDistance_XYZ;

-		#ifndef SOS_Version2
-			// Herausfinden wo genau der Voxel anfaengt/die Koordinaten des Voxels
-			voxel2f[0] = (float)point2.x + VoxelIncrement - fmod(((float)point2.x-voxel2f[0]), VoxelIncrement);
-			voxel2f[1] = (float)point2.y + VoxelIncrement - fmod(((float)point2.y-voxel2f[1]), VoxelIncrement);
-			voxel2f[2] = (float)point2.z + VoxelIncrement - fmod(((float)point2.z-voxel2f[2]), VoxelIncrement);
-		#else
-			//Vom Mittelpunkt ausgehen
-			voxel2f[0] = (float)point2.x;// + 0.5f;
-			voxel2f[1] = (float)point2.y;// + 0.5f;
-			voxel2f[2] = (float)point2.z;// + 0.5f;
-		#endif
+    int j = 0;
+    for (j = fastDirectionSteps; j > 0; j--)  //(Alle Punkte innerhalb der Schleife berechnen)
+    {
+        processRayTracedVoxelTextureSosAtt(pathPoint, voxelCount, totalSpeed, totalAttenuation, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
+        pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] + m_XYZ[greatestDistanceDim];
+        pathPoint[slowDim1] = pathPoint[slowDim1] + m_XYZ[slowDim1];
+        pathPoint[slowDim2] = pathPoint[slowDim2] + m_XYZ[slowDim2];
+    }

-
-		#ifdef debug_CudaRayTraceKernel
-
-			if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-		//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-			{
-				printf("   -> voxel2Angepasst (Vol Pos SoS)  [%13.6f %13.6f %13.6f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 nach anpassung
-				//printf("-> performRayTracedSpeedAdditionTexture\n");
-//				printf("	voxel1 (E/R Pos im SoS)  [%f %f %f]\n", voxel1f[0] , voxel1f[1] , voxel1f[2] );	// Voxel 1 = ER Position
-//				printf("	voxel2 (Vol Pos im SoS)  [%f %f %f]\n", voxel2f[0] , voxel2f[1] , voxel2f[2] );	// Voxel 2 = Volumen Position
-//				printf("	regionOfInterestOffset   [%f %f %f]\n", regionOfInterestOffset.x , regionOfInterestOffset.y , regionOfInterestOffset.z );	// Voxel 2 = Volumen Position
-//				printf("	sosOffset                [%f %f %f]\n", sosOffset.x , sosOffset.y , sosOffset.z );
-//				printf("	SOS_RESOLUTION  = %f\n", SOS_RESOLUTION );
-//				printf("	VoxelIncrement  = %f\n", VoxelIncrement );
-//				printf("	point2 (Vol Pos im SoS)  [%f %f %f]\n", point2.x , point2.y , point2.z);
-
-			}
-
-		#endif
-
-		int greatestDistanceDim = 0;		// Gibt die Richtung(X,Y oder Z) an mit der grueueten Entfernung
-		int slowDim1 = 0;					// Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
-		int slowDim2 = 0;					// Gibt die Richtung(X,Y oder Z) der langsamen Richtung;
-		float greatestDistance_XYZ = 0.0;	// Grueuete Distanz in die Richtung mit der grueueten Entfernung
-		#ifdef debug_CudaRayTraceKernel
-			bool m_positiv = 0;				// Steigung positiv?
-		#endif
-		int fastDirectionSteps = 0;			// Schritte die gegangen werden.
-
-		#if defined(debug_CudaRayTraceKernel) || (! defined(SOS_Version2))
-			float distance_XYZ[3];
-			float mFastDirectionSteps_XYZ[3];
-			float mDistance_XYZ[3];
-		#endif
-		float m_XYZ[3];
-		float pathPoint[3];
-		#ifdef SOS_Version3
-			float endPoint[3];
-		#endif
-
-		 //  Welcher Abstand in den Dimensionen XYZ ist der groesste? Um Richtung zu bestimmen
-			#ifndef SOS_Version2
-				if ( (abs(voxel1f[0]-voxel2f[0]) <= abs(voxel1f[2]-voxel2f[2])))      		// X<Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<Z
-						greatestDistanceDim = 2;                                           				// => Z
-					else if ( (abs(voxel1f[2]-voxel2f[2]) <= abs(voxel1f[1]-voxel2f[1])))		// Z<Y
-						greatestDistanceDim = 1;                                           				// => Y
-					else                                                    					// Z=Y
-						greatestDistanceDim = 2;                                           				// => Z
-				else                                                      				// X>Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<X
-						greatestDistanceDim = 0;                                           				// => X
-					else                                                    					// Y>X
-						greatestDistanceDim = 1;                                           				// => Y
-			#else
-				if ( (abs(voxel1f[0]-voxel2f[0]) <= abs(voxel1f[2]-voxel2f[2])))      		// X<Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<Z
-					{
-						greatestDistanceDim = 2;                                           				// => Z
-						slowDim1 = 0;
-						slowDim2 = 1;
-					}
-					else if ( (abs(voxel1f[2]-voxel2f[2]) <= abs(voxel1f[1]-voxel2f[1])))		// Z<Y
-					{
-						greatestDistanceDim = 1;                                           				// => Y
-						slowDim1 = 0;
-						slowDim2 = 2;
-					}
-					else                                                    					// Z=Y
-					{
-						greatestDistanceDim = 2;                                           				// => Z
-						slowDim1 = 0;
-						slowDim2 = 1;
-					}
-				else                                                      				// X>Z
-					if ( (abs(voxel1f[1]-voxel2f[1]) <= abs(voxel1f[0]-voxel2f[0])))    		// Y<X
-					{
-						greatestDistanceDim = 0;                                           				// => X
-						slowDim1 = 1;
-						slowDim2 = 2;
-					}
-					else                                                    					// Y>X
-					{
-						greatestDistanceDim = 1;                                           				// => Y
-						slowDim1 = 0;
-						slowDim2 = 2;
-					}
-			#endif
-
-			#ifndef SOS_Version2
-				#ifdef debug_CudaRayTraceKernel
-					greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-
-					distance_XYZ[0] = (voxel2f[0] - voxel1f[0]);    // Abstand in X-Richtung
-					distance_XYZ[1] = (voxel2f[1] - voxel1f[1]);    // Abstand in Y-Richtung
-					distance_XYZ[2] = (voxel2f[2] - voxel1f[2]);    // Abstand in Z-Richtung
-				#endif
-
-				pathPoint[0] = voxel1f[0];	// Voxel1f bzw. Emitter als Startpunkt nutzen
-				pathPoint[1] = voxel1f[1];
-				pathPoint[2] = voxel1f[2];
-
-				#ifdef debug_CudaRayTraceKernel
-
-					if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-					{
-						// Ausgabe der berechneten Werte
-						printf("	\n");
-						printf("	voxel1f                       = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-						printf("	voxel2f                       = [%3f %3f %3f]\n", voxel2f[0],voxel2f[1],voxel2f[2]);
-						printf("	greatestDistanceDim(X0 Y1 Z2) = %i\n", greatestDistanceDim);
-						printf("	greatestDistance_XYZ          = %f\n", greatestDistance_XYZ);
-						printf("	distance_XYZ                  = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-						//printf("	m_positiv                     = %i\n", m_positiv);
-						//printf("	fastDirectionSteps            = %i\n", fastDirectionSteps);	// Fehler wenn negativ!
-						printf("	pathPoint                     = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					}
-
-				#endif
-
-				#ifdef debug_CudaRayTraceKernel
-					// Steigung bestimmen
-					// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-					mDistance_XYZ[0] = distance_XYZ[0] / greatestDistance_XYZ;
-					mDistance_XYZ[1] = distance_XYZ[1] / greatestDistance_XYZ;
-					mDistance_XYZ[2] = distance_XYZ[2] / greatestDistance_XYZ;
-				#endif
-
-				// ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
-				if (voxel2f[greatestDistanceDim] >= pathPoint[greatestDistanceDim])    // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
-				{
-					#ifdef debug_CudaRayTraceKernel
-						m_positiv = 1;
-					#endif
-					fastDirectionSteps = floor(voxel2f[greatestDistanceDim] ) - floor(pathPoint[greatestDistanceDim] ) + 1;
-				}
-				else                                					// voxel2f < voxel1f   Endpukt < Startpkt -> Steigung negativ
-				{
-					#ifdef debug_CudaRayTraceKernel
-						m_positiv = 0;
-					#endif
-					fastDirectionSteps = floor(pathPoint[greatestDistanceDim] ) - floor(voxel2f[greatestDistanceDim] ) + 1;
-				}
-
-				// Steigung bezogen auf die Anzahl der noetigen ganzen Schritte
-				mFastDirectionSteps_XYZ[0] = distance_XYZ[0]/(float)(fastDirectionSteps-1);
-				mFastDirectionSteps_XYZ[1] = distance_XYZ[1]/(float)(fastDirectionSteps-1);
-				mFastDirectionSteps_XYZ[2] = distance_XYZ[2]/(float)(fastDirectionSteps-1);
-
-				m_XYZ[0] = mFastDirectionSteps_XYZ[0] ;  // Steigung bezogen auf die Anzahl der nuetigen ganzen Schritte in jede Richtung nutzen
-				m_XYZ[1] = mFastDirectionSteps_XYZ[1] ;
-				m_XYZ[2] = mFastDirectionSteps_XYZ[2] ;
-			#else
-
-				// ist Steigung negativ? & Bestimmen der fastDirectionSteps vom diskretisierten Startpunkt zum Endpunkt bzw. Voxel
-				if (voxel1f[greatestDistanceDim] >= voxel2f[greatestDistanceDim])    // voxel2f > voxel1f   Endpukt > Startpkt -> Steigung positiv
-				{
-					fastDirectionSteps = floor(voxel1f[greatestDistanceDim] + 0.5f ) - floor(voxel2f[greatestDistanceDim] + 0.5f ) + 1;
-
-					pathPoint[greatestDistanceDim] = voxel2f[greatestDistanceDim];
-					pathPoint[slowDim1] = voxel2f[slowDim1];
-					pathPoint[slowDim2] = voxel2f[slowDim2];
-
-					#ifdef SOS_Version3
-						endPoint[0] = voxel2f[0];
-						endPoint[1] = voxel1f[1];
-						endPoint[2] = voxel1f[2];
-					#endif
-				}
-				else                                					// voxel2f < voxel1f   Endpukt < Startpkt -> Steigung negativ
-				{
-					fastDirectionSteps = floor(voxel2f[greatestDistanceDim] + 0.5f ) - floor(voxel1f[greatestDistanceDim] + 0.5f ) + 1;
-
-					pathPoint[greatestDistanceDim] = voxel1f[greatestDistanceDim];
-					pathPoint[slowDim1] = voxel1f[slowDim1];
-					pathPoint[slowDim2] = voxel1f[slowDim2];
-
-					#ifdef SOS_Version3
-						endPoint[0] = voxel2f[0];
-						endPoint[1] = voxel2f[1];
-						endPoint[2] = voxel2f[2];
-					#endif
-				}
-
-
-				#ifdef debug_CudaRayTraceKernel
-					greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-
-					distance_XYZ[0] = (voxel2f[0] - voxel1f[0]);    // Abstand in X-Richtung
-					distance_XYZ[1] = (voxel2f[1] - voxel1f[1]);    // Abstand in Y-Richtung
-					distance_XYZ[2] = (voxel2f[2] - voxel1f[2]);    // Abstand in Z-Richtung
-
-					if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-					{
-						// Ausgabe der berechneten Werte
-						printf("	\n");
-						printf("	voxel1f                       = [%3f %3f %3f]\n", voxel1f[0],voxel1f[1],voxel1f[2]);
-						printf("	voxel2f                       = [%3f %3f %3f]\n", voxel2f[0],voxel2f[1],voxel2f[2]);
-						printf("	greatestDistanceDim(X0 Y1 Z2) = %i\n", greatestDistanceDim);
-						printf("	greatestDistance_XYZ          = %f\n", greatestDistance_XYZ);
-						printf("	distance_XYZ                  = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-						//printf("	m_positiv                     = %i\n", m_positiv);
-						//printf("	fastDirectionSteps            = %i\n", fastDirectionSteps);	// Fehler wenn negativ!
-						printf("	pathPoint                     = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					}
-
-					// Steigung bestimmen
-					// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-					mDistance_XYZ[0] = distance_XYZ[0] / greatestDistance_XYZ;
-					mDistance_XYZ[1] = distance_XYZ[1] / greatestDistance_XYZ;
-					mDistance_XYZ[2] = distance_XYZ[2] / greatestDistance_XYZ;
-				#endif
-
-				// Steigung bestimmen
-				// Auf Gesamtentfernung bezogene Steigung in jede Richtung (Float)
-				greatestDistance_XYZ=((float)voxel2f[greatestDistanceDim] - (float)voxel1f[greatestDistanceDim]); // Groessten Abstand in XY oder Z-Richtung ermitteln (Float)
-				m_XYZ[greatestDistanceDim] = 1.0f;		// Wegen Rundungsfehler bei der Division, die auftreten koennen.
-				//m_XYZ[greatestDistanceDim] = (voxel2f[greatestDistanceDim] - voxel1f[greatestDistanceDim]) / greatestDistance_XYZ;
-				m_XYZ[slowDim1] = (voxel2f[slowDim1] - voxel1f[slowDim1]) / greatestDistance_XYZ;
-				m_XYZ[slowDim2] = (voxel2f[slowDim2] - voxel1f[slowDim2]) / greatestDistance_XYZ;
-			#endif
-
-
-			#ifdef debug_CudaRayTraceKernel
-
-				if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-				//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-				{
-
-					// Ausgabe der berechneten Werte
-					//printf("	greatestDistance_XYZ    = %f\n", greatestDistance_XYZ);
-					//printf("	distance_XYZ            = [%3f %3f %3f]\n", distance_XYZ[0],distance_XYZ[1],distance_XYZ[2]);
-					//printf("	m_positiv               = %i\n", m_positiv);
-					printf("	fastDirectionSteps -1   = %i\n", (fastDirectionSteps-1));
-					printf("	pathPoint               = [%3f %3f %3f]\n", pathPoint[0],pathPoint[1],pathPoint[2]);
-					printf("	mFastDirectionSteps_XYZ = [%3f %3f %3f]\n", mFastDirectionSteps_XYZ[0],mFastDirectionSteps_XYZ[1],mFastDirectionSteps_XYZ[2]);
-					printf("	m_XYZ                   = [%3f %3f %3f]\n", m_XYZ[0],m_XYZ[1],m_XYZ[2]);
-
-					// Punkte auf Pfad ablaufen
-//					printf("	\n");
-//					printf("	Float Bresenham mit Anfangspkt- und Endpktkorrektur Texture\n");
-//					printf("	=========================================================================================================\n");
-//					printf("	Step    | currentVoxel  |      X      |      Y      |      Z      ||    SoSvalue    |   delta_SoSvalue\n");
-//					printf("	---------------------------------------------------------------------------------------------------------\n");
-				}
-			#endif
-
-
-
-			#ifndef SOS_Version2
-				// Startpunkt (voxel1f) festlegen
-				pathPoint[0] = voxel1f[0];
-				pathPoint[1] = voxel1f[1];
-				pathPoint[2] = voxel1f[2];
-			#endif
-
-			// Innere schleife fuer den Bresenham ohne Endpunkt
-			#ifndef SOS_Version2
-				int j = 1;
-				for  (j=1; j<=(fastDirectionSteps-1); j++) //(Start und Endpunkt werden ausserhalb der Schleife festgelegt)
-			#else
-				#ifndef SOS_Version3
-					int j=0;
-					for  (j=fastDirectionSteps; j>0; j--) //(Alle Punkte innerhalb der Schleife berechnen)
-				#else
-					int j = 1;
-					for  (j=1; j<=(fastDirectionSteps-1); j++) //(Endpunkt wird ausserhalb der Schleife festgelegt)
-				#endif
-			#endif
-				{
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-					#ifndef SaftTextureForBresenhamSosPaths
-						// AuskommentiertprocessRayTracedVoxelFloat(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundField, SOSGrid_XYZ);
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, SOSGrid_XYZ);
-					#else
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-								#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
-									processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-								#endif
-
-								#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
-									processRayTracedVoxelTextureSosAtt(pathPoint, voxelCount, totalSpeed, totalAttenuation, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-
-										#ifdef debug_CudaRayTraceKernel
-											if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhaengig
-											//if ((pathPoint[0] == DebugSosVoxelX) && (pathPoint[1] == DebugSosVoxelY) && (pathPoint[2] == DebugSosVoxelZ) && ((int)voxel1f[0] == ER_PositionX) && ((int)voxel1f[1] == ER_PositionY) && ((int)voxel1f[2] == ER_PositionZ)) // Auch von ER-Position
-											//if ((pathPoint[0] == DebugSosVoxelX) && (pathPoint[1] == DebugSosVoxelY) && (pathPoint[2] == DebugSosVoxelZ)) // Nur von SOS_XYZ-Position
-											//if ( ((int)voxel1f[0] == ER_PositionX) && ((int)voxel1f[1] == ER_PositionY) && ((int)voxel1f[2] == ER_PositionZ) ) // Nur von ER-Position
-											{
-												printf("pSOSATT currentVoxel [%f %f %f] - j(%i)\n", pathPoint[0], pathPoint[1], pathPoint[2], j);
-												//printf("  speedOfSoundSample = %2.10f\n", speedOfSoundSample);
-
-												float2 SosAttValue = tex3D( texRefSosAttField,   pathPoint[0] + 0.5f, pathPoint[1] + 0.5f, pathPoint[2] + 0.5f);
-												//totalSpeed       += SosAttValue.x;
-												//totalAttenuation += SosAttValue.y;
-
-												printf("  totalSpeed         = %2.10f\n", SosAttValue.x, totalSpeed);
-												printf("  totalAttenuation + %2.10f  = %2.10f\n", SosAttValue.y, totalAttenuation);
-												printf("  voxelCount         = %d\n",     voxelCount);
-											}
-										#endif
-
-//											SosAttValue = tex3D( texRefSosAttField,   currentVoxelFloat[0] + 0.5f, currentVoxelFloat[1] + 0.5f, currentVoxelFloat[2] + 0.5f);
-//											totalSpeed       += SosAttValue.x;
-//											totalAttenuation += SosAttValue.y;
-								#endif
-
-					#endif
-
-					#ifdef debug_CudaRayTraceKernel
-						if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhaengig
-						//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-						{
-							//printf("	Step %3i| [%3i %3i %3i] | % 11.6f | % 11.6f | % 11.6f ||  % 13.6f | % 13.6f \n", j, (int)floor(pathPoint[0]), (int)floor(pathPoint[1]), (int)floor(pathPoint[2]), pathPoint[0], pathPoint[1], pathPoint[2], totalSpeed, speedOfSoundField[((int)floor(pathPoint[2]) * SOSGrid_XYZ.y + (int)floor(pathPoint[1])) * SOSGrid_XYZ.x + (int)floor(pathPoint[0])]);
-							printf("	Step %3i| [%3i %3i %3i] | % 17.12f  % 17.12f  % 17.12f ||  % 13.6f | % 13.6f \n", j, (int)floor(pathPoint[0]+0.5f), (int)floor(pathPoint[1]+0.5f), (int)floor(pathPoint[2]+0.5f), pathPoint[0], pathPoint[1], pathPoint[2], voxelCount/totalSpeed, (float)tex3D( texRefSpeedOfSoundField, pathPoint[0] + 0.5f, pathPoint[1] + 0.5f, pathPoint[2] + 0.5f ) );
-						}
-					#endif
-
-					#ifndef SOS_Version2
-						pathPoint[0] = pathPoint[0] + m_XYZ[0];
-						pathPoint[1] = pathPoint[1] + m_XYZ[1];
-						pathPoint[2] = pathPoint[2] + m_XYZ[2];
-					#else
-						pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] + m_XYZ[greatestDistanceDim];
-						pathPoint[slowDim1] = pathPoint[slowDim1] + m_XYZ[slowDim1];
-						pathPoint[slowDim2] = pathPoint[slowDim2] + m_XYZ[slowDim2];
-					#endif
-				}
-
-			#if (!defined (SOS_Version2)) || defined(SOS_Version3)
-	    	    // Endpunkt
-
-					#ifndef SaftTextureForBresenhamSosPaths
-					// AuskommentiertprocessRayTracedVoxelFloat(endPoint, voxelCount, totalSpeed, deviceSpeedOfSoundField, SOSGrid_XYZ);
-						//processRayTracedVoxelTexture(pathPoint, voxelCount, totalSpeed, speedOfSoundField, SOSGrid_XYZ);
-					#else
-						processRayTracedVoxelTexture(endPoint, voxelCount, totalSpeed, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ);
-						// TODO: Wieder einen ganzen Schritt entfernen und nur Distanz des Voxels dazufuegen --> dazu erst voxelCount auf float umstellen
-						voxelCount--;
-						pathPoint[greatestDistanceDim] = pathPoint[greatestDistanceDim] - m_XYZ[greatestDistanceDim];
-						pathPoint[slowDim1] = pathPoint[slowDim1] - m_XYZ[slowDim1];
-						pathPoint[slowDim2] = pathPoint[slowDim2] - m_XYZ[slowDim2];
-						voxelCount += sqrtf( SQR(pathPoint[1]-endPoint[1]) + SQR(pathPoint[2]-endPoint[2]) + SQR(pathPoint[3]-endPoint[3]) );
-					#endif
-
-					#ifdef debug_CudaRayTraceKernel
-						if ((point2.x == OutputSOSPositionX) && (point2.y == OutputSOSPositionY) && (point2.z == OutputSOSPositionZ)) // Nur von VolumeOutput abhuengig
-							//    	if ((point2.x == OutputPositionX) && (point2.y == OutputPositionY) && (point2.z == OutputPositionZ) && (voxel1.x == ER_PositionX) && (voxel1.y == ER_PositionY) && (voxel1.z == ER_PositionZ)) // Auch von ER-Position
-						{
-							//printf("	Step %3i| [%3i %3i %3i] | % 11.6f | % 11.6f | % 11.6f  ||  % 13.6f | % 13.6f \n", j, (int)floor(voxel2f[0]), (int)floor(voxel2f[1]), (int)floor(voxel2f[2]), voxel2f[0], voxel2f[1], voxel2f[2], totalSpeed, speedOfSoundField[((int)floor(voxel2f[2]) * SOSGrid_XYZ.y + (int)floor(voxel2f[1])) * SOSGrid_XYZ.x + (int)floor(voxel2f[0])]);
-							printf("endPkt	Step %3i| [%3i %3i %3i] | % 17.12f  % 17.12f  % 17.12f ||  % 13.6f | % 13.6f \n", j, (int)floor(endPoint[0]+0.5f), (int)floor(endPoint[1]+0.5f), (int)floor(endPoint[2]+0.5f), endPoint[0], endPoint[1], endPoint[2], voxelCount/totalSpeed, (float)tex3D( texRefSpeedOfSoundField, endPoint[0] + 0.5f, endPoint[1] + 0.5f, endPoint[2] + 0.5f ) );
-	//						printf("<- performRayTracedSpeedAdditionTexture\n");
-						}
-					#endif
-			#else
-						// keinen Endpunkt extra dazurechnen.
-			#endif
-
-			// Anzahl der Besuchten Voxel liegt von Anfang an fest, daher nicht noetig einzeln aufzuaddieren!
-			voxelCount=fastDirectionSteps;
+    // Anzahl der Besuchten Voxel liegt von Anfang an fest, daher nicht noetig einzeln aufzuaddieren!
+    voxelCount = fastDirectionSteps;
 }
--- a/SAFT_TOFI/src/kernel/saftKernel.cu
+++ b/SAFT_TOFI/src/kernel/saftKernel.cu
@@ -0,0 +1,561 @@
+#include "saftKernel.cuh"
+
+texture<float, cudaTextureType2D, cudaReadModeElementType> texRefAscans;  // Schritt 1. Textur anlegen
+
+// Texture for loading AscanIndexes without ATT-Correction (float)
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_0;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_1;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_2;
+texture<float, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat1_3;
+
+// Texture for loading AscanIndexes with ATT-Correction (float2)
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_0;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_1;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_2;
+texture<float2, cudaTextureType3D, cudaReadModeElementType> texTableAscanIndexFloat2_3;
+
+__global__ void saftKernelAscanIndex_SOS_ATT(  // Version SoSATT-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset,    ///<
+    int const speedOfSoundZLayer,  ///<
+    dim3 const gridDimensions,     ///<
+    dim3 const blockDimensions,    ///<
+    float const debugMode,           ///<
+    float const debugModeParameter,  ///<
+    int const *deviceSAFT_VARIANT,   ///<
+    double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    unsigned long long int blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    unsigned long long int memoryIndex =
+        (((unsigned long long int)IMAGE_SIZE_XYZ.y * ((unsigned long long int)regionOfInterestVoxel.z - (unsigned long long int)blockIndexOffset) + (unsigned long long int)regionOfInterestVoxel.y) *
+             (unsigned long long int)IMAGE_SIZE_XYZ.x +
+         (unsigned long long int)regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+
+    float voxelValue = 0.0;  // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    __syncthreads();
+
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues;
+
+        // #pragma unroll 3
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            // 1ten Teil mit selben Index laden
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y *
+                          tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, (float)(ascanIndexBatchOffset) + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexAttValues = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues.y *
+                          tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues.x - 0.5f, (float)(ascanIndexBatchOffset) + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+
+        float2 currentSOSVoxel_AscanIndexAttValues_0;
+        float2 currentSOSVoxel_AscanIndexAttValues_1;
+        float2 currentSOSVoxel_AscanIndexAttValues_2;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexAttValues_0 = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_1 = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_2 = tex3D(texTableAscanIndexFloat2_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_0.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_0.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_1.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_1.x - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_2.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_2.x - 0.5f, Offset_2 + ascanIndex_i);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float2 currentSOSVoxel_AscanIndexAttValues_0;
+        float2 currentSOSVoxel_AscanIndexAttValues_1;
+        float2 currentSOSVoxel_AscanIndexAttValues_2;
+        float2 currentSOSVoxel_AscanIndexAttValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // syncthreads();
+            //  load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexAttValues_0 = tex3D(texTableAscanIndexFloat2_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_1 = tex3D(texTableAscanIndexFloat2_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_2 = tex3D(texTableAscanIndexFloat2_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexAttValues_3 = tex3D(texTableAscanIndexFloat2_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            __syncthreads();
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_0.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_0.x - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_1.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_1.x - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_2.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_2.x - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += currentSOSVoxel_AscanIndexAttValues_3.y * tex2D(texRefAscans, currentSOSVoxel_AscanIndexAttValues_3.x - 0.5f, Offset_3 + ascanIndex_i);
+        }
+    }
+
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
+
+__global__ void saftKernelAscanIndex_SOS(  // Version SoS-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset, int const speedOfSoundZLayer, dim3 const gridDimensions, dim3 const blockDimensions, double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    long blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    long memoryIndex = ((IMAGE_SIZE_XYZ.y * (regionOfInterestVoxel.z - blockIndexOffset) + regionOfInterestVoxel.y) * IMAGE_SIZE_XYZ.x + regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+    float voxelValue = 0.0;                                                           // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    float currentSOSVoxel_AscanIndexValues;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 3. SAFT-Algorithmus
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //  - Index aus Textur lesen und fuer zugriff auf A-Scan nutzen
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Vorgehen Ascanindexvariante
+    // 1. Bestimme Koordinaten in SOS-Koordinaten für festen Emitter und 1413 Receiver
+    //		Index = X+Xmax*Y+Xmax*Ymax*(Zmax*RecNr+Z).
+    //		Xmax = 128
+    //		Ymax = 128
+    //		Zmax = 2
+    //		=> X = x
+    //		=> Y = y
+    //		=> Z = Zmax*RecNr+z
+    // 2. Lade an dieser Stelle den Interpolierten Index.
+    //		2.a Über Texturmemory
+    //		2.b über alle 8 benachbarten Voxel oder 64 bei tricubic-Interpolation in 3D
+    // 3. Lade Ascan-Sample an diesem Index und summiere auf
+
+    __syncthreads();
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0f;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+    // float VoxelAscanIndex2 = 0.0f;
+    // float voxelValue2 = 0.0f;
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < 1; ascanIndex_i+=1.0f)	// bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // nutze immer nur 1tes Surface
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan a
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+
+            // 3ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 2 * maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float currentSOSVoxel_AscanIndexValues_0;
+        float currentSOSVoxel_AscanIndexValues_1;
+        float currentSOSVoxel_AscanIndexValues_2;
+        float currentSOSVoxel_AscanIndexValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 4
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            // syncthreads();
+            //  load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexValues_0 = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_1 = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_2 = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_3 = tex3D(texTableAscanIndexFloat1_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_0 - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_1 - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_2 - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_3 - 0.5f, Offset_3 + ascanIndex_i);
+
+            // if (((int)ascanIndex_i & 31) == 0) __syncthreads();
+        }
+    }
+    else
+    {
+        // Do nothing due to only 4 are defined
+    }
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
+
+__global__ void saftKernelAscanIndex(  // Version ohne-Korrektur
+
+    int const ascanIndexBatchOffset,           ///< Offset of AScans if more then one AScan-Batch is used for Reconstruction
+    float const aScanWindowSize,               ///< Amount of AScans used here for Reconstruction
+    int const maxAscanIndexArraysInTexture,    ///< Maximum amount in A-Scans in one Texture
+    int const TableAscanIndexAllocationCount,  ///< Used amount of Textures (currently limited to maximum 4 Textures)
+
+    int3 const IMAGE_SIZE_XYZ,           ///< XYZ des Outputvolumens
+    float3 const SosVoxelStartPosition,  ///< Offset of  SOS-Grids
+    float const IMAGE_RESOLUTION,        ///< Resolution of Output-Volume
+    float const VoxelIncrement,          ///< Step-Width of one Voxel in SOS-Korrdinates
+
+    int const blockIndexOffset, int const speedOfSoundZLayer, dim3 const gridDimensions, dim3 const blockDimensions, double *output
+
+)
+{
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 1. Determine which Voxel is to be calculated in this Kernel
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    dim3 blockVoxel(  // Which Block(xyz) corresponds to this Thread?
+        (threadIdx.x / blockDimensions.y) % blockDimensions.x, threadIdx.x % blockDimensions.y, threadIdx.x / (blockDimensions.x * blockDimensions.y));
+
+    // Index of Block for this Thread
+    long blockIndex = ((blockIndexOffset + blockIdx.z) * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
+
+    dim3 regionOfInterestVoxel(                                                                   // Which Voxel corresponds to this Thread? Start with 0.
+        ((blockIndex / gridDimensions.y) % gridDimensions.x) * blockDimensions.x + blockVoxel.x,  // Medium speed index
+        (blockIndex % gridDimensions.y) * blockDimensions.y + blockVoxel.y,                       // Fastest index
+        (blockIndex / (gridDimensions.x * gridDimensions.y)) * blockDimensions.z + blockVoxel.z   // Slowest index
+    );
+
+    // If Voxel is outside the reconstructed Image leave Kernel
+    if ((regionOfInterestVoxel.x >= IMAGE_SIZE_XYZ.x) || (regionOfInterestVoxel.y >= IMAGE_SIZE_XYZ.y) || (regionOfInterestVoxel.z >= IMAGE_SIZE_XYZ.z))
+        return;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 2. Determine
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //    - SOSvoxel in which voxel is located
+    //    - Index of OutputVolume, and tables of Emitter, Receiver coordinates and SOSpaths
+    //    - Variable declarations
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Memory-Index for this Thread for Output-Array of this Voxel
+    long memoryIndex = ((IMAGE_SIZE_XYZ.y * (regionOfInterestVoxel.z - blockIndexOffset) + regionOfInterestVoxel.y) * IMAGE_SIZE_XYZ.x + regionOfInterestVoxel.x);
+    float3 SosVoxelf;  // SoS-Voxel Koordinates in float
+    // Determine SOS-Voxel-Position
+    SosVoxelf.x = (SosVoxelStartPosition.x + (VoxelIncrement * regionOfInterestVoxel.x));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.y = (SosVoxelStartPosition.y + (VoxelIncrement * regionOfInterestVoxel.y));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    SosVoxelf.z = (SosVoxelStartPosition.z + (VoxelIncrement * regionOfInterestVoxel.z));  // Hier Addition der SOSVoxel im SoS-Grid durchfuehren
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // TexturIndex for access on Texturmemory depending of Voxel
+    float TexturIndexZ_AscanIndex = 0.0f;           // Z-Index for access on Texturmemory
+    float const TexturIndexX = SosVoxelf.x + 0.5f;  // Due to Access over Texturmemory +0.5f.
+    float const TexturIndexY = SosVoxelf.y + 0.5f;
+    float const SosVoxelTextureZ = (SosVoxelf.z - (float)speedOfSoundZLayer) + 0.5f;  // Z offset inside precalculated SOS paths
+    float voxelValue = 0.0;                                                           // reflection value, which is summed up in Ascan-Loop = Outputvalue
+
+    float currentSOSVoxel_AscanIndexValues;
+
+    /////////////////////////////////////////////////////////////////////////////////////////
+    // 3. SAFT-Algorithmus
+    /////////////////////////////////////////////////////////////////////////////////////////
+    //  - Index aus Textur lesen und fuer zugriff auf A-Scan nutzen
+    /////////////////////////////////////////////////////////////////////////////////////////
+
+    // Vorgehen Ascanindexvariante
+    // 1. Bestimme Koordinaten in SOS-Koordinaten für festen Emitter und 1413 Receiver
+    //		Index = X+Xmax*Y+Xmax*Ymax*(Zmax*RecNr+Z).
+    //		Xmax = 128
+    //		Ymax = 128
+    //		Zmax = 2
+    //		=> X = x
+    //		=> Y = y
+    //		=> Z = Zmax*RecNr+z
+    // 2. Lade an dieser Stelle den Interpolierten Index.
+    //		2.a Über Texturmemory
+    //		2.b über alle 8 benachbarten Voxel oder 64 bei tricubic-Interpolation in 3D
+    // 3. Lade Ascan-Sample an diesem Index und summiere auf
+
+    __syncthreads();
+    if (ascanIndexBatchOffset == 0)  // Initialisierung beim ersten Kernelaufruf sprich ascanIndexBatchOffset == 0
+    {
+        voxelValue = 0.0f;
+    }
+    else
+    {
+        voxelValue = (float)output[memoryIndex];  // Alle anderen Kernelaufrufe muessen zuerst den alten Wert laden, ist bei mehreren Durchlaeufen noetig
+    }
+
+    __syncthreads();
+    // float VoxelAscanIndex2 = 0.0f;
+    // float voxelValue2 = 0.0f;
+
+    if (TableAscanIndexAllocationCount == 1)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // nutze immer nur 1tes Surface
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 2)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+    else if (TableAscanIndexAllocationCount == 3)
+    {
+        // #pragma unroll 3
+        // for(float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i+=1.0f)	// 1.Teil Ascans durchlaufen	//  GV/s	  //2: GV/s, 4: GV/s, 8: GV/s, 16: GV/s
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.6GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0 * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+
+            // 1ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 0.5f);  // i gibt Index fuer Ascan an
+
+            // 2ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + maxAscanIndexArraysInTexture + 0.5f);
+
+            // 3ten Teil mit selben Index laden
+            currentSOSVoxel_AscanIndexValues = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues - 0.5f, ascanIndexBatchOffset + ascanIndex_i + 2 * maxAscanIndexArraysInTexture + 0.5f);
+        }
+    }
+
+    else if (TableAscanIndexAllocationCount == 4)
+    {
+        float currentSOSVoxel_AscanIndexValues_0;
+        float currentSOSVoxel_AscanIndexValues_1;
+        float currentSOSVoxel_AscanIndexValues_2;
+        float currentSOSVoxel_AscanIndexValues_3;
+
+        float Offset_0 = (float)ascanIndexBatchOffset + 0.5f;
+        float Offset_1 = (float)ascanIndexBatchOffset + maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_2 = (float)ascanIndexBatchOffset + 2.0f * maxAscanIndexArraysInTexture + 0.5f;
+        float Offset_3 = (float)ascanIndexBatchOffset + 3.0f * maxAscanIndexArraysInTexture + 0.5f;
+
+        // #pragma unroll 2
+        for (float ascanIndex_i = 0.0f; ascanIndex_i < maxAscanIndexArraysInTexture; ascanIndex_i += 1.0f)  // bis zu 60.8GV/s
+        {
+            TexturIndexZ_AscanIndex = 2.0f * ascanIndex_i + SosVoxelTextureZ;  // Z-Index fuer  Zugriff auf Textur interpoliert
+            //__syncthreads();
+            // load TOF-Index from Textur 0-3
+            currentSOSVoxel_AscanIndexValues_0 = tex3D(texTableAscanIndexFloat1_0, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_1 = tex3D(texTableAscanIndexFloat1_1, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_2 = tex3D(texTableAscanIndexFloat1_2, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+            currentSOSVoxel_AscanIndexValues_3 = tex3D(texTableAscanIndexFloat1_3, TexturIndexX, TexturIndexY, TexturIndexZ_AscanIndex);
+
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_0 - 0.5f, Offset_0 + ascanIndex_i);  // i gibt Index fuer Ascan an
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_1 - 0.5f, Offset_1 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_2 - 0.5f, Offset_2 + ascanIndex_i);
+            voxelValue += tex2D(texRefAscans, currentSOSVoxel_AscanIndexValues_3 - 0.5f, Offset_3 + ascanIndex_i);
+        }
+    }
+    __syncthreads();
+    output[memoryIndex] = (double)voxelValue;
+}
--- a/SAFT_TOFI/src/kernel/saftKernel.cuh
+++ b/SAFT_TOFI/src/kernel/saftKernel.cuh
--- a/SAFT_TOFI/src/kernel/saftPrivate.cu
+++ b/SAFT_TOFI/src/kernel/saftPrivate.cu
@@ -0,0 +1,624 @@
+#include "precalculateSpeedOfSoundKernel.cuh"
+#include "rayTracing.cuh"
+#include "saftKernel.cuh"
+#include "saft.hpp"
+
+void SAFTHandler::precalculateAscanIndex_usePaths(int ascanIndex_i, int aScanWindowSize, int currentSpeedOfSoundZLayer, int maxFeasibleSosZLayerCount)
+{
+    cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 32, 32,
+                                                                                             cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal anlegen und
+                                                                                                                           // beschreiben - Float4
+                                                                                                                           // Both Emitter Path Tables
+    // --------------------------------------------------------
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[1] = cudaAddressModeClamp;
+    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[2] = cudaAddressModeClamp;
+    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+    {
+    case 0:
+        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModePoint;
+        break;
+    case 1:
+        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModeLinear;
+        break;
+    }
+    texTableVoxelToEmitterPathSosBoth_preprocess.normalized = 0;
+    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToEmitterPathSosBoth_preprocess, deviceTableVoxelToEmPathSosBothCuArray, &texChannelDescTableVoxelToEmRecPathSosBoth));
+
+    // Texturmemory fuer Receiver - SosPathsTables
+    // ===================================================================================================================
+    // Both Receiver Path Tables
+    // ------------------------------------------------------
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[1] = cudaAddressModeClamp;
+    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[2] = cudaAddressModeClamp;
+    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+    {
+    case 0:
+        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModePoint;
+        break;
+    case 1:
+        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModeLinear;
+        break;
+    }
+    texTableVoxelToReceiverPathSosBoth0_preprocess.normalized = 0;
+
+    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth0_preprocess, deviceTableVoxelToRecPathSosBothCuArray[0], &texChannelDescTableVoxelToEmRecPathSosBoth));
+
+    if (TableVoxelToReceiverPathSosAllocationCount > 1)
+    {                                                                                          // TODO: mit Arrays flexibel programmieren, wenn moeglich!!!
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[1] = cudaAddressModeClamp;
+        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableVoxelToReceiverPathSosBoth1_preprocess.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth1_preprocess, deviceTableVoxelToRecPathSosBothCuArray[1], &texChannelDescTableVoxelToEmRecPathSosBoth));
+    }
+
+    if (TableVoxelToReceiverPathSosAllocationCount > 2)
+    {
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[1] = cudaAddressModeClamp;
+        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableVoxelToReceiverPathSosBoth2_preprocess.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth2_preprocess, deviceTableVoxelToRecPathSosBothCuArray[2], &texChannelDescTableVoxelToEmRecPathSosBoth));
+    }
+
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1, 1);
+    dim3 blocksPerGrid(1, 1, 1);  
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = maxFeasibleSosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    // Step 2. Bereite Output-Textur fuer AscanIndex vor
+
+    if (TableAscanIndexAllocationCount > 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
+    }
+    if (TableAscanIndexAllocationCount > 1)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
+    }
+    if (TableAscanIndexAllocationCount > 2)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
+    }
+    if (TableAscanIndexAllocationCount > 3)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
+    }
+
+    // Step 3. Fuehre Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden
+    // immer 1024/2048 A-Scans durchlaufen und in AscanIndex-Textur geschrieben
+
+    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
+    {  // ====================================================
+       // Blockmode with SOS-value per Ascan
+
+        precalculateAscanIndex_usePathsKernel<<<blocksPerGrid, threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                                  aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                                    // vorberechnet werden können
+                                                                                  deviceSosAttFieldCuArray,
+                                                                                  currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                              ///< grid the pre-calculation is performed
+                                                                                                              ///< for.
+                                                                                  maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                              ///< sound grid the pre-calculation is
+                                                                                                              ///< performed for.
+                                                                                  // currentEmIndexUsedForAscanIndexCalculation,	///< current Index of Em
+                                                                                  // for which the AscanIndex is calculated
+
+                                                                                  maxSoSReceiverArrayForTexture,
+
+                                                                                  deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
+                                                                                  deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten
+
+                                                                                  TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
+                                                                                                                   ///< der Groesse 2048/4096
+                                                                                  maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
+                                                                                                                   ///< CUDA Array (fest definiert fuer
+                                                                                                                   ///< bestimmung welche Textur genutzt
+                                                                                                                   ///< wird)
+
+                                                                                  deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                                        ///< the path from transducer to
+                                                                                                                        ///< voxel.
+
+                                                                                  SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                                  deviceSAFT_VARIANT);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
+    {  // ====================================================
+       // 3DVolume Mode with SOS-Correction no ATT-Correction
+
+        precalculateAscanIndex_usePathsKernel_SOS<<<blocksPerGrid,
+                                                    threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                       aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                         // vorberechnet werden können
+                                                                       deviceSosAttFieldCuArray,
+                                                                       currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                   ///< grid the pre-calculation is performed
+                                                                                                   ///< for.
+                                                                       maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                   ///< sound grid the pre-calculation is
+                                                                                                   ///< performed for.
+                                                                       // currentEmIndexUsedForAscanIndexCalculation,		///< current
+                                                                       // Index of Em for which the AscanIndex is calculated
+
+                                                                       maxSoSReceiverArrayForTexture,
+
+                                                                       deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
+                                                                       deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten
+
+                                                                       TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
+                                                                                                        ///< der Groesse 2048/4096
+                                                                       maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
+                                                                                                        ///< CUDA Array (fest definiert fuer
+                                                                                                        ///< bestimmung welche Textur genutzt
+                                                                                                        ///< wird)
+
+                                                                       deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                             ///< the path from transducer to
+                                                                                                             ///< voxel.
+
+                                                                       SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                       deviceSAFT_VARIANT);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
+    {  // ====================================================
+       // 3DVolume Mode with SOS- and ATT-Correction
+
+        precalculateAscanIndex_usePathsKernel_SOS_ATT<<<blocksPerGrid,
+                                                        threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
+                                                                           aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
+                                                                                             // vorberechnet werden können
+                                                                           deviceSosAttFieldCuArray,
+                                                                           currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
+                                                                                                       ///< grid the pre-calculation is performed
+                                                                                                       ///< for.
+                                                                           maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
+                                                                                                       ///< sound grid the pre-calculation is
+                                                                                                       ///< performed for.
+                                                                           // currentEmIndexUsedForAscanIndexCalculation,		///< current
+                                                                           // Index of Em for which the AscanIndex is calculated
+                                                                           maxSoSReceiverArrayForTexture,
+                                                                           deviceEmitterIndex_block,             // Speicheradresse fuer EmitterIndexdaten
+                                                                           deviceReceiverIndex_block,            // Speicheradresse fuer ReceiverIndexdaten
+                                                                           TableAscanIndexAllocationCount,       ///< Anzahl der benoetigten AscanBlocks
+                                                                                                                 ///< der Groesse 2048/4096
+                                                                           maxAscanIndexArraysInTexture,         ///< maximale Anzahl an Em/Rec in einem
+                                                                                                                 ///< CUDA Array (fest definiert fuer
+                                                                                                                 ///< bestimmung welche Textur genutzt
+                                                                                                                 ///< wird)
+                                                                           deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
+                                                                                                                 ///< the path from transducer to
+                                                                                                                 ///< voxel.
+                                                                           SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
+                                                                           deviceSAFT_VARIANT);
+    }
+
+    CUDA_CHECK(cudaGetLastError());
+
+    // ==================================================== cudaUnbindTexture
+    // Texturmemory fuer Emitter - SosPathsTables entbinden
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToEmitterPathSosBoth_preprocess));
+    // Texturmemory fuer Receiver - SosPathsTables entbinden
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth0_preprocess));
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth1_preprocess));
+    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth2_preprocess));
+}
+
+void SAFTHandler::precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceListGeometry, int geometryElementCount, float *deviceVoxelCountOutputFloat,
+                                                  float *deviceSpeedOfSoundSumOutput)
+{
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
+                         1);  // max. 512 oder 1024	Threads werden vorgegeben und
+    // dim3 threadsPerBlock (SOSGrid_XYZ.x,SOSGrid_XYZ.y,1);	// max. 512 oder
+    // 1024	Threads werden vorgegeben und
+    dim3 blocksPerGrid(1, 1, 1);  // max. 65.535			Bloecke im Grid
+                                  // berechnet.	Initialisierung
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = sosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0,
+                                                                            cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal
+                                                                                                          // anlegen und beschreiben
+
+    texRefSosAttField.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+    texRefSosAttField.addressMode[1] = cudaAddressModeClamp;
+    texRefSosAttField.addressMode[2] = cudaAddressModeClamp;
+
+    if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1)
+    {
+        texRefSosAttField.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
+    }
+    else
+    {
+        texRefSosAttField.filterMode = cudaFilterModePoint;  // Nearest Neighbor
+    }
+    texRefSosAttField.normalized = 0;
+
+    CUDA_CHECK(cudaBindTextureToArray(&texRefSosAttField, deviceSosAttFieldCuArray,
+                                      &texChannelDescSosAttField));  // Schritt 4.1 3DArray an Texturmemory
+                                                                     // binden
+
+    if (deviceListGeometry == 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray);
+    }
+
+    if (deviceListGeometry == 1)
+    {
+        if (TableVoxelToReceiverPathSosAllocationCount > 0)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]);
+        }
+        if (TableVoxelToReceiverPathSosAllocationCount > 1)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]);
+        }
+        if (TableVoxelToReceiverPathSosAllocationCount > 2)
+        {
+            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]);
+        }
+    }
+
+    precalculateAverageSpeedOfSoundKernel<<<blocksPerGrid, threadsPerBlock>>>(deviceSosAttFieldCuArray, firstZLayer, sosZLayerCount, deviceListGeometry, geometryElementCount,
+                                                                              maxSoSReceiverArrayForTexture,  // maximale Anzahl an Receivern in einem
+                                                                                                              // CUDA Array
+
+                                                                              // deviceVoxelCountOutput,
+                                                                              deviceVoxelCountOutputFloat, deviceSpeedOfSoundSumOutput,
+                                                                              //            regionOfInterestOffset,
+                                                                              SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter);
+    CUDA_CHECK(cudaGetLastError());
+
+    CUDA_CHECK(cudaUnbindTexture(&texRefSosAttField));
+}
+
+void SAFTHandler::fillCuArray(float useValue,
+                              cudaArray **deviceTextureAscanIndexFloatCuArray,  ///< CuArray to fill
+                              int TableAscanIndexAllocationCount)
+{
+    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
+                         1);  // determine neccessary amount of threads
+                              // // max. 512 oder 1024
+    dim3 blocksPerGrid(1, 1,
+                       1);  // determine neccessary amount of blocks in grid	// max. 65.535
+    blocksPerGrid.x = SOSGrid_XYZ.y;
+    blocksPerGrid.y = maxFeasibleSosZLayerCount;
+    blocksPerGrid.z = 1;
+
+    // Step 1. Bereite Output-Textur fuer AscanIndex vor
+    if (TableAscanIndexAllocationCount > 0)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
+    }
+    if (TableAscanIndexAllocationCount > 1)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
+    }
+    if (TableAscanIndexAllocationCount > 2)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
+    }
+    if (TableAscanIndexAllocationCount > 3)
+    {
+        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
+    }
+
+    // Step 2. Fuere Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden immer
+    // 1024/2048 A-Scans durchgegangen und in AscanIndex-Textur geschrieben
+    fillCuArrayKernel<<<blocksPerGrid, threadsPerBlock>>>(useValue,
+                                                          deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS
+                                                                                                ///< samples in the path
+                                                                                                ///< from transducer to
+                                                                                                ///< voxel.
+                                                          maxAscanIndexArraysInTexture,
+                                                          TableAscanIndexAllocationCount,  ///< Amount of Surfaces in
+                                                                                           ///< the Array of cuArrays
+                                                          maxFeasibleSosZLayerCount, ATTMode_3DVolume, debugMode, debugModeParameter);
+
+    CUDA_CHECK(cudaGetLastError());
+}
+
+void SAFTHandler::performSAFT(
+    int aScanIndex,              ///< The A-scan index is increased by the A-scan batch size in every iteration. It describes the offset into the A-scan samples the SAFT kernel is operating with.
+    size_t aScanWindowSize,      ///< A-scan batch size in terms of number of samples within one window.
+    int3 IMAGE_SIZE_XYZ,         ///< Bildbereichsgroesse/ROI in Voxel
+    int3 SOSGrid_XYZ,            ///< SoSGridgroesse in Voxel
+    int blockIndexOffset,        ///< Additional offset added to the z component of the block index, required because of the adjustments for partial reconstruction in different z-layers.
+    int outputWindowVoxelCount,  ///< Number of Voxels in the output window.
+    int speedOfSoundZLayer,      ///< current SoS z-layer Offset in the speed of sound grid.
+    int speedOfSoundVoxelsWithinZLayers,  ///< Number of z-layers in the speed of sound grid touched by the z-layers of the active zone of reconstruction in the region of interest.
+    int maxFeasibleSosZLayerCount,
+    int currentEmIndexUsedForAscanIndexCalculation,  ///< current Index of Em for which the AscanIndex is calculated
+    dim3 const &windowGridDimensions,  ///< Grid dimensions to be used to launch the SAFT kernel. It is smaller than the full grid dimensions and only represents the current reconstruction window.
+    dim3 const &gridDimensions,        ///< Full grid dimensions of the reconstruction.
+    dim3 const &blockDimensions,       ///< Block dimensions to be used with the SAFT kernel.
+    float *deviceSpeedOfSoundField,    ///< Pointer to SoSGrid.
+    cudaArray *deviceAScansCuArray
+    //    cudaStream_t stream ///< Stream to execute the SAFT kernel on.
+)
+{
+
+    dim3 reducedGridDimensions, reducedBlockDimensions;
+
+    reduceKernelDimensions(windowGridDimensions, blockDimensions, reducedGridDimensions, reducedBlockDimensions);
+
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS_ATT, cudaFuncCachePreferL1));
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS, cudaFuncCachePreferL1));
+    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex, cudaFuncCachePreferL1));
+
+    //	Texture Memory Adressing-mode 			// http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf  ->  3.2.11.1. Texture Memory S. 42
+    // cudaAddressModeClamp 	- Return values at the boarders if out-of range - default
+    // cudaAddressModeBorder	- Return 0 if out-of range
+    // cudaAddressModeMirror	- Mirror the values    		- For normalized coordinates
+    // cudaAddressModeWrap      - Repeating the values    	- For normalized coordinates
+
+    // Texturmemory fuer Ascans
+    cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);  // Beschreibung des RueckgabeFormats der Textur
+    texRefAscans.addressMode[0] = cudaAddressModeBorder;                                                    // Texturreferenz beschreiben
+    texRefAscans.addressMode[1] = cudaAddressModeBorder;
+
+    if (SAFT_VARIANT[SAFT_VARIANT_AscanInterpolation] == 1)
+    {
+        texRefAscans.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
+    }
+    else
+    {
+        texRefAscans.filterMode = cudaFilterModePoint;  // Nearest Neighbor
+    }
+
+    texRefAscans.normalized = 0;
+    CUDA_CHECK(cudaBindTextureToArray(&texRefAscans, deviceAScansCuArray, &texChannelDesc));
+
+    if (ATTMode_3DVolume == false)
+    {  // ========= 3DVolume Mode without ATT-Correction
+
+        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float>();  // Should do the same
+
+        // AscanIndex Path Tables ------------------------------------------------------
+        texTableAscanIndexFloat1_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableAscanIndexFloat1_0.addressMode[1] = cudaAddressModeClamp;
+        texTableAscanIndexFloat1_0.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableAscanIndexFloat1_0.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableAscanIndexFloat1_0.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableAscanIndexFloat1_0.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));
+
+        if (TableAscanIndexAllocationCount > 1)
+        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
+            texTableAscanIndexFloat1_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_1.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_1.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_1.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_1.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_1.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 2)
+        {
+            texTableAscanIndexFloat1_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_2.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_2.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_2.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_2.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_2.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 3)
+        {
+            texTableAscanIndexFloat1_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat1_3.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat1_3.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat1_3.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat1_3.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat1_3.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
+        }
+    }
+    else if (ATTMode_3DVolume == true)
+    {  // ========= 3DVolume Mode with ATT-Correction
+
+        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float2>();  // Should do the same
+
+        // AscanIndex Path Tables ------------------------------------------------------
+        texTableAscanIndexFloat2_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+        texTableAscanIndexFloat2_0.addressMode[1] = cudaAddressModeClamp;
+        texTableAscanIndexFloat2_0.addressMode[2] = cudaAddressModeClamp;
+        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+        {
+        case 0:
+            texTableAscanIndexFloat2_0.filterMode = cudaFilterModePoint;
+            break;
+        case 1:
+            texTableAscanIndexFloat2_0.filterMode = cudaFilterModeLinear;
+            break;
+        }
+        texTableAscanIndexFloat2_0.normalized = 0;
+
+        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));
+
+        if (TableAscanIndexAllocationCount > 1)
+        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
+            texTableAscanIndexFloat2_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_1.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_1.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_1.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_1.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_1.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 2)
+        {
+            texTableAscanIndexFloat2_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_2.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_2.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_2.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_2.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_2.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
+        }
+
+        if (TableAscanIndexAllocationCount > 3)
+        {
+            texTableAscanIndexFloat2_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
+            texTableAscanIndexFloat2_3.addressMode[1] = cudaAddressModeClamp;
+            texTableAscanIndexFloat2_3.addressMode[2] = cudaAddressModeClamp;
+            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
+            {
+            case 0:
+                texTableAscanIndexFloat2_3.filterMode = cudaFilterModePoint;
+                break;
+            case 1:
+                texTableAscanIndexFloat2_3.filterMode = cudaFilterModeLinear;
+                break;
+            }
+            texTableAscanIndexFloat2_3.normalized = 0;
+
+            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
+        }
+    }
+    // Vorberechnung der Koordinaten --> schnelleres Bestimmen der Voxelposition
+    float VoxelIncrement = IMAGE_RESOLUTION / SOS_RESOLUTION;
+    float3 SosVoxelStartPosition;
+    SosVoxelStartPosition.x = (regionOfInterestOffset.x - sosOffset.x) / SOS_RESOLUTION;  // Start des Bildes im SOS-Grid aus Positionsdaten bestimmen
+    SosVoxelStartPosition.y = (regionOfInterestOffset.y - sosOffset.y) / SOS_RESOLUTION;
+    SosVoxelStartPosition.z = (regionOfInterestOffset.z - sosOffset.z) / SOS_RESOLUTION;
+    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
+
+    // Anzahl der Teiltabellen, bereits hier berechnen oder übergeben
+    int TableAscanIndexAllocationCount = ceil((float)aScanWindowSize / (float)maxAscanIndexArraysInTexture);  // float is important due to ceiling
+    // printf( "TableAscanIndexAllocationCount = (%i/%i) = %i = %i\n", aScanWindowSize, maxAscanIndexArraysInTexture, TableAscanIndexAllocationCount,
+    // ceil(aScanWindowSize/maxAscanIndexArraysInTexture));
+
+    // Call of 3 SAFT Versions - AscanIndex-Varainten
+    // #####################################################################################################################################################################
+    // ==================================================== Blockmode with SOS-value per Ascan
+    // ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
+    // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
+
+    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
+    {  // ==================================================== Blockmode with SOS-value per Ascan
+        printf("\n\n --- SAFT (AscanIndex) without SOS currently not implemented --- \n");
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
+    {  
+		// ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
+        saftKernelAscanIndex_SOS<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(
+
+            aScanIndex,
+
+            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
+            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
+            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
+            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement, blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions, deviceOutput);
+    }
+    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
+    {  // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
+
+        saftKernelAscanIndex_SOS_ATT<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(
+            aScanIndex,
+            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
+            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
+            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
+            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement,
+            blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions,
+            debugMode, debugModeParameter, deviceSAFT_VARIANT,
+            deviceOutput
+
+        );
+    }
+
+    // Unbind Textures
+    CUDA_CHECK(cudaUnbindTexture(&texRefAscans));
+
+    if (ATTMode_3DVolume == false)
+    {  // ========= 3DVolume Mode without ATT-Correction
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_0));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_1));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_2));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_3));
+    }
+    else if (ATTMode_3DVolume == true)
+    {  // ========= 3DVolume Mode with ATT-Correction
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_0));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_1));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_2));
+        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_3));
+    }
+    CUDA_CHECK(cudaGetLastError());
+}
--- a/SAFT_TOFI/src/saft.cu
+++ b/SAFT_TOFI/src/saft.cu
@@ -9,8 +9,7 @@
 	- Das wird gemacht, weil CUDA keine externen Referenzen unterst<73>tzt, um Daten von anderen Compilierungs Einheiten zu referenzieren.
 */

-#include "kernel/constantMemory.cuh"	// Deklaration der Daten, die sich im Constant-Memory befinden Geometriedaten
-#include "kernel/rayTracing.cuh"		// GPU-Code für Bresenham
-#include "kernel/precalculateSpeedOfSoundKernel.cuh"	// GPU-Code Partitionierung für Bresenham. Ruft den Bresenham auf.
-#include "kernel/saftKernel.cuh"		// GPU-Kernel für SAFT
+// #include "kernel/rayTracing.cuh"		// GPU-Code für Bresenham
+// #include "kernel/precalculateSpeedOfSoundKernel.cuh"	// GPU-Code Partitionierung für Bresenham. Ruft den Bresenham auf.
+// #include "kernel/saftKernel.cuh"		// GPU-Kernel für SAFT