Commit source

2023-05-18 16:04:27 +08:00
parent 88cf81e4ea
commit c6cd188732
83 changed files with 39921 additions and 0 deletions
--- a/SAFT_ATT/CMakeLists.txt
+++ b/SAFT_ATT/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.8 FATAL_ERROR)  # 3.8 is the first CMake release with CUDA as a first-class language
+project(SaftATT)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)  # hard-coded nvcc location, set before enable_language(CUDA) so this compiler is picked up
+enable_language(CUDA)
+set(Matlab_ROOT_DIR /usr/local/Polyspace/R2019b)  # hard-coded MATLAB install path used as a hint by find_package(Matlab)
+find_package(Matlab)  # NOTE(review): not REQUIRED, so configuration continues even when MATLAB is not found
+# Shared library built from the host sources plus the CUDA translation unit saft.cu.
+add_library(SaftATT SHARED ./src/SAFT_ATT.cpp ./src/saft.cu ./src/processAScans.cpp ./src/saft.cpp )
+target_include_directories(SaftATT PRIVATE ./src /usr/local/cuda/include  /usr/local/Polyspace/R2019b/extern/include) # include paths are hard-coded, not taken from the find_package results
+set_target_properties(SaftATT PROPERTIES CUDA_SEPARABLE_COMPILATION ON)  # compile device code as relocatable device code
+target_compile_options(SaftATT PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:  # the options below apply to CUDA sources only
+                       -O3
+                       --compiler-options -fPIC
+                       --use_fast_math  # faster but less accurate math intrinsics
+                       --ptxas-options=-v  # print per-kernel register/memory usage at build time
+                       -arch compute_30 -code compute_30,sm_30 # PTX for compute_30 and SASS for sm_30; NOTE(review): recent CUDA toolkits no longer support sm_30 - confirm against the installed nvcc
+                       >)
+# NOTE(review): CUDA_RUNTIME_LIBRARY is not set anywhere in this file - confirm it is defined elsewhere, otherwise it expands to nothing.
+target_link_libraries(SaftATT PRIVATE ${CUDA_RUNTIME_LIBRARY} ${Matlab_MEX_LIBRARY} ${Matlab_MX_LIBRARY})
+set_target_properties(SaftATT PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_LIST_DIR}/src/SAFT_ATT.h)  # header exported together with the library
--- a/SAFT_ATT/src/SAFT_ATT.cpp
+++ b/SAFT_ATT/src/SAFT_ATT.cpp
--- a/SAFT_ATT/src/SAFT_ATT.h
+++ b/SAFT_ATT/src/SAFT_ATT.h
@@ -0,0 +1,8 @@
+#ifndef __SAFT_ATT_H__
+#define __SAFT_ATT_H__
+#include <mex.h>
+
+/* Use C linkage when compiled as C++; the __cplusplus guard keeps this header includable from plain C as well. */
+#ifdef __cplusplus
+extern "C"{
+#endif
+    /* Exported entry point of the SaftATT library.
+       The parameter list mirrors the MEX gateway signature (nlhs/plhs: number and array of
+       outputs, nrhs/prhs: number and array of inputs) -- NOTE(review): confirm against SAFT_ATT.cpp. */
+    void SAFT_ATT(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]);
+#ifdef __cplusplus
+} 
+#endif
+
+#endif // __SAFT_ATT_H__
--- a/SAFT_ATT/src/kernel/analysisOfTransducerVectorsKernel.hcu
+++ b/SAFT_ATT/src/kernel/analysisOfTransducerVectorsKernel.hcu
@@ -0,0 +1,76 @@
+//// printf() is only supported
+//// for devices of compute capability 2.0 and above
+//#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
+//	#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
+//#endif
+//
+///**
+//   This kernel is responsible for determining valid combinations of emitters and receivers, based on the angle between their transducer vectors.
+//   Reference: Thesis 3.3
+//   - Dieser Kernel ist verantwortlich dafür, gültige Kombinationen von Emitter und Receiver zu bestimmen, basierend auf dem Winkel zwischen den TransducerVektoren.
+//   - Reference: Thesis 3.3
+//*/
+//__global__ void analyseTransducerVectorsKernel(
+//    bool * validEmitterReceiverCombinations, ///< Boolean array in which valid combinations of emitters and receivers are held (true for valid ones, false for invalid ones). The data are stored in the following order, arranged from the fastest moving index to the slowest one: receivers, emitters.
+//    int * analysisOfTransducerVectorsDistributionCounters ///< This is a pointer to an int [2] in which the number of valid and invalid combinations is stored, for later analysis.
+//    )
+//{
+//    int
+//        threadFieldOffset,
+//        threadFieldCount;
+//    getWorkLoad(emitterReceiverCombinations, threadFieldOffset, threadFieldCount);
+//
+//    int
+//        emitterIndex = threadFieldOffset / receiverCount,
+//        receiverIndex = threadFieldOffset % receiverCount;
+//
+//    float3 currentEmitterVector = getEmitterTransducerVector(emitterIndex);
+//
+//    int
+//        invalidCombinationCount = 0,
+//        validCombinationCount = 0;
+//
+//    for(int i = threadFieldOffset, limit = threadFieldOffset + threadFieldCount; i < limit; i++)
+//    {
+//        float3 currentReceiverVector = getReceiverTransducerVector(receiverIndex);
+//        float angle = determineAngle(currentEmitterVector, currentReceiverVector);
+//        bool isValidCombination = angle <= maximumAngleBetweenEmitterAndReceiverTransducerVectors;
+//
+//        //printf( "Kernelaufruf~~~");
+//        //printf( "[th %d  bl %d] i:%i - isValidCombination: %i\n", threadIdx.x, blockIdx.x, i, isValidCombination);
+//
+//        validEmitterReceiverCombinations[i] = isValidCombination;
+//        if(normalisePerformanceStatistics)
+//        {
+//            if(isValidCombination)
+//                validCombinationCount++;
+//            else
+//                invalidCombinationCount++;
+//        }
+//        receiverIndex++;
+//        if(receiverIndex == receiverCount)
+//        {
+//            receiverIndex = 0;
+//            emitterIndex++;
+//            currentEmitterVector = getEmitterTransducerVector(emitterIndex);
+//        }
+//    }
+//    if(normalisePerformanceStatistics)
+//    {
+//        atomicAdd(analysisOfTransducerVectorsDistributionCounters + 0, invalidCombinationCount);
+//        atomicAdd(analysisOfTransducerVectorsDistributionCounters + 1, validCombinationCount);
+//    }
+//}
+//
+///**
+//   Proxy function to launch the actual kernel that determines the valid combinations of emitters and receivers.
+//   - Proxyfunktion um den aktuellen Kernel aufzurufen, der die gültigen Kombinationen von Emitter und Receiver bestimmt
+//*/
+//void SAFTHandler::analyseTransducerVectors(
+//    dim3 gridDimensions, ///< Grid dimensions to be used by the kernel.
+//    dim3 blockDimensions ///< Block dimensions to be used by the kernel.
+//    )
+//{
+//    analyseTransducerVectorsKernel<<<gridDimensions, blockDimensions>>>(deviceValidEmitterReceiverCombinations, deviceTransducerVectorAnalysisDistributionCounters);
+//    CUDA_CHECK(cudaGetLastError());
+//}
--- a/SAFT_ATT/src/kernel/constantMemory.hcu
+++ b/SAFT_ATT/src/kernel/constantMemory.hcu
@@ -0,0 +1,54 @@
+/*!
+	Emitter and receiver geometry held in constant memory, available across all functions in saft.cu because all of it is held in the same compilation unit.
+	- Emitter und Receiver Geometrie werden im Constant Memory gehalten, erreichbar für alle Funktionen in Saft.cu, weil alle von ihnen in der selben Kompilierungs-Einheit gehalten werden.
+*/
+
+#include "saft.hpp"
+
+#ifdef SaftUseConstantMemforGeometry
+	// Emitter/receiver position arrays in __constant__ memory; the array names differ per build variant (sosInKernel / arith / harmon).
+	#ifdef SaftCalcSoSInKernel
+		__constant__ float3 emitterPOSsosInKernel[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+		__constant__ float3 receiverPOSsosInKernel[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+
+        //__constant__ float3 emitterPOSsosInKernel[157 * 4];
+		//__constant__ float3 receiverPOSsosInKernel[157 * 9];
+
+		// Pointer variables (no __device__/__constant__ qualifier) holding the address of element 0 of each array.
+		// NOTE(review): presumably used as the symbol handle for uploads from the host - confirm against the callers.
+		float3* constEmitterPtr = &emitterPOSsosInKernel[0];
+		float3* constReceiverPtr = &receiverPOSsosInKernel[0];
+
+	#else
+		#ifdef SaftUseArithmeticMean		// Needed because of the duplicate naming :-(
+			__constant__ float3 emitterPOSarith[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+			__constant__ float3 receiverPOSarith[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+            
+            //__constant__ float3 emitterPOSarith[157 * 4];
+			//__constant__ float3 receiverPOSarith[157 * 9];
+
+			float3* constEmitterPtr = &emitterPOSarith[0];
+			float3* constReceiverPtr = &receiverPOSarith[0];
+		#endif
+		#ifdef SaftUseHarmonicMean		// NOTE(review): independent of the SaftUseArithmeticMean block; with both macros defined constEmitterPtr/constReceiverPtr are defined twice
+			__constant__ float3 emitterPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+			__constant__ float3 receiverPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+
+            //__constant__ float3 emitterPOSharmon[157 * 4];
+			//__constant__ float3 receiverPOSharmon[157 * 9];
+
+			float3* constEmitterPtr = &emitterPOSharmon[0];
+			float3* constReceiverPtr = &receiverPOSharmon[0];
+		#endif
+	#endif
+
+	// Lookup tables in __constant__ memory, one unsigned short per emitter/receiver list entry. NOTE(review): presumably map a geometry-list index to its memory slot - confirm.
+	__constant__ unsigned short lookUpGeometryMemoryListEmitter [MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+	__constant__ unsigned short lookUpGeometryMemoryListReceiver[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
+
+    //__constant__ unsigned short lookUpGeometryMemoryListEmitter [157 * 4];
+	//__constant__ unsigned short lookUpGeometryMemoryListReceiver[157 * 9];
+	// Pointer variables holding the address of element 0 of each lookup table (same pattern as constEmitterPtr/constReceiverPtr).
+	unsigned short* constLookUpGeometryMemoryListEmitterPtr = &lookUpGeometryMemoryListEmitter[0];
+	unsigned short* constLookUpGeometryMemoryListReceiverPtr = &lookUpGeometryMemoryListReceiver[0];
+#endif
--- a/SAFT_ATT/src/kernel/precalculateSpeedOfSoundKernel.hcu
+++ b/SAFT_ATT/src/kernel/precalculateSpeedOfSoundKernel.hcu
@@ -0,0 +1,684 @@
+#include <stdio.h>
+#include "saft.hpp"
+//#include <mex.h>
+
+// printf() is only supported
+// for devices of compute capability 2.0 and above
+// NOTE(review): the fallback macro expands to `(f, )` when printf is called with a format string only, which does not compile if this branch is ever active.
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
+	#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
+#endif
+// Coordinates of the single SoS voxel for which the kernel prints debug output.
+#ifdef debug_CudaPrecalculateKernel
+	#define DebugSosVoxelX  5
+	#define DebugSosVoxelY  5
+	#define DebugSosVoxelZ  5
+#endif
+//	#define DebugSosVoxelX  64
+//	#define DebugSosVoxelY  64
+//	#define DebugSosVoxelZ  64
+
+// Float1 variant: separate surfaces for the emitter SoS path tables (path SoS sum and path voxel count)
+#ifdef SaftTextureForEmRecSosPathsTablesFloat1
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmitterPathSosSum;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmitterPathCount;
+
+	// Surfaces for the receiver SoS path tables, split across three surfaces (0..2) per quantity
+	//surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSumTest;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum0;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum1;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum2;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount0;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount1;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount2;
+#endif
+// Float2/Float4 variants: one combined emitter surface; the kernel writes (SoS sum, voxel count) as float2 or (SoS sum, voxel count, attenuation, 0) as float4.
+#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmPathSosBoth;
+
+	// Combined surfaces for the receiver SoS path tables, split across three surfaces (0..2)
+	//surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSumTest;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth0;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth1;
+	surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth2;
+#endif
+
+
+__global__ void precalculateAverageSpeedOfSoundKernel(
+
+	#ifndef SaftTextureForBresenhamSosPaths
+		float const * deviceSpeedOfSoundField, 		///< Array of speed of sound samples. Dimensions ordered by speed of indices, commencing with the fastest moving one: 1. x 2. y 3. z
+	#else
+
+		#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
+			cudaArray *deviceSpeedOfSoundFieldCuArray,	///< CuArray fuer SOSFieldTextur
+		#endif
+		#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
+			cudaArray *deviceSosAttFieldCuArray,	///< CuArray fuer SosAttFieldTextur
+		#endif
+	#endif
+		int firstZLayer, 							///< First z-layer in the speed of sound grid the pre-calculation is performed for.
+		int sosZLayerCount, 						///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
+		#ifdef SaftUseConstantMemforGeometry
+			int geometry, 							///< emitters=0 or receivers=1.
+		#else
+			float3 const * geometry, 				///< Vector array describing the positions of emitters or receivers.
+		#endif
+		int geometryElementCount, 				    ///< Number of elements in the geometry array.
+		int maxSoSReceiverArrayForTexture,		    ///< max amount of elements in the receiver CUDA array.
+	//    VoxelCountType * deviceVoxelCountOutput,  ///< fuer Count im Integerformat gedacht fuer Texturmemory.
+		float * deviceVoxelCountOutputFloat,	    ///< fuer Count im Floatformat gedacht fuer Texturmemory.
+		float * speedOfSoundSumOutput, 			    ///< fuer SoS im Floatformat gedacht fuer Texturmemory.
+	//    float3 regionOfInterestOffset,
+		int3 SOSGrid_XYZ,
+		float3 sosOffset,
+		float3 regionOfInterestOffset,
+		float IMAGE_RESOLUTION,
+		float SOS_RESOLUTION,
+		float debugMode,
+		float debugModeParameter
+)
+{
+
+
+    dim3 SosVoxel
+    (
+		threadIdx.x ,			    // SoS-Voxel X		? Threads fangen an bei 0 an
+		blockIdx.x ,			    // SoS-Voxel Y
+		blockIdx.y + firstZLayer    // SoS-Voxel Z  +  Offset
+    );
+
+	#ifdef debug_CudaPrecalculateKernel
+    	//printf(" SosVoxel.x,y,z  =	[%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z);	// Herausfinden welche berechnet werden
+		if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
+		{
+
+			int threadCountAll = gridDim.z * gridDim.x * blockDim.x;		// = Anzahl aller Threads X*Y*Z
+			int threadIndex = blockDim.x * (blockIdx.y * gridDim.x + blockIdx.x) + threadIdx.x;
+
+			printf("==================================================================\n");
+			printf(" threadCountAll  =	%i\n", threadCountAll);		// Anzahl aller Threads //Brauche ich wahrscheinlich gar nicht.
+			printf(" threadIndex     =	%i\n", threadIndex);		// Threadindex von aktuellem Kernel
+			printf(" SosVoxel.x,y,z  =	[%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z);	// In welchem SoS-Voxel befinde ich mich?
+			printf(" geometryElementCount =	%i\n", geometryElementCount);	// Wie viele Elemente gibt es in der Emitter/receiverListe?
+			printf("==================================================================\n");
+		}
+	#endif
+
+//		if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
+//		{
+//			printf(" PrecalculateKernel: debugMode [%i] for geometry[%i]\n", debugMode, geometry);
+//		}
+
+	int voxelCount;			// Anzahl der Voxel auf einem SoS-Pfad
+	float totalSpeed = 0.0;				// SoSSumme auf einem SoS-Pfad
+	float totalAttenuation = 0.0;		// AttSumme auf einem Attenuation-Pfad
+
+	dim3 SosGeometryVoxel; 	// SoSVoxel von Emitter/Receiver
+	float3 SosGeometryVoxelFloat; 	// SoSVoxel von Emitter/Receiver in Float
+	float SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION;	// Aufluesung im SoS-Grid
+	//int tableIndex;			// Index innerhalb TableVoxelToEmitter/ReceiverPath
+
+	// Speicher in Texturformat
+	// int xmax = SOSGrid_XYZ.x;
+	// int ymax = SOSGrid_XYZ.y;
+	// int zmax = sosZLayerCount; //SOSGrid_XYZ.z;
+	int i_x = SosVoxel.x;
+	int i_y = SosVoxel.y;
+	int i_z = (SosVoxel.z-firstZLayer); // float SosVoxelTextureZ = (SosVoxelf.z - speedOfSoundZLayer);
+	//int Index;
+	int TexturGeometryIndexZ;
+
+	float3 currentGeometry;
+
+	for(int geometryIndexCounter = 0; geometryIndexCounter < geometryElementCount; geometryIndexCounter++)		// Alle Emitter oder Receiver in der Liste von Matlab durchgehen
+	{
+		int lookUpGeometryIndex = 0;
+
+		// Lade lookUpGeometryMemoryList-Eintrag, um Position im Memory zu bestimmen
+		if (geometry == 0)   //         => Emitter
+			{
+				lookUpGeometryIndex = lookUpGeometryMemoryListEmitter[geometryIndexCounter];	// Load from Constant Memory
+			}
+		else //if (geometry == 1)	=> Receiver
+			{
+				lookUpGeometryIndex = lookUpGeometryMemoryListReceiver[geometryIndexCounter];	// Load from Constant Memory
+			}
+
+
+		//if (currentGeometry.x != 255)		// currentGeometry.x = 255 ist außerhalb des Wertebereichs und zeigt an, das Geometrie nicht genutzt wird. Darum muss nicht berechnet werden.
+		if (lookUpGeometryIndex != 65535)	// currentGeometry.x = 65535 ist außerhalb des Wertebereichs und zeigt an, das Geometrie nicht genutzt wird. Darum muss nicht berechnet werden.
+		{
+
+			#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
+				if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
+					{
+						printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i): lookUpGeometryIndex(%i)\n", geometry, geometryIndexCounter, lookUpGeometryIndex);		// In welche Speicherstelle wird geschrieben
+													printf(" SOSGrid_XYZ.x,y,z          =	[%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z);	// In welchem SoS-Voxel befinde ich mich?
+													printf(" geometryElementCount       =	%i\n", geometryElementCount);	// Wie viele Elemente gibt es in der Emitter/receiverListe?
+													printf("-------------------------------------------------------------------\n");
+													printf(" SosVoxel.x,y,z             =	[%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z);	// In welchem SoS-Voxel befinde ich mich?
+													printf(" geometryIndexCounter       =	%i\n", geometryIndexCounter);	// Wie viele Elemente gibt es in der Emitter/receiverListe?
+													printf(" firstZLayer                =	%i\n", firstZLayer);	// zLayer Offset, welcher wird zur Zeit berechnet?
+													printf(" sosZLayerCount             =	%i\n", sosZLayerCount);	// Anzhal der zu berechnenden zLayer?
+													printf("-------------------------------------------------------------------\n");
+													printf(" speedOfSoundSumOutput_Index=	%i\n", SOSGrid_XYZ.x*(SOSGrid_XYZ.y*SOSGrid_XYZ.y*geometryIndexCounter+SOSGrid_XYZ.y*(SosVoxel.z-firstZLayer)+SosVoxel.y)+SosVoxel.x);		// In welche Speicherstelle wird geschrieben
+													printf(" totalSpeed                 =	%f\n", totalSpeed);		// Berechnete Geschwindigkeit
+													printf(" write i_x,i_y,i_z          =	[%i %i %i]\n", i_x, i_y, i_z);	// In welchem SoS-Voxel schreibe ich?
+													printf("==================================================================\n");
+					}
+			#endif
+
+			// Wenn Emitter/Receiver genutzt werden Koordinaten laden
+			#ifdef SaftUseConstantMemforGeometry
+				if (geometry == 0)   //         => Emitter
+					{
+						#ifdef SaftCalcSoSInKernel
+							currentGeometry = emitterPOSsosInKernel[geometryIndexCounter];	// Positionsdaten von Emitter lesen
+						#else
+							#ifdef SaftUseArithmeticMean		// Nötig wegen Doppelnennung :-(
+								currentGeometry = emitterPOSarith[geometryIndexCounter];	// Positionsdaten von Emitter lesen
+							#endif
+							#ifdef SaftUseHarmonicMean
+								currentGeometry = emitterPOSharmon[geometryIndexCounter];	// Positionsdaten von Emitter lesen
+							#endif
+						#endif
+						//syncthreads();
+					}
+
+					else //if (geometry == 1)	=> Receiver
+					{
+						#ifdef SaftCalcSoSInKernel
+							currentGeometry = receiverPOSsosInKernel[geometryIndexCounter];	// Positionsdaten von Receiver lesen
+						#else
+							#ifdef SaftUseArithmeticMean		// Nötig wegen Doppelnennung :-(
+								currentGeometry = receiverPOSarith[geometryIndexCounter];	// Positionsdaten von Receiver lesen
+							#endif
+							#ifdef SaftUseHarmonicMean
+								currentGeometry = receiverPOSharmon[geometryIndexCounter];	// Positionsdaten von Receiver lesen
+							#endif
+						#endif
+						//syncthreads();
+					}
+			#endif
+
+			#ifndef SaftUseConstantMemforGeometry
+				currentGeometry = geometry[geometryIndexCounter];	// Positionsdaten von Emitter/Receiver lesen
+			#endif
+
+
+
+
+
+			// Versuche mit Geometrie d.h. E/R-Kooridinaten um einen halben Voxel zu verschieben ==> muesste eigentlich bei beiden, S/E-Koordinaten und Voxel, gemacht werden
+			//determineSpeedOfSoundFieldVoxel(currentGeometry , SosGeometryVoxel, SOS_RESOLUTION_FACTOR);	// SoSVoxel von Emitter/Receiver bestimmen
+//			currentGeometry_plushalf.x = currentGeometry.x + IMAGE_RESOLUTION/2;	// Emitter/Receiver-Position in SoS-Koordinaten Umwandeln
+//			currentGeometry_plushalf.y = currentGeometry.y + IMAGE_RESOLUTION/2;	// halbe Koordinaten hier nicht benuetigt, da ja die genaue Positionsdaten da sind
+//			currentGeometry_plushalf.z = currentGeometry.z + IMAGE_RESOLUTION/2;
+
+//			// Versuche im Integerformat
+//			//determineSpeedOfSoundFieldVoxel     (currentGeometry_plushalf , SosGeometryVoxel, sosOffset, SOS_RESOLUTION_FACTOR);// SoSVoxel von E/R bestimmen // currentGeometry + 1/2--> SosGeometryVoxel
+//			determineSpeedOfSoundFieldVoxel       (currentGeometry , SosGeometryVoxel,      sosOffset, SOS_RESOLUTION_FACTOR);	// SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Integer
+//			//                            out         out         voxel1(E/R)            voxel2(SoSVoxel)  SoSField                 Size of SoSField
+// 			performRayTracedSpeedAddition(voxelCount, totalSpeed, SosGeometryVoxel,      SosVoxel,         deviceSpeedOfSoundField, SOSGrid_XYZ);	// SosGeometryVoxel im Integerformat, SoSVoxel als Integer
+
+
+			// Bestimmen der SoS-Koordinaten fuer die Sender/Empfuenger-Koordinaten
+ 			determineSpeedOfSoundFieldVoxelFloat(currentGeometry, SosGeometryVoxelFloat, sosOffset, SOS_RESOLUTION_FACTOR);    // SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Float
+ 			//determineSpeedOfSoundFieldVoxelFloat(currentGeometry_plushalf, SosGeometryVoxelFloat, sosOffset, SOS_RESOLUTION_FACTOR);    // SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Float
+
+ 			// Nutzen der Bresenham-Floatvariante
+ 			//                                 out         out         voxel1(E/R)            voxel2(SoSVoxel)  SoSField                 Size of SoSField  , E=0/R=1
+ 			//performRayTracedSpeedAdditionFloat(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel,         deviceSpeedOfSoundField, SOSGrid_XYZ       , geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+
+ 			// Nutzen der Bresenham-Floatvariante mit Texturmemory und Interpolation
+ 			//                                   out         out         voxel1(E/R)            voxel2(SoSVoxel+0.5)  SoSField              Size of SoSField  																		, E=0/R=1
+ 			//performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel,         deviceSpeedOfSoundField,  SOSGrid_XYZ       ,sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+
+			#ifndef SaftTextureForBresenhamSosPaths	// SOS-Volume ueber Array oder normal ansprechen?!
+						performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel,         deviceSpeedOfSoundField,  SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+			#else
+						//performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel,  deviceSpeedOfSoundFieldCuArray,  SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+
+					#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
+						performRayTracedSpeedAdditionTexture (voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel,  deviceSpeedOfSoundFieldCuArray,  SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+					#endif
+					#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
+						performRayTracedSpeedAdditionTexture (voxelCount, totalSpeed, totalAttenuation, SosGeometryVoxelFloat, SosVoxel,  deviceSosAttFieldCuArray,  SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry);	// SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
+					#endif
+			#endif
+
+			#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
+					//if ((SosVoxel.y == DebugSosVoxelY) && ( (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX))){
+					if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ)){
+						//printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f]:[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f), Index[Table,Index] = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed, tableIndex, Index);		// In welche Speicherstelle wird geschrieben
+						#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
+							printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f]:[%+3.6f %+3.6f %+3.6f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f)\n\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed);		// In welche Speicherstelle wird geschrieben
+						#endif
+						#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
+							printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f]:[%+3.6f %+3.6f %+3.6f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f), totalAttenuation(%3.3f)\n\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed, totalAttenuation);		// In welche Speicherstelle wird geschrieben
+						#endif
+					}
+			#endif
+
+
+
+				if (geometry == 0)	// Emitter
+					{
+					//speedOfSoundSumOutput[Index] = totalSpeed;						// Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
+					//deviceVoxelCountOutputFloat[Index] = (float)voxelCount;
+					//speedOfSoundSumOutput[Index] = 0.0f;
+					//deviceVoxelCountOutputFloat[Index] = 0.0f;
+
+					TexturGeometryIndexZ  = sosZLayerCount * lookUpGeometryIndex + i_z;
+
+					#ifdef SaftTextureForEmRecSosPathsTablesFloat1	// Float1
+						surf3Dwrite(       totalSpeed, outSurfRefTableVoxelToEmitterPathSosSum, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+						surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToEmitterPathCount,  i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathCount
+					#endif
+					#ifdef SaftTextureForEmRecSosPathsTablesFloat2 // Float2
+						float2 VoxelValues;
+						VoxelValues.x = totalSpeed;
+						VoxelValues.y = (float)voxelCount;
+						surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToEmPathSosBoth, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+					#endif
+					#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // Float4
+						float4 VoxelValues;
+						VoxelValues.x = totalSpeed;
+						VoxelValues.y = (float)voxelCount;
+
+//						if (totalAttenuation>debugModeParameter)	// Max Border for Attenuation Correction
+//							VoxelValues.z = debugModeParameter;		// Average Attenuation on this Path
+//						else
+							VoxelValues.z = totalAttenuation;		// Average Attenuation on this Path
+
+
+						VoxelValues.w = 0.0f;
+						surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToEmPathSosBoth, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+					#endif
+
+					}
+				else
+					{
+					//speedOfSoundSumOutput[Index] = totalSpeed;						// Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
+					//deviceVoxelCountOutputFloat[Index] = (float)voxelCount;
+
+					//speedOfSoundSumOutput[Index] = 0.0f;
+					//deviceVoxelCountOutputFloat[Index] = 0.0f;
+
+					TexturGeometryIndexZ = sosZLayerCount * ((lookUpGeometryIndex) % maxSoSReceiverArrayForTexture) + i_z;
+
+					#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
+						//if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
+						//	printf(">>>> %i >>>> Precalc: geomIdxCounter(%4i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] firstZLayer(%i) ==> TexturNr.[%3i], TexturGeometryIndexZ(%3i), lookUpGeometryIndex(%4i)\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) , TexturGeometryIndexZ, lookUpGeometryIndex);		// In welche Speicherstelle wird geschrieben
+					#endif
+
+					#ifdef SaftTextureForEmRecSosPathsTablesFloat1	// Float1
+						if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
+							surf3Dwrite((float)totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum0, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount0,  i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							}
+						else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
+							surf3Dwrite(       totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum1, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount1,  i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							}
+						else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
+							surf3Dwrite(       totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum2, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount2,  i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
+							}
+					#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat2 // Float2
+					float2 VoxelValues;
+					VoxelValues.x = totalSpeed;
+					VoxelValues.y = (float)voxelCount;
+
+					if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
+						surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth0, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+					else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
+						surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth1, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+					else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
+						surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth2, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // Float4
+					float4 VoxelValues;
+					VoxelValues.x = totalSpeed;			// Average SoS on this Path
+					VoxelValues.y = (float)voxelCount;	// Amount of visited voxel
+
+//					if (totalAttenuation>debugModeParameter)	// Max Border for Attenuation Correction
+//						VoxelValues.z = debugModeParameter;		// Average Attenuation on this Path
+//					else
+						VoxelValues.z = totalAttenuation;		// Average Attenuation on this Path
+
+					VoxelValues.w = 0.0f;			// Amount of visited voxel
+					if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
+						surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth0, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+					else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
+						surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth1, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+					else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
+						surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth2, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
+						}
+				#endif
+				//speedOfSoundSumOutput[Index] = totalSpeed;						// Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
+				//deviceVoxelCountOutput[tableIndex] = typedVoxelCount;			// Fuellen der TableVoxelToEmitter/ReceiverPathCount
+					}
+
+			#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
+			//					//printf(" SosVoxel.x,y,z  =	[%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z);	// Herausfinden welche berechnet werden
+
+				if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
+					{
+
+						//printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f]  - SOSVoxel [%3i %3i %3i] firstZLayer(%i)\n>>>>>>>>>>>> VoxelCnt(%i), SoSSum(%3.3f), SoSSum_Index = %i\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, (int)typedVoxelCount, totalSpeed, Index);		// In welche Speicherstelle wird geschrieben
+						printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f]  - SOSVoxel [%3i %3i %3i] firstZLayer(%i)\n>>>>>>>>>>>> surf3Dwrite Textur[%3i %3i %3i], TexturGeometryIndexZ(%3i) = VoxelCnt(%3.6f), SoSSum(%3.6f) = avgSpeed(%3.6f) \n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, i_x,i_y,i_z, TexturGeometryIndexZ, (float)voxelCount, totalSpeed, (1/(totalSpeed/(float)voxelCount)));		// In welche Speicherstelle wird geschrieben
+							//printf("======%i %i %i============================================================\n", geometry,geometry,geometry);
+							printf(" SOSGrid_XYZ.x,y,z          =	[%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z);	// In welchem SoS-Voxel befinde ich mich?
+							printf(" geometryElementCount       =	%i\n", geometryElementCount);	// Wie viele Elemente gibt es in der Emitter/receiverListe?
+							printf("-------------------------------------------------------------------\n");
+							printf(" SosVoxel.x,y,z             =	[%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z);	// In welchem SoS-Voxel befinde ich mich?
+							printf(" firstZLayer                =	%i\n", firstZLayer);	// zLayer Offset, welcher wird zur Zeit berechnet?
+							printf(" geometryIndexCounter       =	%i\n", geometryIndexCounter);	// Welches Elemente aus der Emitter/receiverListe?
+							//printf(" TexturGeometryIndexZ       =	%i\n", TexturGeometryIndexZ);	// zLayer Offset, welcher wird zur Zeit berechnet?
+							printf(" lookUpGeometryIndex        =	%i => ### %i in [%i] ###\n", lookUpGeometryIndex, TexturGeometryIndexZ, (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture));	// Welcher Index hat Emitter/receiver?
+							printf(" i_z = (SosVxl.z-firstZLay) =	%i\n", i_z);	// zLayer Offset, welcher wird zur Zeit berechnet?
+
+							//printf("-------------------------------------------------------------------\n");
+							//printf(" speedOfSoundSumOutput_Index=	%i\n", SOSGrid_XYZ.x*(SOSGrid_XYZ.y*SOSGrid_XYZ.y*geometryIndexCounter+SOSGrid_XYZ.y*(SosVoxel.z-firstZLayer)+SosVoxel.y)+SosVoxel.x);		// In welche Speicherstelle wird geschrieben
+//										printf(" totalSpeed                 =	%f\n", totalSpeed);		// Berechnete Geschwindigkeit
+//										printf("==================================================================\n");
+					}
+			#endif
+
+
+//			//#ifdef debug_CudaPrecalculateKernel
+//				//if ((SosVoxel.y == DebugSosVoxelY) && ( (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX))){
+//						//printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), 1/SoStotalSpeed(%3.3f), Index[Table,Index] = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)typedVoxelCount, totalSpeed, tableIndex, Index);		// In welche Speicherstelle wird geschrieben ?
+//						printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), 1/SoStotalSpeed(%3.3f) = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (float)voxelCount, totalSpeed);		// In welche Speicherstelle wird geschrieben ?
+//						//			// Speicher in Texturformat
+//						//			// Indexberechnung für Einsatz des Texturmemorys
+//						//			float xmax = SOSGrid_XYZ.x;
+//						//			float ymax = SOSGrid_XYZ.y;
+//						//			float zmax = (float)maxFeasibleSosZLayerCount;
+//						//			float i_x = SosVoxel.x;
+//						//			float i_y = SosVoxel.y;
+//						//			float i_z = (float)(int)(SosVoxelTextureZ);	// float SosVoxelTextureZ = (SosVoxelf.z - speedOfSoundZLayer);
+//
+//						//Index = xmax*(ymax*(zmax*geometryIndexCounter+i_z)+i_y)+i_x;  // ohne lookUpGeometryIndex-Liste linear im Speicher liegend
+//						//Index = xmax*(ymax*(zmax*lookUpGeometryIndex+i_z)+i_y)+i_x;		// mit lookUpGeometryIndex-Liste
+//
+//						//printf(">>>> %i >>>> Index = xmax(%i)*(ymax(%i)*(zmax(%i)*geometryIndexCounter(%i)+i_z(%i))+i_y(%i))+i_x(%i) = [%i]\n", geometry, (int)xmax, (int)ymax, (int)zmax, geometryIndexCounter, (SosVoxel.z-firstZLayer), SosVoxel.y, SosVoxel.x,Index);		// In welche Speicherstelle wird geschrieben ?
+//						printf(">>>> %i >>>> Index = xmax(%i)*(ymax(%i)*(zmax(%i)*lookUpGeometryIndex(%i)+i_z(%i))+i_y(%i))+i_x(%i) = [%i]\n", geometry, (int)xmax, (int)ymax, (int)zmax, lookUpGeometryIndex, (SosVoxel.z-firstZLayer), SosVoxel.y, SosVoxel.x,Index);		// In welche Speicherstelle wird geschrieben ?
+//				//}
+//			//#endif
+
+			// Alle berechneten SOS-Voxel ausgeben mit Index
+			//printf(" SosVoxel.x,y,z  =	[%i %i %i] => Index (%i)\n", SosVoxel.x, SosVoxel.y, SosVoxel.z, Index);	// In welchem SoS-Voxel befinde ich mich?
+		}
+
+	}
+
+}
+
+
+/**
+   Host-side proxy that launches precalculateAverageSpeedOfSoundKernel for a
+   slab of z-layers of the speed of sound (SoS) grid.
+
+   Depending on the compile-time switches this function
+   - SaftTextureForBresenhamSosPaths: configures the SoS texture reference
+     (SaftUseSosAttFloat1) or the combined SoS+Att texture reference
+     (SaftUseSosAttFloat2), binds it to its CUDA array before the launch and
+     unbinds it again afterwards;
+   - SaftTextureForEmRecSosPathsTables: binds the output surface references to
+     the emitter arrays (deviceListGeometry == 0) or to up to three receiver
+     arrays (deviceListGeometry == 1, limited by
+     TableVoxelToReceiverPathSosAllocationCount).
+
+   Launch layout: grid = (SOSGrid_XYZ.y, sosZLayerCount, 1),
+   block = (SOSGrid_XYZ.x, 1, 1). A launch configuration rejected by the
+   runtime (e.g. SOSGrid_XYZ.x larger than the device's block size limit) is
+   reported through CUDA_CHECK(cudaGetLastError()) right after the launch.
+   No device synchronization is performed here, so faults that occur while the
+   kernel is running surface at a later synchronizing call.
+
+   All CUDA runtime calls, including the surface bindings, are routed through
+   CUDA_CHECK so that a failed binding is reported at the call that caused it
+   instead of being picked up (and misattributed) after the kernel launch.
+
+   NOTE(review): the comparisons `deviceListGeometry == 0 / == 1` and the debug
+   printf with %i assume the integer variant of the parameter
+   (SaftUseConstantMemforGeometry defined) - confirm that
+   SaftTextureForEmRecSosPathsTables is never enabled without it.
+*/
+
+// Example call:
+//precalculateAverageSpeedOfSound(
+//		currentSpeedOfSoundZLayer,
+//		maxFeasibleSosZLayerCount,
+//		0,
+//		emitter_list_Size,
+//		deviceTableVoxelToEmitterPathCount,
+//		deviceTableVoxelToEmitterPathCountFloat,
+//		deviceTableVoxelToEmitterPathSosSum);
+
+void SAFTHandler::precalculateAverageSpeedOfSound
+(
+    int firstZLayer, 						 ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
+    int sosZLayerCount, 					 ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
+#ifdef SaftUseConstantMemforGeometry
+    int deviceListGeometry,					///< emitters=0 or receivers=1.
+#else
+    float3 const * deviceListGeometry, 	 ///< Vector array describing the positions of emitters or receivers.
+#endif
+
+    int geometryElementCount, 				 ///< Number of elements in the geometry array got from Matlab
+    //VoxelCountType * deviceVoxelCountOutput, ///< Out: # of voxels in the path from a transducer element to a voxel.
+    float * deviceVoxelCountOutputFloat,	 ///< Out: # of voxels in the path from a transducer element to a voxel in Float format.
+    float * deviceSpeedOfSoundSumOutput 	 ///< Out: Sum of SoS samples in the path from transducer to voxel.
+//    int blocksPerGrid, 					 ///< Number of blocks per grid to be used to execute the kernel.
+//    int threadsPerBlock, 					 ///< Number of threads per block to be used to execute the kernel.
+//    cudaStream_t stream, 					 ///< Stream to be used for the execution of the kernel.
+)
+{
+	#ifdef debug_OutputFunctions
+		printf( "==> SAFTHandler::precalculateAverageSpeedOfSound - Start\n");
+	#endif
+
+
+
+	// One thread per x position of the SoS grid; the block size is dictated by the grid width.
+	dim3 threadsPerBlock (SOSGrid_XYZ.x,1,1);
+	//dim3 threadsPerBlock (SOSGrid_XYZ.x,SOSGrid_XYZ.y,1);
+	// One block per (y, z-layer) pair.
+	dim3 blocksPerGrid (1,1,1);					// Initialization
+	blocksPerGrid.x = SOSGrid_XYZ.y;
+	blocksPerGrid.y = sosZLayerCount;
+	blocksPerGrid.z = 1;
+
+	#ifdef debug_CudaPrecalculateKernel
+		int sosZLayerVoxelCountToProcess = sosZLayerVoxelCount * sosZLayerCount;	// Number of voxels that are going to be processed
+
+		printf("===========================================================================================\n");
+		printf(" deviceListGeometry: %i (0=Em/1=Rec)\n", deviceListGeometry);
+		printf(" geometryElementCount: %i\n", geometryElementCount);
+
+		printf(" sosZLayerVoxelCountToProcess = sosZLayerVoxelCount(%i) * sosZLayerCount(%i) = %i\n", sosZLayerVoxelCount, sosZLayerCount, sosZLayerVoxelCountToProcess);
+		printf(" threadsPerBlock x,y,z: [%i %i %i]\n", threadsPerBlock.x, threadsPerBlock.y, threadsPerBlock.z);
+		printf(" blocksPerGrid   x,y,z: [%i %i %i]\n", blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z);
+		printf(" firstZLayer (Start z): %i\n", firstZLayer);
+		//printf(" SOSGrid_XYZ     x,y,z: [%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z);
+		printf("===========================================================================================\n");
+	#endif
+
+	#ifdef SaftTextureForBresenhamSosPaths
+		// Prepare texture for the speed of sound field
+		#ifdef SaftUseSosAttFloat1		// Use separate textures for the two volumes (Sos+Att)
+			cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);	// Single 32-bit float channel returned by the SpeedOfSoundField texture
+
+			// Configure the texture reference: clamp out-of-range coordinates in all three dimensions
+			texRefSpeedOfSoundField.addressMode[0] = cudaAddressModeClamp;
+			texRefSpeedOfSoundField.addressMode[1] = cudaAddressModeClamp;
+			texRefSpeedOfSoundField.addressMode[2] = cudaAddressModeClamp;
+
+			if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1){
+				texRefSpeedOfSoundField.filterMode = cudaFilterModeLinear;		// Linear interpolation
+			}
+			else{
+				texRefSpeedOfSoundField.filterMode = cudaFilterModePoint;		// Nearest neighbor
+			}
+//			#ifdef SaftTextureForBresenhamInterpolated
+//				texRefSpeedOfSoundField.filterMode     = cudaFilterModeLinear;
+//			#else
+//				texRefSpeedOfSoundField.filterMode     = cudaFilterModePoint;
+//			#endif
+			texRefSpeedOfSoundField.normalized     = 0;		// Unnormalized texture coordinates
+
+			CUDA_CHECK(cudaBindTextureToArray ( &texRefSpeedOfSoundField, deviceSpeedOfSoundFieldCuArray, &texChannelDescSpeedOfSoundField )); // Step 4.1: bind the 3D array to the texture reference
+		#endif
+
+
+		#ifdef SaftUseSosAttFloat2      // Use a single texture for both volumes (Sos+Att)
+			cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);		// Step 2.1: two 32-bit float channels returned by the combined texture
+
+			// Configure the texture reference: clamp out-of-range coordinates in all three dimensions
+			texRefSosAttField.addressMode[0] = cudaAddressModeClamp;
+			texRefSosAttField.addressMode[1] = cudaAddressModeClamp;
+			texRefSosAttField.addressMode[2] = cudaAddressModeClamp;
+
+			if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1){
+				texRefSosAttField.filterMode = cudaFilterModeLinear;		// Linear interpolation
+			}
+			else{
+				texRefSosAttField.filterMode = cudaFilterModePoint;		// Nearest neighbor
+			}
+//			#ifdef SaftTextureForBresenhamInterpolated
+//				texRefSosAttField.filterMode     = cudaFilterModeLinear;
+//			#else
+//				texRefSosAttField.filterMode     = cudaFilterModePoint;
+//			#endif
+			texRefSosAttField.normalized     = 0;		// Unnormalized texture coordinates
+
+			CUDA_CHECK(cudaBindTextureToArray ( &texRefSosAttField, deviceSosAttFieldCuArray, &texChannelDescSosAttField )); // Step 4.1: bind the 3D array to the texture reference
+		#endif
+
+	#endif
+
+	#ifdef SaftTextureForEmRecSosPathsTables
+		// Emitter tables: a single CUDA array (pair) receives the kernel output.
+		if (deviceListGeometry == 0){
+			#ifdef SaftTextureForEmRecSosPathsTablesFloat1
+				CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmitterPathSosSum, deviceTableVoxelToEmitterPathSosSumCuArray));
+				CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmitterPathCount, deviceTableVoxelToEmitterPathCountCuArray));
+			#endif
+			#ifdef SaftTextureForEmRecSosPathsTablesFloat2
+				CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray));
+			#endif
+			#ifdef SaftTextureForEmRecSosPathsTablesFloat4																// TODO: rename here to reflect the additional Att component
+				CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray));
+			#endif
+		}
+
+		//deviceTableVoxelToReceiverPathCountCuArray[0] = deviceTableVoxelToReceiverPathSosSumCuArrayTest;
+		//cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSumTest, deviceTableVoxelToReceiverPathSosSumCuArrayTest);
+
+		// Receiver tables: bind only as many of the (up to three) CUDA arrays as have been allocated.
+		if (deviceListGeometry == 1){
+			//printf( "#################(int)floor((float)geometryElementCount / (float)maxSoSReceiverArrayForTexture) == %i\n", (int)floor((float)geometryElementCount / (float)maxSoSReceiverArrayForTexture));
+			//printf( "#################TableVoxelToReceiverPathSosAllocationCount                                     == %i\n", TableVoxelToReceiverPathSosAllocationCount);
+
+			if ( TableVoxelToReceiverPathSosAllocationCount > 0){
+
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat1
+					#ifdef debug_CudaPrecalculateKernel
+						// %p instead of %X: the arguments are pointers, which %X would truncate on 64-bit hosts.
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[0](%p) deviceTableVoxelToReceiverPathCountCuArray[0](%p)\n", (void*)deviceTableVoxelToReceiverPathSosSumCuArray[0], (void*)deviceTableVoxelToReceiverPathCountCuArray[0]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum0, deviceTableVoxelToReceiverPathSosSumCuArray[0]));
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount0, deviceTableVoxelToReceiverPathCountCuArray[0]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat2
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[0](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[0]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat4																		// TODO: rename here to reflect the additional Att component
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[0](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[0]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]));
+				#endif
+			}
+			if ( TableVoxelToReceiverPathSosAllocationCount > 1) {
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat1
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[1](%p) deviceTableVoxelToReceiverPathCountCuArray[1](%p)\n", (void*)deviceTableVoxelToReceiverPathSosSumCuArray[1], (void*)deviceTableVoxelToReceiverPathCountCuArray[1]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum1, deviceTableVoxelToReceiverPathSosSumCuArray[1]));
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount1, deviceTableVoxelToReceiverPathCountCuArray[1]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat2
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[1](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[1]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat4
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[1](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[1]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]));
+				#endif
+			}
+			if ( TableVoxelToReceiverPathSosAllocationCount > 2){
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat1
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[2](%p) deviceTableVoxelToReceiverPathCountCuArray[2](%p)\n", (void*)deviceTableVoxelToReceiverPathSosSumCuArray[2], (void*)deviceTableVoxelToReceiverPathCountCuArray[2]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum2, deviceTableVoxelToReceiverPathSosSumCuArray[2]));
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount2, deviceTableVoxelToReceiverPathCountCuArray[2]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat2
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[2](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[2]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]));
+				#endif
+				#ifdef SaftTextureForEmRecSosPathsTablesFloat4
+					#ifdef debug_CudaPrecalculateKernel
+						printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[2](%p)\n", (void*)deviceTableVoxelToRecPathSosBothCuArray[2]);
+					#endif
+					CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]));
+				#endif
+			}
+		}
+	#endif
+
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum0, deviceTableVoxelToReceiverPathSosSumCuArray[0]);
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum1, deviceTableVoxelToReceiverPathSosSumCuArray[1]);
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum2, deviceTableVoxelToReceiverPathSosSumCuArray[2]);
+
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount0, deviceTableVoxelToReceiverPathCountCuArray[0]);
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount1, deviceTableVoxelToReceiverPathCountCuArray[1]);
+//		cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount2, deviceTableVoxelToReceiverPathCountCuArray[2]);
+
+
+    // Launch on the default stream; the first argument depends on whether the SoS field is read from linear memory or from a CUDA array.
+    precalculateAverageSpeedOfSoundKernel <<< blocksPerGrid, threadsPerBlock >>>
+        (
+			#ifndef SaftTextureForBresenhamSosPaths
+        		deviceSpeedOfSoundField,
+			#else
+				#ifdef SaftUseSosAttFloat1		// Use separate textures for the two volumes (Sos+Att)
+        			deviceSpeedOfSoundFieldCuArray,
+				#endif
+				#ifdef SaftUseSosAttFloat2      // Use a single texture for both volumes (Sos+Att)
+					deviceSosAttFieldCuArray,
+				#endif
+			#endif
+            firstZLayer,
+            sosZLayerCount,
+            deviceListGeometry,
+            geometryElementCount,
+            maxSoSReceiverArrayForTexture,		// maximum number of receivers in one CUDA array
+
+            //deviceVoxelCountOutput,
+            deviceVoxelCountOutputFloat,
+            deviceSpeedOfSoundSumOutput,
+//            regionOfInterestOffset,
+            SOSGrid_XYZ,
+            sosOffset,
+            regionOfInterestOffset,
+            IMAGE_RESOLUTION,
+            SOS_RESOLUTION,
+            debugMode,
+            debugModeParameter
+            );
+    // Catches invalid launch configurations; asynchronous execution errors show up at the next synchronizing call.
+    CUDA_CHECK(cudaGetLastError());
+
+	#ifdef SaftTextureForBresenhamSosPaths
+		#ifdef SaftUseSosAttFloat1		// Use separate textures for the two volumes (Sos+Att)
+			CUDA_CHECK(cudaUnbindTexture( &texRefSpeedOfSoundField ));
+		#endif
+
+		#ifdef SaftUseSosAttFloat2      // Use a single texture for both volumes (Sos+Att)
+			CUDA_CHECK(cudaUnbindTexture( &texRefSosAttField ));
+		#endif
+	#endif
+
+
+
+
+	#ifdef debug_OutputFunctions
+		printf( "<== SAFTHandler::precalculateAverageSpeedOfSound - End\n");
+	#endif
+}
--- a/SAFT_ATT/src/kernel/rayTracing.hcu
+++ b/SAFT_ATT/src/kernel/rayTracing.hcu
--- a/SAFT_ATT/src/kernel/saftKernel.hcu
+++ b/SAFT_ATT/src/kernel/saftKernel.hcu
--- a/SAFT_ATT/src/processAScans.cpp
+++ b/SAFT_ATT/src/processAScans.cpp
--- a/SAFT_ATT/src/saft.cpp
+++ b/SAFT_ATT/src/saft.cpp
@@ -0,0 +1,650 @@
+#include <mex.h>
+
+#include <iostream>
+#include <vector>
+
+#include <cstdlib>
+#include <ctime>
+#include <cmath>
+
+//#include <sys/time.h>
+
+//#include <ail/file.hpp>
+//#include <ail/string.hpp>
+//#include <ail/time.hpp>
+
+//#include "configuration.hpp"
+#include "saft.hpp"
+
+
+
+
+/**
+   Constructor of the core reconstruction class.
+
+   Stores every argument in the member of the same name and derives the
+   reciprocal resolutions. No CUDA call is made and no memory is allocated
+   here; the pointers are only copied, not dereferenced.
+ */
+SAFTHandler::SAFTHandler(
+    int deviceId,                       ///< CUDA ID of the device to be used.
+    int deviceIndex,                    ///< Index given by MATLAB (position of the GPU in the device list).
+    float *aScan_ptr,                   ///< Pointer to the A-scan samples.
+    double *output_ptr,                 ///< Pointer to the output data of the image reconstruction.
+    double *Duration_ptr,               ///< Pointer to the value returned to Matlab holding the kernel run time.
+    unsigned short *receiver_index_ptr, ///< Receiver index array.
+    unsigned short *emitter_index_ptr,  ///< Emitter index array.
+    float *receiver_list_ptr,           ///< Receiver list.
+    int receiver_list_Size,
+    float *emitter_list_ptr,            ///< Emitter list.
+    int emitter_list_Size,
+    float *speed_vec_ptr,               ///< Pointer to the SoS data (block mode or SoS grid mode).
+    int3 SOSGrid_XYZ,                   ///< Size of the SoS grid.
+    float3 sosOffset,                   ///< Start point of the SoS grid.
+    float SOS_RESOLUTION,               ///< Resolution of the SoS grid.
+    float *att_vec_ptr,                 ///< Pointer to the attenuation data (grid mode).
+
+    int aScanCount,
+    int aScanLength,
+    int3 IMAGE_SIZE_XYZ,
+    float sampleRate,
+    float3 regionOfInterestOffset,
+    float IMAGE_RESOLUTION,
+    dim3 const & fixedBlockDimensions,  ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
+    int medianWindowSize,               ///< Width of the median filter to be used.
+    float debugMode,
+    float debugModeParameter,
+    bool SOSMode_3DVolume,
+    bool ATTMode_3DVolume,
+
+    int SAFT_MODE,
+    int *SAFT_VARIANT
+    )
+    // Initializer list: each member is initialized from the equally named parameter.
+    : deviceId(deviceId)
+    , deviceIndex(deviceIndex)
+
+    , aScan_ptr(aScan_ptr)
+
+    , output_ptr(output_ptr)
+    , Duration_ptr(Duration_ptr)
+
+    , receiver_index_ptr(receiver_index_ptr)
+    , emitter_index_ptr(emitter_index_ptr)
+    , receiver_list_ptr(receiver_list_ptr)
+    , receiver_list_Size(receiver_list_Size)
+    , emitter_list_ptr(emitter_list_ptr)
+    , emitter_list_Size(emitter_list_Size)
+    , speed_vec_ptr(speed_vec_ptr)
+    , SOSGrid_XYZ(SOSGrid_XYZ)
+    , sosOffset(sosOffset)
+    , SOS_RESOLUTION(SOS_RESOLUTION)
+
+    , att_vec_ptr(att_vec_ptr)
+
+    , aScanCount(aScanCount)
+    , aScanLength(aScanLength)
+    , IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ)
+    , sampleRate(sampleRate)
+    , regionOfInterestOffset(regionOfInterestOffset)
+    , IMAGE_RESOLUTION(IMAGE_RESOLUTION)
+
+    , fixedBlockDimensions(fixedBlockDimensions)
+    , medianWindowSize(medianWindowSize)
+    , debugMode(debugMode)
+    , debugModeParameter(debugModeParameter)
+    , SOSMode_3DVolume(SOSMode_3DVolume)
+    , ATTMode_3DVolume(ATTMode_3DVolume)
+
+    , SAFT_MODE(SAFT_MODE)
+    , SAFT_VARIANT(SAFT_VARIANT)
+{
+    #ifdef debug_OutputFunctions
+        // printf( "==> SAFTHandler::SAFTHandler - Start\n");
+    #endif
+
+    #ifdef debug_OutputInfo
+        // printf( "SAFTHandler Constructor\n");
+    #endif
+
+    // One A-scan allocation is enough; a second one is only needed when streams are used for copying.
+    aScanAllocationCount = 1;
+
+    // Reciprocal resolutions. Inside the constructor body the bare names refer to the (float) parameters.
+    SOS_RESOLUTION_FACTOR   = 1.0f / SOS_RESOLUTION;     // resolution factor of the SoS grid
+    IMAGE_RESOLUTION_FACTOR = 1.0f / IMAGE_RESOLUTION;   // resolution factor of the output volume
+
+    #ifdef debug_OutputVariables
+        // printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
+        // printf( "SOS_RESOLUTION_FACTOR 	= %e\n", SOS_RESOLUTION_FACTOR);
+        // printf( "Samplerate = %e\n", sampleRate);
+    #endif
+
+    #ifdef debug_OutputFunctions
+        // printf( "<== SAFTHandler::SAFTHandler - End\n");
+    #endif
+
+}
+
+/**
+   Top level function of the SAFTHandler class that performs the image reconstruction.
+   - Top Level Funktion der SAFTHandler Klasse die die Bildrekonstruktion durchf<68>hrt.
+*/
+void SAFTHandler::performReconstruction()
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> SAFTHandler::performReconstruction - Start\n");
+	#endif
+
+	#ifdef debug_OutputInfo													// Name des Device mit ID ausgeben
+		// printf( "Device ID: %i\n", deviceId);
+	#endif
+
+	#ifdef debug_OutputFunctions
+	// printf( "==> loadDevices - Start\n");
+	#endif
+	int deviceCount;
+	CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
+
+
+	// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, könnte auch nur einmal ausgelesen werden aber zweitrangig.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+	//DeviceProperties &  outputProb = deviceProperties; // lokalen Zeiger auf Vektor erstellen der auf Klassenvektor zeigt.
+
+	//// printf("1: size(%i) capacity(%i) max_size(%i)\n",  outputProb.size(), outputProb.capacity(), outputProb.max_size());
+
+	//outputProb.reserve(static_cast<std::size_t>(deviceCount));	// Request Vector with size deviceCount
+	deviceProperties.reserve(static_cast<std::size_t>(deviceCount));	// Request Vector with size deviceCount
+
+
+	//cudaDeviceProp & device = outputProb[deviceId];				//
+	cudaDeviceProp & device = deviceProperties[deviceId];			//
+	CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));
+	//// printf("%i. %s\n", deviceId, device.name);
+	//// printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);
+
+	#ifdef debug_OutputInfo
+			// printf("%i. %s\n", deviceId, device.name);
+			// printf("   Byte Total Global Mem:  %lld \n",  device.totalGlobalMem);
+			// printf("   Compute Capability:     %i.%i\n",  device.major,device.minor);
+
+	        // printf("   Name:                          %s\n",  device.name);
+	        // printf("   Major revision number:         %d\n",  device.major);
+	        // printf("   Minor revision number:         %d\n",  device.minor);
+	        // printf("   Total global memory:           %lld\n",  device.totalGlobalMem);
+	        // printf("   Total shared memory per block: %u\n",  device.sharedMemPerBlock);
+	        // printf("   Total registers per block:     %d\n",  device.regsPerBlock);
+	        // printf("   Warp size:                     %d\n",  device.warpSize);
+	        // printf("   Maximum memory pitch:          %lld\n",  device.memPitch);
+	        // printf("   Maximum threads per block:     %d\n",  device.maxThreadsPerBlock);
+	        for (int i = 0; i < 3; ++i) {
+	        	// printf("        Maximum dimension %d of block:  %lld\n", i, device.maxThreadsDim[i]);
+			}
+	        for (int i = 0; i < 3; ++i) {
+	        	// printf("        Maximum dimension %d of grid:   %lld\n", i, device.maxGridSize[i]);
+			}
+	        // printf("   Clock rate:                    %d\n",  device.clockRate);
+	        // printf("   Total constant memory:         %u\n",  device.totalConstMem);
+	        // printf("   Texture alignment:             %u\n",  device.textureAlignment);
+	        // printf("   Concurrent copy and execution: %s\n",  (device.deviceOverlap ? "Yes" : "No"));
+	        // printf("   Number of multiprocessors:     %d\n",  device.multiProcessorCount);
+	        // printf("   Kernel execution timeout:      %s\n\n",  (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
+	#endif
+
+		//outputProb.push_back(device);		// Add element at the end of the vector outputProb
+		deviceProperties.push_back(device);		// Add element at the end of the vector outputProb
+
+		//// printf("2: size(%i) capacity(%i) max_size(%i)\n",  outputProb.size(), outputProb.capacity(), outputProb.max_size());
+
+	#ifdef debug_OutputFunctions
+	// printf( "<== loadDevices - End\n");
+	#endif
+	// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, aber erstmal egal.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+	// siehe http://gpucoder.livejournal.com/1064.html
+
+	//		int devCount;
+	//	    cudaGetDeviceCount(&devCount);
+	//	    // printf("CUDA Device Query...\n");
+	//	    // printf("There are %d CUDA devices.\n", devCount);
+	//
+	//	    // Iterate through devices
+	//	    for (int i = 0; i < devCount; ++i)
+	//	    {
+	//	        // Get device properties
+	//	        // printf("\nCUDA Device #%d\n", i);
+	//	        cudaDeviceProp devProp;
+	//	        cudaGetDeviceProperties(&devProp, i);
+	//	        printDevProp(devProp);
+	//	    }
+	//
+	//	    // printf("\nPress any key to exit...");
+	//	    char c;
+	//	    scanf("%c", &c);
+
+
+	//cudaDeviceProp & device = deviceProperties[deviceId];
+	//CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));		// Eingenschaften des Devices auslesen
+
+	//#ifdef debug_OutputInfo													// Name des Device mit ID ausgeben
+		// printf( "Device used: %18s  (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex);
+	//#endif
+	CUDA_CHECK(cudaSetDevice(deviceId));
+
+	#ifdef debug_OutputInfo													// Reset Device
+		// printf("Reset Device\n");
+	#endif
+	//CUDA_CHECK(cudaDeviceReset());
+
+//	std::string errorMessage = cudaGetErrorString(cudaPeekAtLastError());
+//	std::cout << errorMessage << std::endl;
+
+	//memoryCheck();		// Freier Speicher am Anfang
+
+
+	// Check and set Block and Grid-Dimensions
+	genericSAFTBlockDimensions = fixedBlockDimensions;
+	genericSAFTGridDimensions = dim3(
+		(IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x,	// hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
+		(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y,	// in Blockgroesse geteilt werden kann, muss ein weiterer
+		(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z	// Block berechnet werden. Zu Viele werden im Kernel aussortiert.
+		);
+
+	#if defined(debug_OutputVariables) || defined(debug_OutputZSteps)
+		if (deviceIndex == DebugOutputGPUIdx){
+			// printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
+			// printf( "genericSAFTGridDimensions X,Y,Z  = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
+		}
+	#endif
+
+    //Pointeruebergabe der AScan-Daten Geometrie-Daten und Output-Daten von Matlab
+	#ifdef debug_OutputInfo
+		// printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
+		// printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
+	#endif
+    aScanSamples = (float*)aScan_ptr;
+
+    #ifdef debug_OutputInfo
+    	// printf( "Uebergebene Geometry Pointer fuer Index sowie der Zuordnungs-Tabelle aus Matlab\n");
+	#endif
+
+    emitter_index  = (unsigned short*) emitter_index_ptr;			// Index for associating emitter to corresponding coordinates
+    receiver_index = (unsigned short*) receiver_index_ptr;			// Index for associating receiver to corresponding coordinates
+    emitter_list   = (float3*) emitter_list_ptr;					// Lookuptable for emitter coordinates
+    receiver_list  = (float3*) receiver_list_ptr;					// Lookuptable for receiver coordinates
+
+	#ifdef debug_OutputInfo
+    	// printf( "Uebergebener Pointer output fuer Ausgabe-Daten aus Matlab\n");
+	#endif
+    output = (double *)output_ptr;
+
+
+    speedOfSoundFieldVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
+    speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount * sizeof(float);
+	#ifdef debug_OutputVariables
+		// printf("  speedOfSoundFieldVoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, speedOfSoundFieldVoxelCount);
+		// printf("  speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount(%i) x sizeof(float = 4)] = %i\n", speedOfSoundFieldVoxelCount, speedOfSoundFieldBytes);
+	#endif
+
+	#ifdef debug_OutputInfo
+		// printf( "Uebergebener Pointer speedOfSoundField fuer SoS-Daten aus Matlab\n");
+		// printf( "Uebergebener Pointer SoSData           fuer SoS-Daten aus Matlab\n");
+		// printf( "Uebergebener Pointer attenuationField  fuer ATT-Daten aus Matlab\n");
+	#endif
+    speedOfSoundField = (float*)speed_vec_ptr;	// Fuer SoSGrid-Mode fuer korrekte Schallgeschwindigkeitskorrektur
+    SoSData 		  = (float*)speed_vec_ptr;	// Fuer Blockmode
+    attenuationField  = (float*)att_vec_ptr;    // Fuer SoSGrid-Mode fuer Daempfungskorrektur
+
+
+    // Uebergabe der Outputgroessen aus Matlab.
+    regionOfInterestVoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
+	outputSize = regionOfInterestVoxelCount * sizeof(double);	// Speicherbedarf fuer alle Voxel im Volumen
+
+	#ifdef debug_OutputVariables
+		// printf("  regionOfInterestVoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, regionOfInterestVoxelCount);
+		// printf("  outputSize [%lld x sizeof(double = 8)] = %lld\n", regionOfInterestVoxelCount, outputSize);
+	#endif
+
+	//Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren
+	if (regionOfInterestVoxelCount > (uint64_t)(2^32 / sizeof(double)) ){ // 2^32 / sizeof(double) = 536870912
+			// printf("outputSize > 2^32 !!! => works only in 64-Bit System\n");
+	}
+
+	//Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben
+	//Die selbe Anzahl wird auch fuer die Geometriedaten erwartet
+	#ifdef debug_OutputVariables
+		// printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
+	#endif
+	aScanSize = aScanLength * sizeof(float);
+	batchSize = aScanCount;					// Anzahl der Blockgroesse d.h. wie viele AScans gleichzeitig verarbeitet werden. Batchgroesse ist gleich der Anzahl der uebergebenen Blockgroesse aus Matlab
+	aScanBatchSize = batchSize * aScanSize; // Batchgroesse der AScans (* 3000 * sizeof(float)) in Byte
+	#ifdef debug_OutputVariables
+		// printf( "aScanSize = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScanSize);
+		// printf( "batchSize = aScanCount = %i\n", batchSize);
+		// printf( "aScanBatchSize = batchSize * aScanSize ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatchSize);
+	#endif
+
+//	if(batchSize > aScanCount)		// Abfrage macht keinen Sinn mehr wenn batchSize = aScanCount;
+//	{
+//		mexErrMsgTxt("A-scan window size cannot be larger than the total number of A-scans");
+//		//throw ail::exception("A-scan window size cannot be larger than the total number of A-scans");
+//	}
+
+	#ifdef debug_OutputInfo
+
+    	// printf("\nParameter for Image Reconstruction\n");
+    	// printf(  "========================================================================\n");
+    	//std::cout << "ROI dimensions: " << regionOfInterestResolutionX << " x " <<  regionOfInterestResolutionY << " x " <<  regionOfInterestResolutionZ << std::endl;
+		std::cout << "IMAGE_SIZE_XYZ:                      [" << IMAGE_SIZE_XYZ.x << " x " <<  IMAGE_SIZE_XYZ.y << " x " <<  IMAGE_SIZE_XYZ.z << "]" <<std::endl;
+		std::cout << "Voxel count in Volume:               " << regionOfInterestVoxelCount << std::endl;
+		//std::cout << "Increment vector: (" << regionOfInterestIncrementVector.x << ", " << regionOfInterestIncrementVector.y << ", " << regionOfInterestIncrementVector.z << ")" << std::endl;
+		std::cout << "Increment vector/Resolution:         (" << IMAGE_RESOLUTION << ")" << std::endl;
+
+		std::cout << "IMAGE_STARTPOINT in meters:          " << regionOfInterestOffset.x << " " << regionOfInterestOffset.y << " " << regionOfInterestOffset.z << std::endl;
+
+		regionOfInterestSize.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
+		regionOfInterestSize.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
+		regionOfInterestSize.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
+		std::cout << "ROI size in metres:                  " << regionOfInterestSize.x << " " << regionOfInterestSize.y << " " << regionOfInterestSize.z  << std::endl;
+		std::cout << "Batch size/Blocks(Ascan, R/E-Combi): " << batchSize << std::endl;
+		// printf(  "========================================================================\n\n");
+	#endif
+
+
+
+//	#ifdef debug_OutputPerformance
+//		struct timeval startPerformCoreReconstruction, stopPerformCoreReconstruction;
+//		gettimeofday(&startPerformCoreReconstruction, NULL);
+//	#endif
+
+		//perform processing with AScan-Data
+		//===========================================================================================================
+		ullong duration;
+		processAScans(duration);
+		//===========================================================================================================
+
+
+
+	#ifdef debug_OutputPerformance
+		double numerator = static_cast<double>(aScanCount) * regionOfInterestVoxelCount;	// Performanz [Ascans * GVoxel/s]
+
+		double performance = numerator / duration;
+		//adjust for the change from voxels per millisecond to gigavoxels per second (=> 10^3 * 10^-9 = 10^-6)
+		performance /= 1e9;
+
+		//std::cout << "# Device ("<< (int)deviceId <<"): Duration of main processing: " << (int)duration << " us" << std::endl;
+		//std::cout << "# Device ("<< (int)deviceId <<"): Performance: " << performance << " AScan * GVoxel/s" << std::endl;
+	#endif
+
+	//Duration_ptr[(deviceId+1)] = (double)duration;	// Für jede GPU einen Laufzeitwert in µs übermitteln	// Angabe von ID der GPU abhaengig
+	Duration_ptr[(deviceIndex+1)] = (double)duration;	// Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
+
+	#ifdef debug_OutputVariables
+		//// printf( "Duration_ptr[%i] = duration(%i) = %f\n", (deviceId+1), duration, Duration_ptr[(deviceId+1)]);
+		// printf( "  GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
+	#endif
+
+//	#ifdef debug_OutputVariables
+//		// printf( "Duration_ptr[0] = duration(%i) = %f\n", duration, Duration_ptr[0]);
+//	#endif
+
+// Reset Device
+//	#ifdef debug_OutputInfo
+//		// printf( "Device was used: %s  (%i)\n", deviceProperties[deviceId].name , deviceId);
+//	#endif
+//	CUDA_CHECK(cudaSetDevice(deviceId));
+
+	#ifdef debug_OutputInfo													// Reset Device
+		// printf("Reset Device\n");
+	#endif
+	//CUDA_CHECK(cudaDeviceReset());
+
+	#ifdef debug_OutputFunctions
+		// printf( "<== SAFTHandler::performReconstruction - End\n");
+	#endif
+
+}
+
+/**
+   Adapts a three-dimensional launch configuration to the form used for the SAFT kernel launch.
+   - Grid: if the selected device reports a z grid extent greater than one (maxGridSize[2] > 1),
+     the grid is taken over unchanged. Otherwise x and y are folded into x (x * y) and z is moved
+     into y, which yields a two-dimensional grid.
+   - Block: always flattened into one dimension holding x * y * z threads.
+*/
+void SAFTHandler::reduceKernelDimensions(
+    dim3 const & gridDimensions, ///< Input grid dimensions.
+    dim3 const & blockDimensions, ///< Input block dimensions.
+    dim3 & reducedGridDimensions, ///< Reduced output grid dimensions.
+    dim3 & reducedBlockDimensions ///< Reduced output block dimensions.
+    )
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> SAFTHandler::reduceKernelDimensions - Start\n");
+	#endif
+
+    // Grid: fold to two dimensions only when the device offers no usable third grid dimension.
+    if(deviceProperties[deviceId].maxGridSize[2] <= 1)
+    {
+        unsigned int const foldedXY = gridDimensions.x * gridDimensions.y;
+        reducedGridDimensions = dim3(foldedXY, gridDimensions.z, 1);
+    }
+    else
+    {
+        reducedGridDimensions = gridDimensions;
+    }
+	#ifdef debug_OutputParameter
+		// printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
+	#endif
+
+    // Block: all threads of the block are lined up along x.
+    unsigned int const threadsPerBlock = blockDimensions.x * blockDimensions.y * blockDimensions.z;
+    reducedBlockDimensions = dim3(threadsPerBlock);
+	#ifdef debug_OutputParameter
+		// printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z);
+	#endif
+
+	#ifdef debug_OutputFunctions
+		// printf( "<== SAFTHandler::reduceKernelDimensions - End\n");
+	#endif
+}
+
+/**
+   Integer division that rounds the quotient up instead of down.
+   The divisor must not be zero (it is used directly in '/' and '%').
+   @return dividend / divisor, increased by one whenever the division leaves a remainder.
+*/
+std::size_t ceilingDivision(
+    std::size_t dividend, ///< Dividend of the division.
+    std::size_t divisor ///< Divisor of the division.
+    )
+{
+    std::size_t const quotient = dividend / divisor;
+    std::size_t const remainder = dividend % divisor;
+    return (remainder != 0) ? quotient + 1 : quotient;
+}
+
+/**
+   Converts an offset from one resolution to another.
+   Utility used to deal with the number of z-layers in the speed of sound grid.
+   @return input * lowerResolution / greaterResolution, rounded up to the next integer.
+ */
+std::size_t SAFTHandler::resolutionConversion(
+    std::size_t input, ///< Offset.
+    std::size_t greaterResolution, ///< Greater resolution.
+    std::size_t lowerResolution ///< Lower resolution.
+    )
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> SAFTHandler::resolutionConversion - Start\n");
+		// printf( "<== SAFTHandler::resolutionConversion - End\n");
+	#endif
+
+    // Scale first, then divide rounding up, so a partially covered layer counts as a whole one.
+    std::size_t const scaledInput = input * lowerResolution;
+    return ceilingDivision(scaledInput, greaterResolution);
+}
+
+/**
+   Perform calculations pertaining to the execution of the speed of sound pre-calculations.
+   NOTE: the complete implementation below is commented out, so this function currently does nothing
+   and its parameter is unused. The disabled code is kept for reference only.
+*/
+void SAFTHandler::determineSpeedOfSoundData(
+    std::size_t regionOfInterestZLayers ///< Number of z-layers within the region of interest that are currently being processed. This number is often smaller than the total number of z-layers.
+    )
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> SAFTHandler::determineSpeedOfSoundData - Start\n");
+	#endif
+
+//	//Determine the maximum number of z-layers to be pre-calculated within the speed of sound grid
+//	//std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, regionOfInterestResolutionZ, regionOfInterestGridSizeZ);
+//	std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, IMAGE_SIZE_XYZ.z, regionOfInterestGridSizeZ);
+//
+//    partialSpeedOfSoundVoxelCount = maximumSpeedOfSoundPartialZLayerCount * regionOfInterestGridSizeX * regionOfInterestGridSizeY;
+//
+////	deviceTableVoxelToEmitterPathCountSize   = sosZLayerVoxelCount * emitter_list_Size   * partialSoSZLayerCount * sizeof(VoxelCountType); // Size of the path-count storage * number of z-layers used at the same time, for all emitters
+////	deviceTableVoxelToEmitterPathSosSumSize  = sosZLayerVoxelCount * emitter_list_Size   * partialSoSZLayerCount * sizeof(float);
+////	deviceTableVoxelToReceiverPathCountSize  = sosZLayerVoxelCount * receiver_list_Size  * partialSoSZLayerCount * sizeof(VoxelCountType); // Size of the path-count storage * number of z-layers used at the same time, for all receivers
+////	deviceTableVoxelToReceiverPathSosSumSize = sosZLayerVoxelCount * receiver_list_Size  * partialSoSZLayerCount * sizeof(float);
+//
+//    std::size_t
+//        emitterSpeedOfSoundVoxelCombinations = emitterCount * partialSpeedOfSoundVoxelCount,
+//        receiverSpeedOfSoundVoxelCombinations = receiverCount * partialSpeedOfSoundVoxelCount;
+//
+//    emitterToVoxelPathVoxelDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
+//    emitterToVoxelPathSpeedDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(float);
+//
+//    voxelToReceiverPathVoxelDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
+//    voxelToReceiverPathSpeedDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(float);
+
+	#ifdef debug_OutputFunctions
+		// printf( "<== SAFTHandler::determineSpeedOfSoundData - End\n");
+	#endif
+}
+
+/**
+   Perform initialisations for the partial reconstructions for both the speed of sound pre-calculation and the actual reconstruction.
+   NOTE: the complete implementation below is commented out, so this function currently does nothing.
+   The disabled code is kept for reference only.
+*/
+void SAFTHandler::partialReconstructionInitialisation()
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> SAFTHandler::partialReconstructionInitialisation - Start\n");
+	#endif
+//
+//	if(!partialReconstructionInitialised)
+//    {
+//        std::cout << "Initialising partial reconstruction data" << std::endl;
+//
+//        //zLayerVoxelCount = regionOfInterestResolutionX * regionOfInterestResolutionY;
+//        zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y;			// The number of x-y voxels defines the step to the next layer.
+//
+//        partialOutputVoxelCount = partialOutputSize / sizeof(double);
+////        if(partialOutputVoxelCount % zLayerVoxelCount != 0)								// Safety check is now done in the kernel
+////        	mexErrMsgTxt("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
+//            //throw ail::exception("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
+//        partialOutputZLayerCount = partialOutputVoxelCount / zLayerVoxelCount;
+//
+////        if(partialOutputZLayerCount % genericSAFTBlockDimensions.z != 0)								// Safety check is now done in the kernel
+////        	mexErrMsgTxt("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
+//            //throw ail::exception("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
+//
+//        //Make dynamically sized allocations for the pre-calculated speed of sound data.
+//        //The size depends on the number of z-layers in the output window.
+//        //These particular pre-calculations are no longer performed only once for all voxels.
+//        //Instead, they are performed partially, prior to each launch of the SAFT kernel.
+//        //This lowers the pressure on GPU global memory.
+//
+//        determineSpeedOfSoundData(partialOutputZLayerCount);
+//
+//        // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathVoxelCounts der Groesse:%i\n", emitterToVoxelPathVoxelDataSize);
+//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathVoxelCounts), emitterToVoxelPathVoxelDataSize));
+//        // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathSpeedOfSoundSum der Groesse:%i\n", emitterToVoxelPathSpeedDataSize);
+//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathSpeedOfSoundSum), emitterToVoxelPathSpeedDataSize));
+//
+//        // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathVoxelCounts der Groesse:%i\n", voxelToReceiverPathVoxelDataSize);
+//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathVoxelCounts), voxelToReceiverPathVoxelDataSize));
+//        // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathSpeedOfSoundSum der Groesse:%i\n", voxelToReceiverPathSpeedDataSize);
+//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathSpeedOfSoundSum), voxelToReceiverPathSpeedDataSize));
+//
+//        partialReconstructionInitialised = true;
+//    }
+
+	#ifdef debug_OutputFunctions
+		// printf( "<== SAFTHandler::partialReconstructionInitialisation - End\n");
+	#endif
+}
+
+
+
+
+
+
+/**
+   Queries the free and total memory of the chosen device via cudaMemGetInfo.
+   The output of the values is currently disabled (commented out), so the only remaining effect
+   is that a failing query is reported through CUDA_CHECK.
+*/
+void memoryCheck()
+{
+	#ifdef debug_OutputFunctions
+		// printf( "==> memoryCheck - Start\n");
+	#endif
+
+    std::size_t freeMemory = 0;
+    std::size_t totalMemory = 0;
+    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
+
+	#if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory)
+		//printSize("   Total memory ", totalMemory);
+		//std::cout << " ( " << totalMemory << " )" << std::endl;
+		//printSize("    Free memory ", freeMemory);
+		//std::cout << " ( " << freeMemory << " )" << std::endl;
+
+		//printSize(" => Used memory ", (totalMemory-freeMemory));
+		//std::cout << " ( " << (totalMemory-freeMemory) << " )" << std::endl;
+	#endif
+
+	#ifdef debug_OutputFunctions
+		// printf( "<== memoryCheck - End\n");
+	#endif
+}
+
+
+
+
+
+
+/**
+   Generic CUDA call wrapper.
+   Checks the result of a CUDA operation and, if it is not cudaSuccess, aborts the MEX call via
+   mexErrMsgTxt with a message naming the source file, the line and the CUDA error text.
+   This is used in combination with the CUDA_CHECK macro in saft.hpp.
+*/
+// Not declared inline: the function is meant to be used from all files, and inline does not go together with extern.
+void performCUDAResultCheck(
+    cudaError_t result, ///< Result of the CUDA operation.
+    std::string const & file, ///< Path to the source code file.
+    int line ///< Line within the source code
+    )
+{
+    if(result == cudaSuccess)
+        return;
+
+    // The message is built in a fixed-size stack buffer so that nothing allocated here is left behind
+    // when mexErrMsgTxt hands control back to MATLAB. snprintf truncates over-long text safely.
+    // The historic "-> Error occurred" prefix is kept so existing log searches still match.
+    char errorMessage[1024];
+    snprintf(errorMessage, sizeof(errorMessage),
+             "-> Error occurred: CUDA operation failed in file \"%s\" (line %d): %s",
+             file.c_str(), line, cudaGetErrorString(result));
+    mexErrMsgTxt(errorMessage);
+}
+
--- a/SAFT_ATT/src/saft.cu
+++ b/SAFT_ATT/src/saft.cu
@@ -0,0 +1,15 @@
+#include <iostream>
+
+#include "saft.hpp"
+
+/*!
+	This is the central CUDA file which really just includes the other modules.
+	This is done because CUDA does not support external references for referencing data from other compilation units.
+	- Dies ist das zentrale CUDA-File welches nur die anderen Module einbindet
+	- Das wird gemacht, weil CUDA keine externen Referenzen unterstützt, um Daten von anderen Kompilierungseinheiten zu referenzieren.
+*/
+
+#include "kernel/constantMemory.hcu"
+#include "kernel/rayTracing.hcu"
+#include "kernel/precalculateSpeedOfSoundKernel.hcu"
+#include "kernel/saftKernel.hcu"
--- a/SAFT_ATT/src/saft.hpp
+++ b/SAFT_ATT/src/saft.hpp
@@ -0,0 +1,594 @@
+// 1. Compilieren mit make
+//    -> es wird folgende Datei erstellt: output/saft_sos.mexa64
+// 2. Kopieren in Arbeitsordner
+//    cp /home/kretzek/fser/sandbox/SAFT-GPU/output/saft_sos.mexa64 /home/kretzek/fser/USCT_SW/3DReconstruction/Reconstruction/Reflection/trunk/saft_sos_compute2_debugSoS.mexa64
+
+
+#pragma once
+
+#include <string>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+
+#include <stdint.h>
+#include <stdio.h>      // standard input/output
+#include <vector>       // stl vector header
+
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned long ulong;
+typedef unsigned long long ullong;
+
+//Define Outputs for Debugmode
+//============================
+	//#define debug_OutputFunctions						// Funktionenaufrufe ausgeben
+	//#define debug_OutputVariables						// Werte der Variablen ausgeben
+	//#define debug_OutputParameter						// Uebersicht der Eingabedaten anzeigen sowie Infoblöcke in den einzelnen Schritten
+	//#define debug_OutputMemory						// Speicherverwaltung, Malloc, Free, Groessen
+	//#define debug_OutputMaxMemory						// Gibt aktuellen Speicherverbrauch an, wenn memoryCheck aufgerufen wird
+	//#define debug_OutputInfo							// Gibt Infos zu Schritten, Variablen,... aus
+	//#define debug_OutputPerformance					// Gibt die Laufzeiten und die eizelnen Multi-GPU Performanzwerte von ProcessAscans aus (MemAlloc,PerformCoreReconstruction, Duration, FreeMem)
+	//#define debug_OutputStepsPerformance				// Gibt die Laufzeiten und für die eizelnen Schritte in performCoreReconstruction aus (Copy Ascans, Precalc, PerfCoreReconstruction, copy back)
+	//#define debug_OutputZSteps							// Gibt die Einteilung in Z-Richtung aus
+		#define DebugOutputGPUIdx  0
+	//#define debug_OutputHostStepsPerformance			// Gibt die Laufzeiten für die eizelnen Schritte auf dem HOST aus (Preintegrated Ascans)
+	//#define debug_OutputSAFTHandlerThreadPerformance 	// Gibt die Gesamt-Laufzeiten der einzelnen Multi-GPU Threads aus
+	//#define debug_OutputMultiGpu						// Einteilung des Volumens auf mehrerer GPUs ausgeben
+	//#define debug_OutputStreams						// Gibt die Schritte der Berechnung der Streams aus
+	//#define debug_OutputSOSPaths						// Gibt die Schritte und Werte der SOSPfadberechnung aus
+	//#define debug_OutputSOSStepsParameter				// Einteilung der ZLayer in SOSZlayer
+	//#define debug_OutputLookUpGeometryMemoryList		// Debugausgabe fuer die LookUpGeometryMemoryList (Constant Memory)
+
+	//#define OutputVolume						// Ausgabe des  Volumens
+
+
+// Debugging CUDA Kernels
+//================================================
+	//#define debug_CudaSAFTKernel
+	//#define debug_CudaSAFTKernel_Median
+	//#define debug_CudaPrecalculateKernel
+	//#define debug_CudaRayTraceKernel
+	//#define debug_CudaRayTraceKernelLive
+
+	//#define DebugSetMemoryToZero  // Set SOSPathMemory to Zero as Initialisation
+
+// Define specific Hardware-Versions
+	#define GTX_590
+	//#define GTX_690
+	//#define GTX_TITAN
+
+	#if defined(GTX_590)
+		#define GTX_Fermi
+	#endif
+	#if defined(GTX_690) || defined(GTX_TITAN)
+		#define GTX_Kepler
+	#endif
+
+// Speichermanagement der GPU sowie Errordetektion
+//================================================
+	//#define SaftNoTexture
+		//#define SaftCorrectSumOneAscan		// 9.7-9.9 GVA/s        // Skip wrong Numbers
+		#define SaftCorrectSumAllAscan	    // 8.2 GVA/s		    // Recalculation if too high numbers are calculated
+
+	#define SaftEmitterCache				// Caching for Emitter Coordinates and Distance
+	//#define SaftEmitterCacheTernery		// Caching for Emitter Coordinates and Distance
+
+
+// SAFT- SOS Implementierungen
+//================================================
+	//#define SaftSoSNoCache
+	//#define SaftSoSEmitterCache
+	//#define SaftSoSCombineTasCache		// noch nicht implementiert
+	//#define SaftSoSCombineInSoSVoxelCache
+	#define SaftSoSWithPrecalculateSoSZLayer
+
+
+#define SaftMedian
+	#define BRANCHLESS_MEDIAN  // Ohne kommts zum Absturz!
+	//#define SaftMedian_withMean3		// Mean of 3 Values
+	//#define SaftMedian_withMean5		// Mean of 5 Values
+	//#define SaftMedian_CalcOnlyMean	// Mean of all buffered Values in Window
+
+	#define maxMedianWindowSize 96
+	#ifndef FLT_MAX //is not defined in cuda kernel?
+	#define FLT_MAX 0x1.fffffep127f
+	#endif
+
+
+
+
+// Integration der A-scans im Vornherein durchfuehren um Samplebreite an zu rekonstruierende Aufloesung anzupassen
+
+	#define preAscanIntegrationToMatchSamplerateToResolution  // Integration der Ascans ueber Fensterbreite durchfuehren
+		//#define debug_preAscanIntegration
+			#define DebugSammleMin  2990
+			#define DebugSammleMax  3000
+		//#define preAscanIntegrationVersion1Michael	// direkt übernommene Version von Michael
+		#define preAscanIntegrationVersion2Ernst	// korrigierte Variante mit genauerer Fensterbreite
+
+
+
+
+// Parameter fuer SAFT-Kernel
+	#define SaftLinearInterpolation  // Lineare Interpolation beim Zugriff auf A-scans durchführen
+
+	#define SaftUseConstantMemforGeometry				// Geometriedaten im Constantmemory nutzen
+	//#define SaftTextureForERIndexBlock 				// Texturmemory für das Laden der Emitter und Receiver Indexe fuer entsprechenden AScan nutzen
+
+	#define debug_CudaSAFTKernelModes			// Use variable debugMode for different calulations methods and output
+		//#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation	// Fuer Fehlerberchnungen
+
+	//#define SaftTextureForEmRecSosPathsTablesFloat1	// Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
+	//#define SaftTextureForEmRecSosPathsTablesFloat2		// Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
+	#define SaftTextureForEmRecSosPathsTablesFloat4		// Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position
+
+	#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
+		#define SaftTextureForEmRecSosPathsTables		// Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
+	#endif
+
+	// Several SAFT_VARIANTs
+	#define SAFT_VARIANT_AscanPreintegration					0
+	#define SAFT_VARIANT_AscanInterpolation						1
+	#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing	2	// Use interpolation while Preprocessing
+	#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction	3	// Use interpolation while Reconstruction
+	#define SAFT_VARIANT_CalcStandardDeviation					4
+	#define SAFT_VARIANT_SumUpOverBoarderIndices				5
+
+
+// Cache <-> shared Memory
+    //#define SaftPreferSharedMem 	// cudaFuncCachePreferShared: shared memory is 48 KB
+    #define SaftPreferL1SharedMem 	// cudaFuncCachePreferL1: shared memory is 16
+	//#define SaftPreferNone			// cudaFuncCachePreferNone: no preference
+
+// Receiver Cache mit shared Memory (nur bei kleinen Blockgroeßen)
+    //#define SaftReceiverSharedMemCacheReceiverDistance
+	//#define SaftCacheReceiverSOS
+		//#define SaftReceiverSharedMemCacheReceiverSOS	// Use Shared Memory for Caching
+		//#define SaftRegisterCacheReceiverSOS			// Use Register for Caching
+
+// Berechnung der mittleren Schallgeschwindigkeit
+//================================================
+	//#define SaftUseArithmeticMean 	// arithmetic Mean
+	#define SaftUseHarmonicMean       	// harmonic Mean	//das Richtige!!
+
+
+	//#define SaftCalcSoSInKernel				 // Bresenham wird noch mal speziell bei jedem Voxel und Pfad durchgerechnet!
+			// ! SOS_Version2 rausnehmen sonst gehts nicht!
+
+	#define SaftTextureForBresenhamSosPaths	 // Texturmemory für SOS-Volumen nutzen
+			//#define SaftTextureForBresenhamInterpolated	//iSOS-Version --> wird nun ueber Parameter uebergeben
+			//#define SaftUseFastMath				//FastMath fuer schnellere Berechnung aber Fehler am Rand. Dafuer ist Korrektur noetig.
+
+	//#define SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att) // Aktuell nicht implementiert
+	#define SaftUseSosAttFloat2		// Nutze nur eine Textur fuer beide Volumen (Sos+Att)
+
+	#define SOS_Version2	// korrekte Version mit Definitionen im Mittelpunkt
+	//#define SOS_Version3	// Mit extra Angabe der Endpkte
+
+
+// MultiGPU
+//================================================
+//	#define debug_SetNumGPU		// Anzahl der GPUs festlegen
+//	//#undef debug_SetNumGPU
+//
+//	#ifdef debug_SetNumGPU
+//		#define NUM_GPUS 1
+//		#define NUM_DEVICEGPU	1	// Um diese Anzahl verschiebt sich alles also zB bei +1
+//	#endif
+
+const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340;
+
+	#define Distanz_Standard //172 MV/s                             //14,5 GVA/s
+    //#define Distanz_Heron2
+    //#define Distanz_Memory 100 //Mit 100-Werte LUT-Memory          //11,53 GVA/s      //Diff [0 .. 0.0828] sehr schlecht!
+    //#define Distanz_Memory 1000 //Mit 1000-Werte LUT-Memory        //12,6 GVA/s      //Diff [0 .. 0.0096]
+    //#define Distanz_Memory 1000_Heron                              //281 MV/s      //Diff [0 .. 2.3176e-004]
+	//#define Distanz_Memory 10000 //Mit 10000-Werte LUT-Memory        //11,58 GVA/s      //Diff [0 .. 9.6333e-004]
+    //#define Distanz_Memory 100000 //Mit 10000-Werte LUT-Memory     //375 MV/s
+		//#define Use_Distanz_SharedMemory
+
+//Macro used to perform CUDA calls. Forwards the call's result, together with the file and line of the call site, to performCUDAResultCheck, which reports a failure via mexErrMsgTxt.
+//NOTE: the expansion already ends with ';', so writing "CUDA_CHECK(x);" yields an additional empty statement - take care when using it as the unbraced body of an if/else.
+#define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__);
+
+//Macro used to see when a particular line of code is executed on the host.
+#define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl
+
+
+//Convenient typedefs for containers
+typedef std::vector<cudaDeviceProp> DeviceProperties;
+typedef std::vector<dim3> Dimensions;
+
+
+/**
+   Most important class in the application.
+   - Haupt-Klasse der Applikation
+   It is responsible for all of the image reconstruction.
+   - Sie ist verantwortlich fuer alle BildRekonstruktionen
+*/
+class SAFTHandler
+{
+public:
+    SAFTHandler(int deviceId,
+    			int deviceIndex,
+    			float *aScan_ptr,    				///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
+    			double *output_ptr,  				///< Zeiger zu den Outputdaten //std::string const & outputPath,
+    			double *Duration_ptr,  				///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
+    			unsigned short *receiver_index_ptr, ///<
+    			unsigned short *emitter_index_ptr,  ///<
+    			float *receiver_list_ptr, 			///<
+    			int receiver_list_Size, 			///<
+    			float *emitter_list_ptr,			///<
+    			int emitter_list_Size, 				///<
+    			float *speed_vec_ptr,
+    			int3 SOSGrid_XYZ,
+    			float3 sosOffset, 					///< Startpoint of SoSGrid
+    			float SOS_RESOLUTION,				///< Aufloesung des SoSGrid
+    			float *att_vec_ptr,    				//att_vec_ptr
+
+    			int aScanCount,
+    			int aScanLength,
+    			int3 IMAGE_SIZE_XYZ,
+    		    float sampleRate,
+    			float3 regionOfInterestOffset,
+    			float IMAGE_RESOLUTION,
+    			dim3 const & fixedBlockDimensions,
+
+    			int medianWindowSize, 				///< define width of used median filter
+
+    			float debugMode,
+    			float debugModeParameter,
+    			//bool useFixedPartialOutputWindow,
+
+    			bool SOSMode_3DVolume,
+    			bool ATTMode_3DVolume,
+
+    			int SAFT_MODE,
+    			int *SAFT_VARIANT
+    			);
+
+    void performReconstruction();
+
+private:
+    bool SOSMode_3DVolume,
+    	 ATTMode_3DVolume;
+
+	int SAFT_MODE;
+	int *SAFT_VARIANT;
+	int *deviceSAFT_VARIANT;
+
+	#ifdef Distanz_Memory
+		float *deviceWurzelApprox;
+	#endif
+
+    int deviceId;
+    int deviceIndex;
+    float  debugMode;
+    float  debugModeParameter;
+
+    DeviceProperties deviceProperties;
+    float
+    	*aScan_ptr;
+
+//    float
+//    	*rec_vec_ptr,
+//    	*send_vec_ptr;
+
+    unsigned short
+    	*emitter_index_ptr,
+    	*receiver_index_ptr;
+
+    float
+    	*emitter_list_ptr,
+    	*receiver_list_ptr;
+
+    int
+    	receiver_list_Size,
+    	emitter_list_Size;
+
+    double
+       	*output_ptr;
+
+    double
+    	*Duration_ptr;
+
+    float
+    	Sos,
+    	*speed_vec_ptr,
+    	*att_vec_ptr;
+
+    int3
+    	SOSGrid_XYZ;				///< presumably the SoS grid extent per axis - TODO confirm
+
+	float3
+		sosOffset; 					///< Start point of the SoS grid
+
+    int
+    	aScanCount,					///< presumably the number of A-scans - TODO confirm
+    	aScanLength;				///< presumably the number of samples per A-scan - TODO confirm
+
+    int3
+    	IMAGE_SIZE_XYZ;				///< presumably the output volume extent per axis, in voxels - TODO confirm
+
+    float3 regionOfInterestSize; // ROI size in metres
+
+    float3
+    	regionOfInterestOffset; // imageStartpoint; TODO: rename!
+
+    float
+    	IMAGE_RESOLUTION,		  ///< Resolution of the output volume
+	    IMAGE_RESOLUTION_FACTOR,  ///< 1/resolution of the output volume
+	    SOS_RESOLUTION,			  ///< Resolution of the SoS grid
+	    SOS_RESOLUTION_FACTOR;	  ///< 1/resolution of the SoS grid
+
+    std::string
+        emitterGeometryPath,	///< NOTE(review): path strings; no use is visible in this header - confirm they are still needed
+        receiverGeometryPath,	///< see emitterGeometryPath
+        aScanSamplesPath,		///< presumably a file path for A-scan samples - TODO confirm
+        outputPath;				///< presumably a file path for the result - TODO confirm
+
+//    bool
+//        printPerformanceAnalysis,
+//        printSortedAutoTuningResults;
+
+
+    float *aScanSamples;		///< presumably the host-side A-scan sample buffer - TODO confirm relation to aScan_ptr
+    double *output;				///< presumably the host-side output buffer - TODO confirm relation to output_ptr
+    //int aScanCount;
+    int
+    	aScanSize,				///< NOTE(review): unit (samples vs. bytes) not visible here - confirm
+    	batchSize,				///< presumably the number of A-scans handled per batch - TODO confirm
+    	aScanBatchSize;			///< NOTE(review): difference to batchSize not visible here - confirm
+
+    float voxelSize;			///< Unit not visible here - TODO confirm
+
+    float sampleRate;			///< Same name and type as a parameter of the declaration above; unit not visible here
+
+    //size_t
+    uint64_t
+    	regionOfInterestVoxelCount,	///< 64-bit count; presumably all voxels of the region of interest - TODO confirm
+        outputSize;					///< 64-bit size; unit (voxels vs. bytes) not visible here - confirm
+
+    uint64_t
+    	partialOutputZLayerOffset;	///< presumably the offset of the current partial output window along Z - TODO confirm unit
+
+    int
+    	partialOutputZLayerOffsetCount,	///< NOTE(review): meaning not derivable from this header - confirm
+    	partialOutputSoSZLayerCount,	///< presumably the number of SoS Z layers per partial output - TODO confirm
+    	currentZLayerCount,				///< presumably the number of Z layers in the current pass - TODO confirm
+    	partialSoSZLayerCount;			///< NOTE(review): difference to partialOutputSoSZLayerCount not visible here - confirm
+
+    double *currentHostOutputAdress;	///< presumably the host address the current partial output is written to - TODO confirm
+
+    // Pointers to the input data of an A-scan block in memory
+    float3
+        *receiver_list,			// Lookup table receiverNr -> coordinates
+        *emitter_list;				// Lookup table emitterNr -> coordinates
+
+    unsigned short
+    	*receiver_index,			// Input A-scan block data: corresponding receiverNr
+    	*emitter_index;			// Input A-scan block data: corresponding emitterNr
+
+    float
+    	*SoSData;					// Input A-scan block data: corresponding SoS value
+
+    float *speedOfSoundField;		// Input A-scan block data: corresponding SoS value as volume TODO: rename to speedOfSoundGrid
+    float *attenuationField;		// Input A-scan block data: corresponding ATT value as volume TODO: rename to attenuationGrid
+
+	#ifdef SaftUseSosAttFloat2
+    	float2 *hostSosAttField;	// Only compiled with SaftUseSosAttFloat2; presumably SoS and ATT packed per element - TODO confirm
+	#endif
+
+    // Memory sizes
+    //std::size_t
+    int
+    	speedOfSoundFieldVoxelCount,				// NOTE(review): declared int although the commented-out line above suggests std::size_t - check for overflow on large grids
+    	speedOfSoundFieldBytes,						// presumably the byte size of the SoS field - TODO confirm
+    	speedOfSoundEmitterVoxelPathCountByteSize,	// Memory size for the number of voxels lying on a path
+    	speedOfSoundEmitterVoxelPathSumByteSize;	// Memory size for the sum of the speeds of sound along the path to a voxel
+
+    dim3
+        fixedBlockDimensions,			// can probably be replaced by genericSAFTBlockDimensions
+        genericSAFTBlockDimensions,		// presumably the block dimensions used for the SAFT kernel launch - TODO confirm
+        genericSAFTGridDimensions,		// presumably the matching grid dimensions - TODO confirm
+        windowGridDimensions;			// NOTE(review): meaning of "window" not visible here - confirm
+
+    int medianWindowSize; 	// defines the width of the median filter used
+
+
+#ifdef SaftNoTexture
+    float ** deviceAScans;				// Without textures: array of float pointers; presumably one device buffer per A-scan allocation - TODO confirm
+#else
+	cudaArray **deviceAScansCuArray;	// With textures: array of cudaArray pointers holding the A-scans
+
+#endif
+
+#ifdef SaftTextureForBresenhamSosPaths
+
+	#ifdef SaftUseSosAttFloat1		// Use separate textures for the two volumes (SoS + ATT)
+		cudaArray *deviceSpeedOfSoundFieldCuArray;			// SOS volume
+		cudaArray *deviceAttenuationFieldCuArray;			// ATT volume
+	#endif
+
+	#ifdef SaftUseSosAttFloat2      // Use a single texture for both volumes (SoS + ATT)
+		cudaArray *deviceSosAttFieldCuArray;				// Combined SoS + ATT volume
+	#endif
+#endif
+
+
+
+	int maxSoSReceiverArrayForTexture;					// NOTE(review): presumably an upper bound on receiver arrays bound as textures - confirm
+	int TableVoxelToReceiverPathSosAllocationCount;		// presumably the number of receiver path-table allocations - TODO confirm
+	std::size_t receiver_list_Size_deviceMemory;		// NOTE(review): unit (entries vs. bytes) not visible here - confirm
+
+#ifdef SaftTextureForEmRecSosPathsTables
+	// For the emitter ----- declared as plain pointers
+	cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray;			//SoSSum
+	//cudaPitchedPtr pitchedTableVoxelToEmitterPathSosSumDevPtr;
+
+	cudaArray *deviceTableVoxelToEmitterPathCountCuArray;			//Count
+	//cudaPitchedPtr pitchedTableVoxelToEmitterPathCountDevPtr;
+
+	// For the receivers ----- declared as arrays
+	cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray;		//SoSSum
+	//cudaPitchedPtr * pitchedTableVoxelToReceiverPathSosSumDevPtr;
+
+	cudaArray **deviceTableVoxelToReceiverPathCountCuArray;		//Count
+	//cudaPitchedPtr * pitchedTableVoxelToReceiverPathCountDevPtr;
+#endif
+
+#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
+	cudaArray *deviceTableVoxelToEmPathSosBothCuArray;  			//Emitter SoSSum + Count
+	cudaArray **deviceTableVoxelToRecPathSosBothCuArray;  			//Receiver SoSSum + Count
+#endif
+
+#ifdef SaftTextureForERIndexBlock
+	cudaArray * deviceEmIndexBlockCuArray;		// Only compiled with SaftTextureForERIndexBlock; presumably the emitter index block as texture source - TODO confirm
+	cudaArray * deviceRecIndexBlockCuArray;		// Receiver counterpart - TODO confirm
+#endif
+
+	// Speed-of-sound correction mode
+    float *deviceSpeedOfSoundField;			// Address of the GPU memory for the speed-of-sound grid
+
+    // Block-Mode
+    unsigned short *deviceEmitterIndex_block;		// Addresses of the GPU memory for the indices into the geometry data
+    unsigned short *deviceReceiverIndex_block;
+    float3 *deviceListEmitterGeometry;			// Addresses of the GPU memory for the mapping index <-> geometry data
+    float3 *deviceListReceiverGeometry;
+
+    float  *deviceSoSData_block;				// Address of the GPU memory for the speed-of-sound data
+
+//    VoxelCountType								// Addresses of the GPU memory for the SoS paths
+//        * deviceTableVoxelToEmitterPathCount,
+//        * deviceTableVoxelToReceiverPathCount;
+    float
+    	*deviceTableVoxelToEmitterPathCountFloat,	// Float replacement for the commented-out VoxelCountType tables above
+    	*deviceTableVoxelToReceiverPathCountFloat,
+        *deviceTableVoxelToEmitterPathSosSum,		// presumably the summed speed of sound per voxel path - TODO confirm
+        *deviceTableVoxelToReceiverPathSosSum;
+
+
+    bool *deviceValidEmitterReceiverCombinations;	// NOTE(review): presumably one flag per emitter/receiver pair - confirm layout
+
+    int *deviceTransducerVectorAnalysisDistributionCounters;	// NOTE(review): the related analysis functions are commented out in this header - confirm this is still used
+
+//    float3
+//        * deviceEmitterGeometry,
+//        * deviceReceiverGeometry;
+
+    int   usedAmountOfEmitter, 	// number of emitters used
+    	  usedAmountOfReceiver; // number of receivers used
+
+    // Output volume
+	double *deviceOutput;
+
+    //Streams used for synchronisation
+    cudaStream_t
+        copyStream,				// presumably used for memory transfers - TODO confirm
+        calculationStream;		// presumably used for kernel launches - TODO confirm
+
+    //This variable describes the number of allocations used by the current SAFT mode
+    std::size_t aScanAllocationCount;
+
+    int
+        invalidEmitterReceiverCombinationsCount,	// NOTE(review): where these counters are filled is not visible here - confirm
+        validEmitterReceiverCombinationsCount;
+
+    Dimensions validBlockDimensions;	// Project-declared type; presumably the block sizes accepted by determineValidBlockDimensions() - TODO confirm
+    bool useAutoTuning;					// NOTE(review): the auto-tuning configuration member below is commented out - confirm this flag is still honoured
+//    AutoTuningConfiguration autoTuningConfiguration;
+
+    size_t
+    	partialOutputSize,				// NOTE(review): unit (voxels vs. bytes) not visible here - confirm
+    	partialVolumeSize,				// Memory (output volume) that would be needed for the corresponding number of Z layers
+    	partialSosPathSize,				// Memory (output volume) that would be needed for the corresponding number of SoS Z layers
+    	maxFeasibleZLayerCount,			// Maximum feasible number of Z layers; initially set to the number that fits into one SoS Z layer.
+    	maxFeasibleSosZLayerCount;		// Maximum feasible number of SoS Z layers; initially set to the number of SoS Z layers required for the output data.
+
+    int
+        minimumAutoTuningThreadCount,	// presumably the lower thread-count bound tried by auto-tuning - TODO confirm
+        maximumAutoTuningThreadCount;	// presumably the upper thread-count bound tried by auto-tuning - TODO confirm
+
+
+    //New partial reconstruction data
+
+    std::size_t partialSpeedOfSoundVoxelCount;	// presumably the SoS voxels covered by one partial reconstruction - TODO confirm
+    std::size_t partialOutputZLayerCount;		// presumably the output Z layers per partial reconstruction - TODO confirm
+    std::size_t zLayerVoxelCount;				// presumably the X-Y voxels in one output Z layer, by analogy with sosZLayerVoxelCount - TODO confirm
+    std::size_t sosZLayerVoxelCount;           // Number of X-Y SoS voxels in one SoS layer. //saft.hpp
+    std::size_t partialOutputVoxelCount;		// presumably the output voxels per partial reconstruction - TODO confirm
+
+    std::size_t
+		//deviceTableVoxelToEmitterPathCountSize,
+		deviceTableVoxelToEmitterPathCountFloatSize,	// Sizes of the identically named device tables; unit (elements vs. bytes) not visible here - confirm
+		deviceTableVoxelToEmitterPathSosSumSize,
+		//deviceTableVoxelToReceiverPathCountSize,
+		deviceTableVoxelToReceiverPathCountFloatSize,
+		deviceTableVoxelToReceiverPathSosSumSize;
+
+	double diff_time;	// For time measurement
+	float transferRate; // For data transfer rate measurement
+	float performRate;  // For SAFT processing rate measurement
+	cudaDeviceProp deviceProp;		// Used to output the frequency
+
+
+    //Core reconstruction
+
+    void processAScans(ullong & duration);	// duration is a non-const reference, so presumably an out-parameter for the elapsed time - TODO confirm unit
+    void performCoreReconstruction();		// Declaration only; presumably the inner reconstruction step - TODO confirm against the definition
+
+    //Pre-calculation
+
+    void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount);	// Two-int overload; presumably covers zLayerCount layers starting at zLayer - TODO confirm
+//    void analysisOfTransducerVectors();
+
+//    void normalisePerformanceStatisticsOutput();
+//    void printTransducerVectorStatistics();
+
+    //Auto-tuning
+    bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions);	// gridDimensions is a non-const reference, presumably the result; meaning of the bool is not visible here - confirm
+    void determineValidBlockDimensions();	// presumably fills validBlockDimensions - TODO confirm
+
+
+    void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions);	// The two non-const references are presumably the outputs - TODO confirm
+
+    //Pre-calculation kernels
+	#ifdef SaftUseConstantMemforGeometry
+		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
+		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);	// Constant-memory variant: deviceGeometry is an int here (presumably a selector, not a pointer - confirm) and a float count output is added
+	#else
+		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput);
+		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput);	// Pointer variant: geometry passed as float3 const *, no separate count output
+	#endif
+
+
+    //    void analyseTransducerVectors(dim3 gridDimensions, dim3 blockDimensions);
+
+    //SAFT kernels
+    //void performInterpolation(float * deviceAScans, float * deviceOutput, dim3 gridDimensions, dim3 blockDimensions, cudaStream_t stream);
+    //void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float const * deviceAScans); //, cudaStream_t stream);
+#ifdef SaftNoTexture
+	void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, float * deviceAScans 		    );   //A-scans in device memory
+#else
+	void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray);   //A-scans in a cudaArray for texture memory; this variant also takes maxFeasibleSosZLayerCount
+	//void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceSpeedOfSoundFieldCuArray, cudaArray * deviceAScansCuArray);   //A-scans in a cudaArray for texture memory
+#endif
+
+    //Utility functions
+    bool setGenericDimensions();	// presumably sets genericSAFTBlockDimensions / genericSAFTGridDimensions; meaning of the bool not visible here - confirm
+    std::size_t resolutionConversion(std::size_t input, std::size_t greaterResolution, std::size_t lowerResolution);	// presumably maps a value between two grid resolutions - TODO confirm rounding
+    void partialReconstructionInitialisation();	// Declaration only; presumably prepares the partial-reconstruction members - TODO confirm
+    std::size_t getCurrentZLayerCount(std::size_t zOffset);	// presumably the number of Z layers to process starting at zOffset - TODO confirm
+    void getCurrentSpeedOfSoundVariables(std::size_t zOffset, std::size_t currentZLayerCount, std::size_t & currentSpeedOfSoundZLayer, std::size_t & currentSpeedOfSoundPartialZLayerCount);	// The two non-const references are presumably the outputs - TODO confirm
+    void determineSpeedOfSoundData(std::size_t regionOfInterestZLayers);	// Declaration only; effect not visible here
+};
+
+//std::string vectorToString(float3 const & vector);
+//std::string voxelToString(dim3 const & voxel);
+extern void memoryCheck();	// Defined in another translation unit; presumably reports memory usage - TODO confirm
+
+extern void performCUDAResultCheck(cudaError_t result, std::string const & file, int line);	// Defined in another translation unit; takes a CUDA status plus a source location, presumably to report failures - TODO confirm what happens on error
+
+