// URDepends/SAFT_ATT/src/saft.hpp

// 1. Build with make
//    -> this produces the file output/saft_sos.mexa64
// 2. Copy it into the working directory
//    cp /home/kretzek/fser/sandbox/SAFT-GPU/output/saft_sos.mexa64 /home/kretzek/fser/USCT_SW/3DReconstruction/Reconstruction/Reflection/trunk/saft_sos_compute2_debugSoS.mexa64


#pragma once

#include <stdint.h>
#include <stdio.h>      // standard input/output

#include <iostream>     // std::cout / std::endl used by the DEBUG_MARK macro
#include <string>
#include <vector>       // stl vector header

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>


// Shorthand aliases for the built-in unsigned integer types.
// ullong is the type of the duration argument of SAFTHandler::processAScans.
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;
typedef unsigned long long ullong;

//Define Outputs for Debugmode
//============================
// Each switch below is enabled by removing the leading "//" in front of its #define.
	//#define debug_OutputFunctions						// Print function calls
	//#define debug_OutputVariables						// Print the values of variables
	//#define debug_OutputParameter						// Show an overview of the input data as well as info blocks in the individual steps
	//#define debug_OutputMemory						// Memory management: malloc, free, sizes
	//#define debug_OutputMaxMemory						// Reports the current memory usage whenever memoryCheck is called
	//#define debug_OutputInfo							// Prints information about steps, variables, ...
	//#define debug_OutputPerformance					// Prints the run times and the individual multi-GPU performance values of ProcessAscans (MemAlloc, PerformCoreReconstruction, Duration, FreeMem)
	//#define debug_OutputStepsPerformance				// Prints the run times of the individual steps in performCoreReconstruction (Copy Ascans, Precalc, PerfCoreReconstruction, copy back)
	//#define debug_OutputZSteps							// Prints the partitioning in Z direction
		// NOTE(review): presumably the index of the GPU whose debug output is printed - confirm against its uses.
		#define DebugOutputGPUIdx  0
	//#define debug_OutputHostStepsPerformance			// Prints the run times of the individual steps on the HOST (preintegrated A-scans)
	//#define debug_OutputSAFTHandlerThreadPerformance 	// Prints the total run times of the individual multi-GPU threads
	//#define debug_OutputMultiGpu						// Print how the volume is distributed across several GPUs
	//#define debug_OutputStreams						// Prints the steps of the stream computation
	//#define debug_OutputSOSPaths						// Prints the steps and values of the SOS path calculation
	//#define debug_OutputSOSStepsParameter				// Partitioning of the Z layers into SOS Z layers
	//#define debug_OutputLookUpGeometryMemoryList		// Debug output for the LookUpGeometryMemoryList (constant memory)

	//#define OutputVolume						// Output of the volume


// Debugging CUDA Kernels
//================================================
	//#define debug_CudaSAFTKernel
	//#define debug_CudaSAFTKernel_Median
	//#define debug_CudaPrecalculateKernel
	//#define debug_CudaRayTraceKernel
	//#define debug_CudaRayTraceKernelLive

	//#define DebugSetMemoryToZero  // Set SOSPathMemory to Zero as Initialisation

// Define specific Hardware-Versions
// Exactly the uncommented GTX_* macro selects the target card; the architecture
// macro (GTX_Fermi or GTX_Kepler) is derived from it below.
	#define GTX_590
	//#define GTX_690
	//#define GTX_TITAN

	// GTX_590 selects GTX_Fermi.
	#if defined(GTX_590)
		#define GTX_Fermi
	#endif
	// GTX_690 or GTX_TITAN selects GTX_Kepler.
	#if defined(GTX_690) || defined(GTX_TITAN)
		#define GTX_Kepler
	#endif

// GPU memory management and error detection
//================================================
	//#define SaftNoTexture
		//#define SaftCorrectSumOneAscan		// 9.7-9.9 GVA/s        // Skip wrong Numbers
		#define SaftCorrectSumAllAscan	    // 8.2 GVA/s		    // Recalculation if too high numbers are calculated

	#define SaftEmitterCache				// Caching for Emitter Coordinates and Distance
	//#define SaftEmitterCacheTernery		// Caching for Emitter Coordinates and Distance


// SAFT-SOS implementations
//================================================
	//#define SaftSoSNoCache
	//#define SaftSoSEmitterCache
	//#define SaftSoSCombineTasCache		// not implemented yet
	//#define SaftSoSCombineInSoSVoxelCache
	#define SaftSoSWithPrecalculateSoSZLayer


#define SaftMedian
	#define BRANCHLESS_MEDIAN  // Without this define the code crashes!
	//#define SaftMedian_withMean3		// Mean of 3 Values
	//#define SaftMedian_withMean5		// Mean of 5 Values
	//#define SaftMedian_CalcOnlyMean	// Mean of all buffered Values in Window

	#define maxMedianWindowSize 96

	// Fallback for translation units in which FLT_MAX is not visible
	// (original note: "is not defined in cuda kernel?").
	// Written as a decimal literal: hexadecimal floating literals are only
	// standard C++ from C++17 on and are rejected in strict earlier modes.
	// The value rounds to the largest finite float, identical to 0x1.fffffep127f.
	#ifndef FLT_MAX
	#define FLT_MAX 3.402823466e+38F
	#endif


// Integrate the A-scans up front so that the sample width matches the resolution to be reconstructed

	#define preAscanIntegrationToMatchSamplerateToResolution  // Integrate the A-scans over the window width
		//#define debug_preAscanIntegration
			#define DebugSammleMin  2990
			#define DebugSammleMax  3000
		//#define preAscanIntegrationVersion1Michael	// version taken over directly from Michael
		#define preAscanIntegrationVersion2Ernst	// corrected variant with a more precise window width


// Parameters for the SAFT kernel
	#define SaftLinearInterpolation  // Use linear interpolation when accessing the A-scans

	#define SaftUseConstantMemforGeometry				// Keep the geometry data in constant memory
	//#define SaftTextureForERIndexBlock 				// Use texture memory to load the emitter and receiver indices of the corresponding A-scan

	#define debug_CudaSAFTKernelModes			// Use variable debugMode for different calulations methods and output
		//#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation	// For error calculations

	//#define SaftTextureForEmRecSosPathsTablesFloat1	// Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
	//#define SaftTextureForEmRecSosPathsTablesFloat2		// Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
	#define SaftTextureForEmRecSosPathsTablesFloat4		// Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position

	// Any of the three Float1/Float2/Float4 variants enables the common texture path.
	#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
		#define SaftTextureForEmRecSosPathsTables		// Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
	#endif

	// Several SAFT_VARIANTs
	// NOTE(review): presumably indices into the SAFT_VARIANT array handed to SAFTHandler - confirm against the implementation.
	#define SAFT_VARIANT_AscanPreintegration					0
	#define SAFT_VARIANT_AscanInterpolation						1
	#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing	2	// Use interpolation while Preprocessing
	#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction	3	// Use interpolation while Reconstruction
	#define SAFT_VARIANT_CalcStandardDeviation					4
	#define SAFT_VARIANT_SumUpOverBoarderIndices				5


// Cache <-> shared Memory
    //#define SaftPreferSharedMem 	// cudaFuncCachePreferShared: shared memory is 48 KB
    #define SaftPreferL1SharedMem 	// cudaFuncCachePreferL1: shared memory is 16 KB
	//#define SaftPreferNone			// cudaFuncCachePreferNone: no preference

// Receiver cache using shared memory (only for small block sizes)
    //#define SaftReceiverSharedMemCacheReceiverDistance
	//#define SaftCacheReceiverSOS
		//#define SaftReceiverSharedMemCacheReceiverSOS	// Use Shared Memory for Caching
		//#define SaftRegisterCacheReceiverSOS			// Use Register for Caching

// Calculation of the mean speed of sound
//================================================
	//#define SaftUseArithmeticMean 	// arithmetic Mean
	#define SaftUseHarmonicMean       	// harmonic Mean	// the correct one!!


	//#define SaftCalcSoSInKernel				 // Bresenham is recomputed separately for every voxel and path!
			// ! Remove SOS_Version2, otherwise this does not work!

	#define SaftTextureForBresenhamSosPaths	 // Use texture memory for the SOS volume
			//#define SaftTextureForBresenhamInterpolated	// iSOS version --> now passed in as a parameter
			//#define SaftUseFastMath				// FastMath for faster computation but with errors at the border; a correction is required for that.

	//#define SaftUseSosAttFloat1		// Use separate textures for the two volumes (Sos+Att) // currently not implemented
	#define SaftUseSosAttFloat2		// Use a single texture for both volumes (Sos+Att)

	#define SOS_Version2	// correct version with definitions at the centre point
	//#define SOS_Version3	// With the end points given explicitly

// MultiGPU
//================================================
//	#define debug_SetNumGPU		// Fix the number of GPUs
//	//#undef debug_SetNumGPU
//
//	#ifdef debug_SetNumGPU
//		#define NUM_GPUS 1
//		#define NUM_DEVICEGPU	1	// Everything is shifted by this amount, e.g. by +1
//	#endif

// NOTE(review): presumably the maximum number of emitter/receiver entries kept in constant memory - confirm against the kernels.
const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340;

// Distance calculation variants
	#define Distanz_Standard //172 MV/s                             //14,5 GVA/s
    //#define Distanz_Heron2
    //#define Distanz_Memory 100 // LUT memory with 100 values          //11,53 GVA/s      //Diff [0 .. 0.0828] very poor!
    //#define Distanz_Memory 1000 // LUT memory with 1000 values        //12,6 GVA/s      //Diff [0 .. 0.0096]
    //#define Distanz_Memory 1000_Heron                              //281 MV/s      //Diff [0 .. 2.3176e-004]
	//#define Distanz_Memory 10000 // LUT memory with 10000 values        //11,58 GVA/s      //Diff [0 .. 9.6333e-004]
    //#define Distanz_Memory 100000 // LUT memory (original note says 10000 values; presumably 100000)     //375 MV/s
		//#define Use_Distanz_SharedMemory

//Macro used to perform CUDA calls. Throws an exception in case of a CUDA error. Also shows on which line it occurred.
// Forwards the call result together with __FILE__ and __LINE__ of the call site to performCUDAResultCheck.
// NOTE(review): the expansion already ends in ';'. Writing `CUDA_CHECK(x);` therefore produces an extra
// empty statement, and an unbraced `if (c) CUDA_CHECK(x); else ...` does not compile. Always brace such
// branches; the semicolon is kept because existing call sites may rely on it.
#define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__);

//Macro used to see when a particular line of code is executed on the host.
// Expands to a std::cout expression without a trailing ';', so the caller supplies the semicolon.
// <iostream> must be visible wherever the macro is expanded.
#define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl


//Convenient typedefs for containers
// DeviceProperties is the type of SAFTHandler::deviceProperties.
typedef std::vector<cudaDeviceProp> DeviceProperties;
// Dimensions is the type of SAFTHandler::validBlockDimensions.
typedef std::vector<dim3> Dimensions;


/**
   Most important class in the application.
   - Haupt-Klasse der Applikation
   It is responsible for all of the image reconstruction.
   - Sie ist verantwortlich fuer alle BildRekonstruktionen
*/
class SAFTHandler
{
public:
    SAFTHandler(int deviceId,
    			int deviceIndex,
    			float *aScan_ptr,    				///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath,
    			double *output_ptr,  				///< Zeiger zu den Outputdaten //std::string const & outputPath,
    			double *Duration_ptr,  				///< Zeiger auf Ausgabewert f<>r benoetigte Laufzeit des SAFT-Kernels
    			unsigned short *receiver_index_ptr, ///<
    			unsigned short *emitter_index_ptr,  ///<
    			float *receiver_list_ptr, 			///<
    			int receiver_list_Size, 			///<
    			float *emitter_list_ptr,			///<
    			int emitter_list_Size, 				///<
    			float *speed_vec_ptr,
    			int3 SOSGrid_XYZ,
    			float3 sosOffset, 					///< Startpoint of SoSGrid
    			float SOS_RESOLUTION,				///< Aufloesung des SoSGrid
    			float *att_vec_ptr,    				//att_vec_ptr

    			int aScanCount,
    			int aScanLength,
    			int3 IMAGE_SIZE_XYZ,
    		    float sampleRate,
    			float3 regionOfInterestOffset,
    			float IMAGE_RESOLUTION,
    			dim3 const & fixedBlockDimensions,

    			int medianWindowSize, 				///< define width of used median filter

    			float debugMode,
    			float debugModeParameter,
    			//bool useFixedPartialOutputWindow,

    			bool SOSMode_3DVolume,
    			bool ATTMode_3DVolume,

    			int SAFT_MODE,
    			int *SAFT_VARIANT
    			);

    void performReconstruction();

private:
    bool SOSMode_3DVolume,
    	 ATTMode_3DVolume;

	int SAFT_MODE;
	int *SAFT_VARIANT;
	int *deviceSAFT_VARIANT;

	#ifdef Distanz_Memory
		float *deviceWurzelApprox;
	#endif

    int deviceId;
    int deviceIndex;
    float  debugMode;
    float  debugModeParameter;

    DeviceProperties deviceProperties;
    float
    	*aScan_ptr;

//    float
//    	*rec_vec_ptr,
//    	*send_vec_ptr;

    unsigned short
    	*emitter_index_ptr,
    	*receiver_index_ptr;

    float
    	*emitter_list_ptr,
    	*receiver_list_ptr;

    int
    	receiver_list_Size,
    	emitter_list_Size;

    double
       	*output_ptr;

    double
    	*Duration_ptr;

    float
    	Sos,
    	*speed_vec_ptr,
    	*att_vec_ptr;

    int3
    	SOSGrid_XYZ;

	float3
		sosOffset; 					///< Startpoint of SoSGrid

    int
    	aScanCount,
    	aScanLength;

    int3
    	IMAGE_SIZE_XYZ;

    float3 regionOfInterestSize; // ROI-Groesse in meter

    float3
    	regionOfInterestOffset; //imageStartpoint; TODO: umbenennen!

    float
    	IMAGE_RESOLUTION,		  ///< Aufl<66>sung im OutputVolumen
	    IMAGE_RESOLUTION_FACTOR,  ///< 1/Aufl<66>sung im OutputVolumen
	    SOS_RESOLUTION,			  ///< Aufloesung des SoSGrid
	    SOS_RESOLUTION_FACTOR;	  ///< 1/Aufl<66>sung im SoS-Grid

    std::string
        emitterGeometryPath,
        receiverGeometryPath,
        aScanSamplesPath,
        outputPath;

//    bool
//        printPerformanceAnalysis,
//        printSortedAutoTuningResults;


    float *aScanSamples;
    double *output;
    //int aScanCount;
    int
    	aScanSize,
    	batchSize,
    	aScanBatchSize;

    float voxelSize;

    float sampleRate;

    //size_t
    uint64_t
    	regionOfInterestVoxelCount,
        outputSize;

    uint64_t
    	partialOutputZLayerOffset;

    int
    	partialOutputZLayerOffsetCount,
    	partialOutputSoSZLayerCount,
    	currentZLayerCount,
    	partialSoSZLayerCount;

    double *currentHostOutputAdress;

    // Pointer of Inputdata in memory of Ascanblock
    float3
        *receiver_list,			// LookUpTable receiverNr -> coordinates
        *emitter_list;				// LookUpTable emitterNr -> coordinates

    unsigned short
    	*receiver_index,			// Input Ascanblockdata: corresponding receiverNr
    	*emitter_index;			// Input Ascanblockdata: corresponding emitterNr

    float
    	*SoSData;					// Input Ascanblockdata: Corresponding SOS value

    float *speedOfSoundField;		// Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> in speedOfSoundGrid umbenennen
    float *attenuationField;		// Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> in attenuationGrid umbenennen

	#ifdef SaftUseSosAttFloat2
    	float2 *hostSosAttField;
	#endif

    // Memorysizes
    //std::size_t
    int
    	speedOfSoundFieldVoxelCount,				//
    	speedOfSoundFieldBytes,						//
    	speedOfSoundEmitterVoxelPathCountByteSize,	// Speichergroesse fuer die Anzahl der Voxel, die auf einem Pfad liegen
    	speedOfSoundEmitterVoxelPathSumByteSize;	// Speichergroesse fuer die Summe der Schallgeschwindigkeiten auf dem Pfad zu einem Voxel

    dim3
        fixedBlockDimensions,			// kann ws durch genericSAFTBlockDimensions ersetzt
        genericSAFTBlockDimensions,
        genericSAFTGridDimensions,
        windowGridDimensions;

    int medianWindowSize; 	// define width of used median filter


#ifdef SaftNoTexture
    float ** deviceAScans;
#else
	cudaArray **deviceAScansCuArray;

#endif

#ifdef SaftTextureForBresenhamSosPaths

	#ifdef SaftUseSosAttFloat1		// Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
		cudaArray *deviceSpeedOfSoundFieldCuArray;			// SOS volume
		cudaArray *deviceAttenuationFieldCuArray;			// ATT volume
	#endif

	#ifdef SaftUseSosAttFloat2      // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
		cudaArray *deviceSosAttFieldCuArray;
	#endif
#endif


	int maxSoSReceiverArrayForTexture;
	int TableVoxelToReceiverPathSosAllocationCount;
	std::size_t receiver_list_Size_deviceMemory;

#ifdef SaftTextureForEmRecSosPathsTables
	// Für Emitter ----- normal definieren
	cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray;			//SoSSum
	//cudaPitchedPtr pitchedTableVoxelToEmitterPathSosSumDevPtr;

	cudaArray *deviceTableVoxelToEmitterPathCountCuArray;			//Count
	//cudaPitchedPtr pitchedTableVoxelToEmitterPathCountDevPtr;

	// Für Receiver ----- als Arrays definieren
	cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray;		//SoSSum
	//cudaPitchedPtr * pitchedTableVoxelToReceiverPathSosSumDevPtr;

	cudaArray **deviceTableVoxelToReceiverPathCountCuArray;		//Count
	//cudaPitchedPtr * pitchedTableVoxelToReceiverPathCountDevPtr;
#endif

#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
	cudaArray *deviceTableVoxelToEmPathSosBothCuArray;  			//Emitter SoSSum + Count
	cudaArray **deviceTableVoxelToRecPathSosBothCuArray;  			//Receiver SoSSum + Count
#endif

#ifdef SaftTextureForERIndexBlock
	cudaArray * deviceEmIndexBlockCuArray;
	cudaArray * deviceRecIndexBlockCuArray;
#endif

	// Schallgeschwindigkeitskorrektur-Mode
    float *deviceSpeedOfSoundField;			// Adressen fuer Speicherfuer Schallgeschwindigkeitsgrid auf der GPU

    // Block-Mode
    unsigned short *deviceEmitterIndex_block;		// Adressen fuer Speicher fuer Index der Geometriedaten auf der GPU
    unsigned short *deviceReceiverIndex_block;
    float3 *deviceListEmitterGeometry;			// Adressen fuer Speicher fuer Zuordnung Index <-> Geometriedaten auf der GPU
    float3 *deviceListReceiverGeometry;

    float  *deviceSoSData_block;				// Adressen fuer Speicher fuer Schallgeschwindigkeitsdaten auf der GPU

//    VoxelCountType								// Adressen fuer Speicher der SoS-Pfade auf der GPU
//        * deviceTableVoxelToEmitterPathCount,
//        * deviceTableVoxelToReceiverPathCount;
    float
    	*deviceTableVoxelToEmitterPathCountFloat,
    	*deviceTableVoxelToReceiverPathCountFloat,
        *deviceTableVoxelToEmitterPathSosSum,
        *deviceTableVoxelToReceiverPathSosSum;


    bool *deviceValidEmitterReceiverCombinations;

    int *deviceTransducerVectorAnalysisDistributionCounters;

//    float3
//        * deviceEmitterGeometry,
//        * deviceReceiverGeometry;

    int   usedAmountOfEmitter, 	// amount of used emitter
    	  usedAmountOfReceiver; // amount of used receiver

    // Output volume
	double *deviceOutput;

    //Streams used for synchronisation
    cudaStream_t
        copyStream,
        calculationStream;

    //This variable describes the number of allocations used by the current SAFT mode
    std::size_t aScanAllocationCount;

    int
        invalidEmitterReceiverCombinationsCount,
        validEmitterReceiverCombinationsCount;

    Dimensions validBlockDimensions;
    bool useAutoTuning;
//    AutoTuningConfiguration autoTuningConfiguration;

    size_t
    	partialOutputSize,
    	partialVolumeSize,				// Speicher(OutputVolumen), der fuer die entsprechende Anzahl an Z-Layern benötigt wuerde
    	partialSosPathSize,				// Speicher(OutputVolumen), der fuer die entsprechende Anzahl an SoS-Z-Layer benötigt wuerde
    	maxFeasibleZLayerCount,			// Maximal moegliche Anzahl an Z-Layern wird zu Beginn auf # die in eine SOS Z-layer passt gesetzt.
    	maxFeasibleSosZLayerCount;		// Maximal moegliche Anzahl an Sos-Z-Layern wird zu Beginn auf Anzahl der noetigen SoS-Z-Layern für die OutputDaten gesetzt.

    int
        minimumAutoTuningThreadCount,
        maximumAutoTuningThreadCount;


    //New partial reconstruction data

    std::size_t partialSpeedOfSoundVoxelCount;
    std::size_t partialOutputZLayerCount;
    std::size_t zLayerVoxelCount;
    std::size_t sosZLayerVoxelCount;           // Anzahl der X-Y-SOSVoxel in einer SoS-Layer. //saft.hpp
    std::size_t partialOutputVoxelCount;

    std::size_t
		//deviceTableVoxelToEmitterPathCountSize,
		deviceTableVoxelToEmitterPathCountFloatSize,
		deviceTableVoxelToEmitterPathSosSumSize,
		//deviceTableVoxelToReceiverPathCountSize,
		deviceTableVoxelToReceiverPathCountFloatSize,
		deviceTableVoxelToReceiverPathSosSumSize;

	double diff_time;	// For Time Measurement
	float transferRate; // For DataTransferrate Measurement
	float performRate;  // For PerformSAFTrate Measurement
	cudaDeviceProp deviceProp;		// Ausgabe der Frequenz


    //Core reconstruction

    void processAScans(ullong & duration);
    void performCoreReconstruction();

    //Pre-calculation

    void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount);
//    void analysisOfTransducerVectors();

//    void normalisePerformanceStatisticsOutput();
//    void printTransducerVectorStatistics();

    //Auto-tuning
    bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions);
    void determineValidBlockDimensions();


    void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions);

    //Pre-calculation kernels
	#ifdef SaftUseConstantMemforGeometry
		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
	#else
		//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput);
		void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput);
	#endif


    //    void analyseTransducerVectors(dim3 gridDimensions, dim3 blockDimensions);

    //SAFT kernels
    //void performInterpolation(float * deviceAScans, float * deviceOutput, dim3 gridDimensions, dim3 blockDimensions, cudaStream_t stream);
    //void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float const * deviceAScans); //, cudaStream_t stream);
#ifdef SaftNoTexture
	void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, float * deviceAScans 		    );   //Ascans im Devicememory
#else
	void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray);   //Ascans in CuArray f<>r Texturmemory
	//void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceSpeedOfSoundFieldCuArray, cudaArray * deviceAScansCuArray);   //Ascans in CuArray f<>r Texturmemory
#endif

    //Utility functions
    bool setGenericDimensions();
    std::size_t resolutionConversion(std::size_t input, std::size_t greaterResolution, std::size_t lowerResolution);
    void partialReconstructionInitialisation();
    std::size_t getCurrentZLayerCount(std::size_t zOffset);
    void getCurrentSpeedOfSoundVariables(std::size_t zOffset, std::size_t currentZLayerCount, std::size_t & currentSpeedOfSoundZLayer, std::size_t & currentSpeedOfSoundPartialZLayerCount);
    void determineSpeedOfSoundData(std::size_t regionOfInterestZLayers);
};

// Disabled helpers:
//   std::string vectorToString(float3 const & vector);
//   std::string voxelToString(dim3 const & voxel);

// Declared here, defined in another translation unit.
// The debug_OutputMaxMemory switch in this header states that the current memory
// usage is reported whenever this function is called.
void memoryCheck();

// Result check behind the CUDA_CHECK macro, which passes the call result together
// with __FILE__ and __LINE__ of the call site. According to the macro's
// description it throws an exception in case of a CUDA error.
void performCUDAResultCheck(cudaError_t result, const std::string& file, int line);