URDepends/SAFT_ATT/src/saft.cpp

#include <mex.h>

#include <iostream>
#include <vector>

#include <cstdlib>
#include <ctime>
#include <cmath>

//#include <sys/time.h>

//#include <ail/file.hpp>
//#include <ail/string.hpp>
//#include <ail/time.hpp>

//#include "configuration.hpp"
#include "saft.hpp"


/**
   Clumsy constructor of the core reconstruction class.
   - Unbeholfener Konstruktor der Kern Rekonstuktionsklasse
 */
SAFTHandler::SAFTHandler(
    int deviceId, 			///< CUDA ID of the device to be used.
    int deviceIndex,		///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
    float *aScan_ptr,    	///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath, ///< Path to the actual A-scan samples.
    double *output_ptr,   	///< Zeiger zu den daten // std::string const & Path, ///< Path to a file in which the output of the image reconstruction is to be stored.
    double *Duration_ptr, 	///< Zeiger auf R<>ckgabewert fuer Matlab fuer Laufzeit des Kernels
	unsigned short *receiver_index_ptr, ///<
	unsigned short *emitter_index_ptr,  ///<
	float *receiver_list_ptr, 			///<
	int receiver_list_Size,
	float *emitter_list_ptr,			///<
	int emitter_list_Size,
    float *speed_vec_ptr,				///< Zeiger auf die SoS-Daten in Block-/Gridmode
    int3 SOSGrid_XYZ,
	float3 sosOffset, 					///< Startpoint of SoSGrid
	float SOS_RESOLUTION,				///< Aufloesung des SoSGrid
	float *att_vec_ptr,					///< Zeiger auf die Att-Daten inm Gridmode

    int aScanCount,
    int aScanLength,
    int3 IMAGE_SIZE_XYZ,
    float sampleRate,
    float3 regionOfInterestOffset,
    float IMAGE_RESOLUTION,
    dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
    int medianWindowSize, 				///< define width of used median filter
    float debugMode,
    float debugModeParameter,
    bool SOSMode_3DVolume,
    bool ATTMode_3DVolume,

    int SAFT_MODE,
	int *SAFT_VARIANT
    ):
    deviceId(deviceId),			// Das hier ist eine Initialisation der Klassenvariablen mit den <20>bergebenen Werten aehnlich Konstruktor, called Initializer list
    deviceIndex(deviceIndex),

	aScan_ptr(aScan_ptr),			//aScanSamplesPath(aScanSamplesPath),

	output_ptr(output_ptr),			//Path(Path),
	Duration_ptr(Duration_ptr),

	receiver_index_ptr(receiver_index_ptr), 	//
	emitter_index_ptr(emitter_index_ptr),  		//
	receiver_list_ptr(receiver_list_ptr), 		//
	receiver_list_Size(receiver_list_Size),
	emitter_list_ptr(emitter_list_ptr),		//
	emitter_list_Size(emitter_list_Size),
	speed_vec_ptr(speed_vec_ptr), 			///< SoS-Daten im Blockmode oder SoSGrid
	SOSGrid_XYZ(SOSGrid_XYZ),	  			// Groesse des SoSGrids
	sosOffset(sosOffset), 					///< Startpoint of SoSGrid
	SOS_RESOLUTION(SOS_RESOLUTION),			///< Aufloesung des SoSGrid

	att_vec_ptr(att_vec_ptr),				///< Att-Daten als ATTGrid

	aScanCount(aScanCount),
	aScanLength(aScanLength),
	IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
	sampleRate(sampleRate),
	regionOfInterestOffset(regionOfInterestOffset),
	IMAGE_RESOLUTION(IMAGE_RESOLUTION),

    fixedBlockDimensions(fixedBlockDimensions),
    medianWindowSize(medianWindowSize),
	debugMode(debugMode),
	debugModeParameter(debugModeParameter),
	SOSMode_3DVolume(SOSMode_3DVolume),
	ATTMode_3DVolume(ATTMode_3DVolume),

	SAFT_MODE(SAFT_MODE),
	SAFT_VARIANT(SAFT_VARIANT)
{
	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::SAFTHandler - Start\n");
	#endif

	#ifdef debug_OutputInfo
		// printf( "SAFTHandler Constructor\n");
	#endif

    aScanAllocationCount = 1;			// Speicher der Allokiert wird, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.

	IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION;   // Auflösung im OutputVolumen
	SOS_RESOLUTION_FACTOR   = 1 / SOS_RESOLUTION;	  // Auflösung im SoS-Grid

	#ifdef debug_OutputVariables
		// printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
		// printf( "SOS_RESOLUTION_FACTOR 	= %e\n", SOS_RESOLUTION_FACTOR);
		// printf( "Samplerate = %e\n", sampleRate);
	#endif

	#ifdef debug_OutputFunctions
		// printf( "<== SAFTHandler::SAFTHandler - End\n");
	#endif

}

/**
   Top level function of the SAFTHandler class that performs the image reconstruction.
   - Top Level Funktion der SAFTHandler Klasse die die Bildrekonstruktion durchf<EFBFBD>hrt.
*/
void SAFTHandler::performReconstruction()
{
	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::performReconstruction - Start\n");
	#endif

	#ifdef debug_OutputInfo													// Name des Device mit ID ausgeben
		// printf( "Device ID: %i\n", deviceId);
	#endif

	#ifdef debug_OutputFunctions
	// printf( "==> loadDevices - Start\n");
	#endif
	int deviceCount;
	CUDA_CHECK(cudaGetDeviceCount(&deviceCount));


	// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, könnte auch nur einmal ausgelesen werden aber zweitrangig.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	//DeviceProperties &  outputProb = deviceProperties; // lokalen Zeiger auf Vektor erstellen der auf Klassenvektor zeigt.

	//// printf("1: size(%i) capacity(%i) max_size(%i)\n",  outputProb.size(), outputProb.capacity(), outputProb.max_size());

	//outputProb.reserve(static_cast<std::size_t>(deviceCount));	// Request Vector with size deviceCount
	deviceProperties.reserve(static_cast<std::size_t>(deviceCount));	// Request Vector with size deviceCount


	//cudaDeviceProp & device = outputProb[deviceId];				//
	cudaDeviceProp & device = deviceProperties[deviceId];			//
	CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));
	//// printf("%i. %s\n", deviceId, device.name);
	//// printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);

	#ifdef debug_OutputInfo
			// printf("%i. %s\n", deviceId, device.name);
			// printf("   Byte Total Global Mem:  %lld \n",  device.totalGlobalMem);
			// printf("   Compute Capability:     %i.%i\n",  device.major,device.minor);

	        // printf("   Name:                          %s\n",  device.name);
	        // printf("   Major revision number:         %d\n",  device.major);
	        // printf("   Minor revision number:         %d\n",  device.minor);
	        // printf("   Total global memory:           %lld\n",  device.totalGlobalMem);
	        // printf("   Total shared memory per block: %u\n",  device.sharedMemPerBlock);
	        // printf("   Total registers per block:     %d\n",  device.regsPerBlock);
	        // printf("   Warp size:                     %d\n",  device.warpSize);
	        // printf("   Maximum memory pitch:          %lld\n",  device.memPitch);
	        // printf("   Maximum threads per block:     %d\n",  device.maxThreadsPerBlock);
	        for (int i = 0; i < 3; ++i) {
	        	// printf("        Maximum dimension %d of block:  %lld\n", i, device.maxThreadsDim[i]);
			}
	        for (int i = 0; i < 3; ++i) {
	        	// printf("        Maximum dimension %d of grid:   %lld\n", i, device.maxGridSize[i]);
			}
	        // printf("   Clock rate:                    %d\n",  device.clockRate);
	        // printf("   Total constant memory:         %u\n",  device.totalConstMem);
	        // printf("   Texture alignment:             %u\n",  device.textureAlignment);
	        // printf("   Concurrent copy and execution: %s\n",  (device.deviceOverlap ? "Yes" : "No"));
	        // printf("   Number of multiprocessors:     %d\n",  device.multiProcessorCount);
	        // printf("   Kernel execution timeout:      %s\n\n",  (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
	#endif

		//outputProb.push_back(device);		// Add element at the end of the vector outputProb
		deviceProperties.push_back(device);		// Add element at the end of the vector outputProb

		//// printf("2: size(%i) capacity(%i) max_size(%i)\n",  outputProb.size(), outputProb.capacity(), outputProb.max_size());

	#ifdef debug_OutputFunctions
	// printf( "<== loadDevices - End\n");
	#endif
	// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, aber erstmal egal.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// siehe http://gpucoder.livejournal.com/1064.html

	//		int devCount;
	//	    cudaGetDeviceCount(&devCount);
	//	    // printf("CUDA Device Query...\n");
	//	    // printf("There are %d CUDA devices.\n", devCount);
	//
	//	    // Iterate through devices
	//	    for (int i = 0; i < devCount; ++i)
	//	    {
	//	        // Get device properties
	//	        // printf("\nCUDA Device #%d\n", i);
	//	        cudaDeviceProp devProp;
	//	        cudaGetDeviceProperties(&devProp, i);
	//	        printDevProp(devProp);
	//	    }
	//
	//	    // printf("\nPress any key to exit...");
	//	    char c;
	//	    scanf("%c", &c);


	//cudaDeviceProp & device = deviceProperties[deviceId];
	//CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));		// Eingenschaften des Devices auslesen

	//#ifdef debug_OutputInfo													// Name des Device mit ID ausgeben
		// printf( "Device used: %18s  (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex);
	//#endif
	CUDA_CHECK(cudaSetDevice(deviceId));

	#ifdef debug_OutputInfo													// Reset Device
		// printf("Reset Device\n");
	#endif
	//CUDA_CHECK(cudaDeviceReset());

//	std::string errorMessage = cudaGetErrorString(cudaPeekAtLastError());
//	std::cout << errorMessage << std::endl;

	//memoryCheck();		// Freier Speicher am Anfang


	// Check and set Block and Grid-Dimensions
	genericSAFTBlockDimensions = fixedBlockDimensions;
	genericSAFTGridDimensions = dim3(
		(IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x,	// hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
		(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y,	// in Blockgroesse geteilt werden kann, muss ein weiterer
		(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z	// Block berechnet werden. Zu Viele werden im Kernel aussortiert.
		);

	#if defined(debug_OutputVariables) || defined(debug_OutputZSteps)
		if (deviceIndex == DebugOutputGPUIdx){
			// printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
			// printf( "genericSAFTGridDimensions X,Y,Z  = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
		}
	#endif

    //Pointeruebergabe der AScan-Daten Geometrie-Daten und Output-Daten von Matlab
	#ifdef debug_OutputInfo
		// printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
		// printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
	#endif
    aScanSamples = (float*)aScan_ptr;

    #ifdef debug_OutputInfo
    	// printf( "Uebergebene Geometry Pointer fuer Index sowie der Zuordnungs-Tabelle aus Matlab\n");
	#endif

    emitter_index  = (unsigned short*) emitter_index_ptr;			// Index for associating emitter to corresponding coordinates
    receiver_index = (unsigned short*) receiver_index_ptr;			// Index for associating receiver to corresponding coordinates
    emitter_list   = (float3*) emitter_list_ptr;					// Lookuptable for emitter coordinates
    receiver_list  = (float3*) receiver_list_ptr;					// Lookuptable for receiver coordinates

	#ifdef debug_OutputInfo
    	// printf( "Uebergebener Pointer output fuer Ausgabe-Daten aus Matlab\n");
	#endif
    output = (double *)output_ptr;


    speedOfSoundFieldVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
    speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount * sizeof(float);
	#ifdef debug_OutputVariables
		// printf("  speedOfSoundFieldVoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, speedOfSoundFieldVoxelCount);
		// printf("  speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount(%i) x sizeof(float = 4)] = %i\n", speedOfSoundFieldVoxelCount, speedOfSoundFieldBytes);
	#endif

	#ifdef debug_OutputInfo
		// printf( "Uebergebener Pointer speedOfSoundField fuer SoS-Daten aus Matlab\n");
		// printf( "Uebergebener Pointer SoSData           fuer SoS-Daten aus Matlab\n");
		// printf( "Uebergebener Pointer attenuationField  fuer ATT-Daten aus Matlab\n");
	#endif
    speedOfSoundField = (float*)speed_vec_ptr;	// Fuer SoSGrid-Mode fuer korrekte Schallgeschwindigkeitskorrektur
    SoSData 		  = (float*)speed_vec_ptr;	// Fuer Blockmode
    attenuationField  = (float*)att_vec_ptr;    // Fuer SoSGrid-Mode fuer Daempfungskorrektur


    // Uebergabe der Outputgroessen aus Matlab.
    regionOfInterestVoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
	outputSize = regionOfInterestVoxelCount * sizeof(double);	// Speicherbedarf fuer alle Voxel im Volumen

	#ifdef debug_OutputVariables
		// printf("  regionOfInterestVoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, regionOfInterestVoxelCount);
		// printf("  outputSize [%lld x sizeof(double = 8)] = %lld\n", regionOfInterestVoxelCount, outputSize);
	#endif

	//Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren
	if (regionOfInterestVoxelCount > (uint64_t)(2^32 / sizeof(double)) ){ // 2^32 / sizeof(double) = 536870912
			// printf("outputSize > 2^32 !!! => works only in 64-Bit System\n");
	}

	//Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben
	//Die selbe Anzahl wird auch fuer die Geometriedaten erwartet
	#ifdef debug_OutputVariables
		// printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
	#endif
	aScanSize = aScanLength * sizeof(float);
	batchSize = aScanCount;					// Anzahl der Blockgroesse d.h. wie viele AScans gleichzeitig verarbeitet werden. Batchgroesse ist gleich der Anzahl der uebergebenen Blockgroesse aus Matlab
	aScanBatchSize = batchSize * aScanSize; // Batchgroesse der AScans (* 3000 * sizeof(float)) in Byte
	#ifdef debug_OutputVariables
		// printf( "aScanSize = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScanSize);
		// printf( "batchSize = aScanCount = %i\n", batchSize);
		// printf( "aScanBatchSize = batchSize * aScanSize ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatchSize);
	#endif

//	if(batchSize > aScanCount)		// Abfrage macht keinen Sinn mehr wenn batchSize = aScanCount;
//	{
//		mexErrMsgTxt("A-scan window size cannot be larger than the total number of A-scans");
//		//throw ail::exception("A-scan window size cannot be larger than the total number of A-scans");
//	}

	#ifdef debug_OutputInfo

    	// printf("\nParameter for Image Reconstruction\n");
    	// printf(  "========================================================================\n");
    	//std::cout << "ROI dimensions: " << regionOfInterestResolutionX << " x " <<  regionOfInterestResolutionY << " x " <<  regionOfInterestResolutionZ << std::endl;
		std::cout << "IMAGE_SIZE_XYZ:                      [" << IMAGE_SIZE_XYZ.x << " x " <<  IMAGE_SIZE_XYZ.y << " x " <<  IMAGE_SIZE_XYZ.z << "]" <<std::endl;
		std::cout << "Voxel count in Volume:               " << regionOfInterestVoxelCount << std::endl;
		//std::cout << "Increment vector: (" << regionOfInterestIncrementVector.x << ", " << regionOfInterestIncrementVector.y << ", " << regionOfInterestIncrementVector.z << ")" << std::endl;
		std::cout << "Increment vector/Resolution:         (" << IMAGE_RESOLUTION << ")" << std::endl;

		std::cout << "IMAGE_STARTPOINT in meters:          " << regionOfInterestOffset.x << " " << regionOfInterestOffset.y << " " << regionOfInterestOffset.z << std::endl;

		regionOfInterestSize.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
		regionOfInterestSize.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
		regionOfInterestSize.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
		std::cout << "ROI size in metres:                  " << regionOfInterestSize.x << " " << regionOfInterestSize.y << " " << regionOfInterestSize.z  << std::endl;
		std::cout << "Batch size/Blocks(Ascan, R/E-Combi): " << batchSize << std::endl;
		// printf(  "========================================================================\n\n");
	#endif


//	#ifdef debug_OutputPerformance
//		struct timeval startPerformCoreReconstruction, stopPerformCoreReconstruction;
//		gettimeofday(&startPerformCoreReconstruction, NULL);
//	#endif

		//perform processing with AScan-Data
		//===========================================================================================================
		ullong duration;
		processAScans(duration);
		//===========================================================================================================


	#ifdef debug_OutputPerformance
		double numerator = static_cast<double>(aScanCount) * regionOfInterestVoxelCount;	// Performanz [Ascans * GVoxel/s]

		double performance = numerator / duration;
		//adjust for the change from voxels per millisecond to gigavoxels per second (=> 10^3 * 10^-9 = 10^-6)
		performance /= 1e9;

		//std::cout << "# Device ("<< (int)deviceId <<"): Duration of main processing: " << (int)duration << " us" << std::endl;
		//std::cout << "# Device ("<< (int)deviceId <<"): Performance: " << performance << " AScan * GVoxel/s" << std::endl;
	#endif

	//Duration_ptr[(deviceId+1)] = (double)duration;	// Für jede GPU einen Laufzeitwert in µs übermitteln	// Angabe von ID der GPU abhaengig
	Duration_ptr[(deviceIndex+1)] = (double)duration;	// Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig

	#ifdef debug_OutputVariables
		//// printf( "Duration_ptr[%i] = duration(%i) = %f\n", (deviceId+1), duration, Duration_ptr[(deviceId+1)]);
		// printf( "  GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
	#endif

//	#ifdef debug_OutputVariables
//		// printf( "Duration_ptr[0] = duration(%i) = %f\n", duration, Duration_ptr[0]);
//	#endif

// Reset Device
//	#ifdef debug_OutputInfo
//		// printf( "Device was used: %s  (%i)\n", deviceProperties[deviceId].name , deviceId);
//	#endif
//	CUDA_CHECK(cudaSetDevice(deviceId));

	#ifdef debug_OutputInfo													// Reset Device
		// printf("Reset Device\n");
	#endif
	//CUDA_CHECK(cudaDeviceReset());

	#ifdef debug_OutputFunctions
		// printf( "<== SAFTHandler::performReconstruction - End\n");
	#endif

}

/**
   The SAFT kernel expects arguments in which the grid dimensions have been reduced to less than three dimensions and the block dimensions are reduced to only one dimension.
   This also depends on the properties of the hardware available (shader model).
   - Der SAFT Kernel erwartet Argumente in den die Grid Dimension auf drei Dimensionen reduziert wurde und die Block-Dimensionen auf nur eine Dimension reduziert ist.
   - Das haengt auch von den Eigenschaften der verfuegbaren HW ab (shader model)
*/
void SAFTHandler::reduceKernelDimensions(
    dim3 const & gridDimensions, ///< Input grid dimensions.
    dim3 const & blockDimensions, ///< Input block dimensions.
    dim3 & reducedGridDimensions, ///< Reduced output grid dimensions.
    dim3 & reducedBlockDimensions ///< Reduced output block dimensions.
    )
{

	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::reduceKernelDimensions - Start\n");
	#endif


	if(deviceProperties[deviceId].maxGridSize[2] > 1)
    {
        reducedGridDimensions = gridDimensions;
		#ifdef debug_OutputParameter
        	// printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
		#endif
    }
    else
    {
        reducedGridDimensions = dim3(
            gridDimensions.x * gridDimensions.y,
            gridDimensions.z,
            1
            );
		#ifdef debug_OutputParameter
        	// printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
		#endif
    }

    reducedBlockDimensions = dim3(blockDimensions.x * blockDimensions.y * blockDimensions.z);
	#ifdef debug_OutputParameter
    	// printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z);
	#endif

	#ifdef debug_OutputFunctions
		// printf( "<== SAFTHandler::reduceKernelDimensions - End\n");
	#endif

}

/**
   Utility function to perform integer based divison which rounds up instead of down.
   - N<EFBFBD>tzliche Funktion: eine Integerbasierte Division die aufrundet und nicht abrundet
   @return Quotient of the divison, rounded up.
*/
std::size_t ceilingDivision(
    std::size_t dividend, ///< Dividend of the division.
    std::size_t divisor ///< Divisor of the division.
    )
{
    std::size_t output = dividend / divisor;
    if(dividend % divisor)
    	output ++;
    return output;
}

/**
   Converts an offset based on two different resolutions.
   This is a utility function used to deal with the number of z-layers in the speed of sound grid.
   - Konvertiert einen Offset, basierend auf zwei verschiedenen Aufloesungen
   - Diese n<EFBFBD>tzliche Funktion wird genutzt um mit der Anzahl der z-Layer in dem Spallgeschwindigkeits-Grid umzugehen.
   @return Result of the conversion.
 */
std::size_t SAFTHandler::resolutionConversion(
    std::size_t input, ///< Offset.
    std::size_t greaterResolution, ///< Greater resolution.
    std::size_t lowerResolution ///< Lower resolution.
    )
{
	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::resolutionConversion - Start\n");
		// printf( "<== SAFTHandler::resolutionConversion - End\n");
	#endif

    return ceilingDivision(input * lowerResolution, greaterResolution);
}

/**
   Perform calculations pertaining to the execution of the speed of sound pre-calculations.
   - F<EFBFBD>hre Berechnungen der Schallgeschwindigkeit-Vorberechnung aus
*/
void SAFTHandler::determineSpeedOfSoundData(
    std::size_t regionOfInterestZLayers ///< Number of z-layers within the region of interest that are currently being processed. This number is often smaller than the total number of z-layers.
    )
{
	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::determineSpeedOfSoundData - Start\n");
	#endif

//	//Determine the maximum number of z-layers to be pre-calculated within the speed of sound grid
//	//Bestimme die maximale Anzahl an Z-layer, die in dem SoS-Grid Vorberechnet werden.
//	//std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, regionOfInterestResolutionZ, regionOfInterestGridSizeZ);
//	std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, IMAGE_SIZE_XYZ.z, regionOfInterestGridSizeZ);
//
//    partialSpeedOfSoundVoxelCount = maximumSpeedOfSoundPartialZLayerCount * regionOfInterestGridSizeX * regionOfInterestGridSizeY;
//
////	deviceTableVoxelToEmitterPathCountSize   = sosZLayerVoxelCount * emitter_list_Size   * partialSoSZLayerCount * sizeof(VoxelCountType); // Gr<47><72>e f<>r Speicher der Pfadanzahl * die Anzahl der gleichzeitig genutzten Z-Layer f<>r alle Emitter
////	deviceTableVoxelToEmitterPathSosSumSize  = sosZLayerVoxelCount * emitter_list_Size   * partialSoSZLayerCount * sizeof(float);
////	deviceTableVoxelToReceiverPathCountSize  = sosZLayerVoxelCount * receiver_list_Size  * partialSoSZLayerCount * sizeof(VoxelCountType); // Gr<47><72>e f<>r Speicher der Pfadanzahl * die Anzahl der gleichzeitig genutzten Z-Layer f<>r alle Receiver
////	deviceTableVoxelToReceiverPathSosSumSize = sosZLayerVoxelCount * receiver_list_Size  * partialSoSZLayerCount * sizeof(float);
//
//    std::size_t
//        emitterSpeedOfSoundVoxelCombinations = emitterCount * partialSpeedOfSoundVoxelCount,
//        receiverSpeedOfSoundVoxelCombinations = receiverCount * partialSpeedOfSoundVoxelCount;
//
//    emitterToVoxelPathVoxelDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
//    emitterToVoxelPathSpeedDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(float);
//
//    voxelToReceiverPathVoxelDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
//    voxelToReceiverPathSpeedDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(float);

	#ifdef debug_OutputFunctions
		// printf( "<== SAFTHandler::determineSpeedOfSoundData - End\n");
	#endif
}

/**
   Perform initialisations for the partial reconstructions for both the speed of sound pre-calculation and the actual reconstruction.
   - F<EFBFBD>hre Initialisierungen fuer eine Teilrekonstruktion von beiden durch: Der Schallgeschwindigkeit und der aktuellen Rekonstruktion
*/
void SAFTHandler::partialReconstructionInitialisation()
{
	#ifdef debug_OutputFunctions
		// printf( "==> SAFTHandler::partialReconstructionInitialisation - Start\n");
	#endif
//
//	if(!partialReconstructionInitialised)
//    {
//        std::cout << "Initialising partial reconstruction data" << std::endl;
//
//        //zLayerVoxelCount = regionOfInterestResolutionX * regionOfInterestResolutionY;
//        zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y;			// Anzahl der X-Y-Voxel bestimmen den Schritt in das naechste Layer.
//
//        partialOutputVoxelCount = partialOutputSize / sizeof(double);
////        if(partialOutputVoxelCount % zLayerVoxelCount != 0)								//Sicherheitsabfrage nun im kernel
////        	mexErrMsgTxt("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
//            //throw ail::exception("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
//        partialOutputZLayerCount = partialOutputVoxelCount / zLayerVoxelCount;
//
////        if(partialOutputZLayerCount % genericSAFTBlockDimensions.z != 0)								//Sicherheitsabfrage nun im kernel
////        	mexErrMsgTxt("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
//            //throw ail::exception("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
//
//        //Make dynamically sized allocations for the pre-calculated speed of sound data.
//        //The size depends on the number of z-layers in the output window.
//        //These particular pre-calculations are no longer performed only once for all voxels.
//        //Instead, they are performed partially, prior to each launch of the SAFT kernel.
//        //This lowers the pressure on GPU global memory.
//
//        //F<>hre Allokationen mit dynamischer Groesse aus fuer die Vor-Verarbeitung der SoS-Daten
//        //Die Groesse haengt von der Anzahl der z-Layer in dem -Fenster ab.
//        //Diese partielle-Vorberechnung muss nur einmal fuer alle Voxel durchgef<65>hrt werden.
//        //Stattdessen werden sie immer partiell durchgef<65>hrt, vor jedem Start des SAFT-Kernels.
//        //Das entlastet den globalen GPU-Speicher.
//
//        determineSpeedOfSoundData(partialOutputZLayerCount);
//
//        // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathVoxelCounts der Groesse:%i\n", emitterToVoxelPathVoxelDataSize);
//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathVoxelCounts), emitterToVoxelPathVoxelDataSize));
//        // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathSpeedOfSoundSum der Groesse:%i\n", emitterToVoxelPathSpeedDataSize);
//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathSpeedOfSoundSum), emitterToVoxelPathSpeedDataSize));
//
//        // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathVoxelCounts der Groesse:%i\n", voxelToReceiverPathVoxelDataSize);
//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathVoxelCounts), voxelToReceiverPathVoxelDataSize));
//        // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathSpeedOfSoundSum der Groesse:%i\n", voxelToReceiverPathSpeedDataSize);
//        CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathSpeedOfSoundSum), voxelToReceiverPathSpeedDataSize));
//
//        partialReconstructionInitialised = true;
//    }

	#ifdef debug_OutputFunctions
		// printf( "<== SAFTHandler::partialReconstructionInitialisation - End\n");
	#endif
}


/**
   Print free/total memory available on the chosen device.
   - Gibt freien/totalen zur verf<EFBFBD>gung stehenden Speicher auf dem gew<EFBFBD>hlten Device aus.
*/
void memoryCheck()
{
	#ifdef debug_OutputFunctions
		// printf( "==> memoryCheck - Start\n");
	#endif

	std::size_t
        totalMemory,
        freeMemory;
    CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));

    #if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory)
		//printSize("   Total memory ", totalMemory);
		//std::cout << " ( " << totalMemory << " )" << std::endl;
		//printSize("    Free memory ", freeMemory);
		//std::cout << " ( " << freeMemory << " )" << std::endl;

		//printSize(" => Used memory ", (totalMemory-freeMemory));
		//std::cout << " ( " << (totalMemory-freeMemory) << " )" << std::endl;
	#endif

    #ifdef debug_OutputFunctions
    	// printf( "<== memoryCheck - End\n");
	#endif

}


/**
   Generic CUDA call wrapper.
   Check the result of a CUDA operation and throw an exception if an error occurred.
   This is used in combination with a macro in saft.hpp.
   - Generischer CUDA Call Wrapper
   - <EFBFBD>berpr<EFBFBD>ft die Ergebnisse einer CUDA Operation und wirft eine Exception wenn ein Fehler auftritt
   - Das wird wird mit einer Kombination mit einem Makro in saft.hpp genutzt.
*/
//inline	// Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
void performCUDAResultCheck(
    cudaError_t result, ///< Result of the CUDA operation.
    std::string const & file, ///< Path to the source code file.
    int line ///< Line within the source code
    )
{
    if(result != cudaSuccess)
    {
    	//// printf("A CUDA operation failed in file \"%s\" (line %i):  %s \n", file, line, cudaGetErrorString(result).c_str() );
    	// printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );

    	//std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
        //std::cout << errorMessage << std::endl;
        mexErrMsgTxt("-> Error occurred");
    }
}