URDepends/SAFT_TOFI/src/kernel/saftPrivate.cu

#include "precalculateSpeedOfSoundKernel.cuh"
#include "rayTracing.cuh"
#include "saftKernel.cuh"
#include "saft.hpp"

void SAFTHandler::precalculateAscanIndex_usePaths(int ascanIndex_i, int aScanWindowSize, int currentSpeedOfSoundZLayer, int maxFeasibleSosZLayerCount)
{
    cudaChannelFormatDesc texChannelDescTableVoxelToEmRecPathSosBoth = cudaCreateChannelDesc(32, 32, 32, 32,
                                                                                             cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal anlegen und
                                                                                                                           // beschreiben - Float4
                                                                                                                           // Both Emitter Path Tables
    // --------------------------------------------------------
    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[1] = cudaAddressModeClamp;
    texTableVoxelToEmitterPathSosBoth_preprocess.addressMode[2] = cudaAddressModeClamp;
    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
    {
    case 0:
        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModePoint;
        break;
    case 1:
        texTableVoxelToEmitterPathSosBoth_preprocess.filterMode = cudaFilterModeLinear;
        break;
    }
    texTableVoxelToEmitterPathSosBoth_preprocess.normalized = 0;
    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToEmitterPathSosBoth_preprocess, deviceTableVoxelToEmPathSosBothCuArray, &texChannelDescTableVoxelToEmRecPathSosBoth));

    // Texturmemory fuer Receiver - SosPathsTables
    // ===================================================================================================================
    // Both Receiver Path Tables
    // ------------------------------------------------------
    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[1] = cudaAddressModeClamp;
    texTableVoxelToReceiverPathSosBoth0_preprocess.addressMode[2] = cudaAddressModeClamp;
    switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
    {
    case 0:
        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModePoint;
        break;
    case 1:
        texTableVoxelToReceiverPathSosBoth0_preprocess.filterMode = cudaFilterModeLinear;
        break;
    }
    texTableVoxelToReceiverPathSosBoth0_preprocess.normalized = 0;

    CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth0_preprocess, deviceTableVoxelToRecPathSosBothCuArray[0], &texChannelDescTableVoxelToEmRecPathSosBoth));

    if (TableVoxelToReceiverPathSosAllocationCount > 1)
    {                                                                                          // TODO: mit Arrays flexibel programmieren, wenn moeglich!!!
        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[1] = cudaAddressModeClamp;
        texTableVoxelToReceiverPathSosBoth1_preprocess.addressMode[2] = cudaAddressModeClamp;
        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
        {
        case 0:
            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModePoint;
            break;
        case 1:
            texTableVoxelToReceiverPathSosBoth1_preprocess.filterMode = cudaFilterModeLinear;
            break;
        }
        texTableVoxelToReceiverPathSosBoth1_preprocess.normalized = 0;

        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth1_preprocess, deviceTableVoxelToRecPathSosBothCuArray[1], &texChannelDescTableVoxelToEmRecPathSosBoth));
    }

    if (TableVoxelToReceiverPathSosAllocationCount > 2)
    {
        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[1] = cudaAddressModeClamp;
        texTableVoxelToReceiverPathSosBoth2_preprocess.addressMode[2] = cudaAddressModeClamp;
        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
        {
        case 0:
            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModePoint;
            break;
        case 1:
            texTableVoxelToReceiverPathSosBoth2_preprocess.filterMode = cudaFilterModeLinear;
            break;
        }
        texTableVoxelToReceiverPathSosBoth2_preprocess.normalized = 0;

        CUDA_CHECK(cudaBindTextureToArray(&texTableVoxelToReceiverPathSosBoth2_preprocess, deviceTableVoxelToRecPathSosBothCuArray[2], &texChannelDescTableVoxelToEmRecPathSosBoth));
    }

    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1, 1);
    dim3 blocksPerGrid(1, 1, 1);
    blocksPerGrid.x = SOSGrid_XYZ.y;
    blocksPerGrid.y = maxFeasibleSosZLayerCount;
    blocksPerGrid.z = 1;

    // Step 2. Bereite Output-Textur fuer AscanIndex vor

    if (TableAscanIndexAllocationCount > 0)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
    }
    if (TableAscanIndexAllocationCount > 1)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
    }
    if (TableAscanIndexAllocationCount > 2)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
    }
    if (TableAscanIndexAllocationCount > 3)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
    }

    // Step 3. Fuehre Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden
    // immer 1024/2048 A-Scans durchlaufen und in AscanIndex-Textur geschrieben

    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
    {  // ====================================================
       // Blockmode with SOS-value per Ascan

        precalculateAscanIndex_usePathsKernel<<<blocksPerGrid, threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
                                                                                  aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
                                                                                                    // vorberechnet werden können
                                                                                  deviceSosAttFieldCuArray,
                                                                                  currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
                                                                                                              ///< grid the pre-calculation is performed
                                                                                                              ///< for.
                                                                                  maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
                                                                                                              ///< sound grid the pre-calculation is
                                                                                                              ///< performed for.
                                                                                  // currentEmIndexUsedForAscanIndexCalculation,	///< current Index of Em
                                                                                  // for which the AscanIndex is calculated

                                                                                  maxSoSReceiverArrayForTexture,

                                                                                  deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
                                                                                  deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten

                                                                                  TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
                                                                                                                   ///< der Groesse 2048/4096
                                                                                  maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
                                                                                                                   ///< CUDA Array (fest definiert fuer
                                                                                                                   ///< bestimmung welche Textur genutzt
                                                                                                                   ///< wird)

                                                                                  deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
                                                                                                                        ///< the path from transducer to
                                                                                                                        ///< voxel.

                                                                                  SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
                                                                                  deviceSAFT_VARIANT);
    }
    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
    {  // ====================================================
       // 3DVolume Mode with SOS-Correction no ATT-Correction

        precalculateAscanIndex_usePathsKernel_SOS<<<blocksPerGrid,
                                                    threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
                                                                       aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
                                                                                         // vorberechnet werden können
                                                                       deviceSosAttFieldCuArray,
                                                                       currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
                                                                                                   ///< grid the pre-calculation is performed
                                                                                                   ///< for.
                                                                       maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
                                                                                                   ///< sound grid the pre-calculation is
                                                                                                   ///< performed for.
                                                                       // currentEmIndexUsedForAscanIndexCalculation,		///< current
                                                                       // Index of Em for which the AscanIndex is calculated

                                                                       maxSoSReceiverArrayForTexture,

                                                                       deviceEmitterIndex_block,   // Speicheradresse fuer EmitterIndexdaten
                                                                       deviceReceiverIndex_block,  // Speicheradresse fuer ReceiverIndexdaten

                                                                       TableAscanIndexAllocationCount,  ///< Anzahl der benoetigten AscanBlocks
                                                                                                        ///< der Groesse 2048/4096
                                                                       maxAscanIndexArraysInTexture,    ///< maximale Anzahl an Em/Rec in einem
                                                                                                        ///< CUDA Array (fest definiert fuer
                                                                                                        ///< bestimmung welche Textur genutzt
                                                                                                        ///< wird)

                                                                       deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
                                                                                                             ///< the path from transducer to
                                                                                                             ///< voxel.

                                                                       SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
                                                                       deviceSAFT_VARIANT);
    }
    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
    {  // ====================================================
       // 3DVolume Mode with SOS- and ATT-Correction

        precalculateAscanIndex_usePathsKernel_SOS_ATT<<<blocksPerGrid,
                                                        threadsPerBlock>>>(ascanIndex_i,     ///< Offset of AscanIndex batch (bei mehreren Aufrufen)
                                                                           aScanWindowSize,  // aktuelle Anzahl der Ascans, die maximal
                                                                                             // vorberechnet werden können
                                                                           deviceSosAttFieldCuArray,
                                                                           currentSpeedOfSoundZLayer,  ///< First z-layer in the speed of sound
                                                                                                       ///< grid the pre-calculation is performed
                                                                                                       ///< for.
                                                                           maxFeasibleSosZLayerCount,  ///< Number of z-layers in the speed of
                                                                                                       ///< sound grid the pre-calculation is
                                                                                                       ///< performed for.
                                                                           // currentEmIndexUsedForAscanIndexCalculation,		///< current
                                                                           // Index of Em for which the AscanIndex is calculated
                                                                           maxSoSReceiverArrayForTexture,
                                                                           deviceEmitterIndex_block,             // Speicheradresse fuer EmitterIndexdaten
                                                                           deviceReceiverIndex_block,            // Speicheradresse fuer ReceiverIndexdaten
                                                                           TableAscanIndexAllocationCount,       ///< Anzahl der benoetigten AscanBlocks
                                                                                                                 ///< der Groesse 2048/4096
                                                                           maxAscanIndexArraysInTexture,         ///< maximale Anzahl an Em/Rec in einem
                                                                                                                 ///< CUDA Array (fest definiert fuer
                                                                                                                 ///< bestimmung welche Textur genutzt
                                                                                                                 ///< wird)
                                                                           deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS samples in
                                                                                                                 ///< the path from transducer to
                                                                                                                 ///< voxel.
                                                                           SOSGrid_XYZ, sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter,
                                                                           deviceSAFT_VARIANT);
    }

    CUDA_CHECK(cudaGetLastError());

    // ==================================================== cudaUnbindTexture
    // Texturmemory fuer Emitter - SosPathsTables entbinden
    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToEmitterPathSosBoth_preprocess));
    // Texturmemory fuer Receiver - SosPathsTables entbinden
    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth0_preprocess));
    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth1_preprocess));
    CUDA_CHECK(cudaUnbindTexture(&texTableVoxelToReceiverPathSosBoth2_preprocess));
}

void SAFTHandler::precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceListGeometry, int geometryElementCount)
{
    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
                         1);  // max. 512 oder 1024	Threads werden vorgegeben und
    // dim3 threadsPerBlock (SOSGrid_XYZ.x,SOSGrid_XYZ.y,1);	// max. 512 oder
    // 1024	Threads werden vorgegeben und
    dim3 blocksPerGrid(1, 1, 1);  // max. 65.535			Bloecke im Grid
                                  // berechnet.	Initialisierung
    blocksPerGrid.x = SOSGrid_XYZ.y;
    blocksPerGrid.y = sosZLayerCount;
    blocksPerGrid.z = 1;

    cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0,
                                                                            cudaChannelFormatKindFloat);  // Schritt 2.1 Output-Kanal
                                                                                                          // anlegen und beschreiben

    texRefSosAttField.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
    texRefSosAttField.addressMode[1] = cudaAddressModeClamp;
    texRefSosAttField.addressMode[2] = cudaAddressModeClamp;

    if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1)
    {
        texRefSosAttField.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
    }
    else
    {
        texRefSosAttField.filterMode = cudaFilterModePoint;  // Nearest Neighbor
    }
    texRefSosAttField.normalized = 0;

    CUDA_CHECK(cudaBindTextureToArray(&texRefSosAttField, deviceSosAttFieldCuArray,
                                      &texChannelDescSosAttField));  // Schritt 4.1 3DArray an Texturmemory
                                                                     // binden

    if (deviceListGeometry == 0)
    {
        cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray);
    }

    if (deviceListGeometry == 1)
    {
        if (TableVoxelToReceiverPathSosAllocationCount > 0)
        {
            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]);
        }
        if (TableVoxelToReceiverPathSosAllocationCount > 1)
        {
            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]);
        }
        if (TableVoxelToReceiverPathSosAllocationCount > 2)
        {
            cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]);
        }
    }

    precalculateAverageSpeedOfSoundKernel<<<blocksPerGrid, threadsPerBlock>>>(firstZLayer, sosZLayerCount, deviceListGeometry, geometryElementCount,
                                                                              maxSoSReceiverArrayForTexture,
                                                                            sosOffset, regionOfInterestOffset, IMAGE_RESOLUTION, SOS_RESOLUTION, debugMode, debugModeParameter);
    CUDA_CHECK(cudaGetLastError());

    CUDA_CHECK(cudaUnbindTexture(&texRefSosAttField));
}

void SAFTHandler::fillCuArray(float useValue,
                              cudaArray **deviceTextureAscanIndexFloatCuArray,  ///< CuArray to fill
                              int TableAscanIndexAllocationCount)
{
    dim3 threadsPerBlock(SOSGrid_XYZ.x, 1,
                         1);  // determine neccessary amount of threads
                              // // max. 512 oder 1024
    dim3 blocksPerGrid(1, 1,
                       1);  // determine neccessary amount of blocks in grid	// max. 65.535
    blocksPerGrid.x = SOSGrid_XYZ.y;
    blocksPerGrid.y = maxFeasibleSosZLayerCount;
    blocksPerGrid.z = 1;

    // Step 1. Bereite Output-Textur fuer AscanIndex vor
    if (TableAscanIndexAllocationCount > 0)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat0, deviceTextureAscanIndexFloatCuArray[0]);
    }
    if (TableAscanIndexAllocationCount > 1)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat1, deviceTextureAscanIndexFloatCuArray[1]);
    }
    if (TableAscanIndexAllocationCount > 2)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat2, deviceTextureAscanIndexFloatCuArray[2]);
    }
    if (TableAscanIndexAllocationCount > 3)
    {
        cudaBindSurfaceToArray(outSurfRefAscanIndexFloat3, deviceTextureAscanIndexFloatCuArray[3]);
    }

    // Step 2. Fuere Kernel aus mit #Threads: SOS.x*SOS.y . Innerhalb werden immer
    // 1024/2048 A-Scans durchgegangen und in AscanIndex-Textur geschrieben
    fillCuArrayKernel<<<blocksPerGrid, threadsPerBlock>>>(useValue,
                                                          deviceTextureAscanIndexFloatCuArray,  ///< Out: Sum of SoS
                                                                                                ///< samples in the path
                                                                                                ///< from transducer to
                                                                                                ///< voxel.
                                                          maxAscanIndexArraysInTexture,
                                                          TableAscanIndexAllocationCount,  ///< Amount of Surfaces in
                                                                                           ///< the Array of cuArrays
                                                          maxFeasibleSosZLayerCount, ATTMode_3DVolume, debugMode, debugModeParameter);

    CUDA_CHECK(cudaGetLastError());
}

void SAFTHandler::performSAFT(
    int aScanIndex,              ///< The A-scan index is increased by the A-scan batch size in every iteration. It describes the offset into the A-scan samples the SAFT kernel is operating with.
    size_t aScanWindowSize,      ///< A-scan batch size in terms of number of samples within one window.
    int3 IMAGE_SIZE_XYZ,         ///< Bildbereichsgroesse/ROI in Voxel
    int3 SOSGrid_XYZ,            ///< SoSGridgroesse in Voxel
    int blockIndexOffset,        ///< Additional offset added to the z component of the block index, required because of the adjustments for partial reconstruction in different z-layers.
    int outputWindowVoxelCount,  ///< Number of Voxels in the output window.
    int speedOfSoundZLayer,      ///< current SoS z-layer Offset in the speed of sound grid.
    int speedOfSoundVoxelsWithinZLayers,  ///< Number of z-layers in the speed of sound grid touched by the z-layers of the active zone of reconstruction in the region of interest.
    int maxFeasibleSosZLayerCount,
    int currentEmIndexUsedForAscanIndexCalculation,  ///< current Index of Em for which the AscanIndex is calculated
    dim3 const &windowGridDimensions,  ///< Grid dimensions to be used to launch the SAFT kernel. It is smaller than the full grid dimensions and only represents the current reconstruction window.
    dim3 const &gridDimensions,        ///< Full grid dimensions of the reconstruction.
    dim3 const &blockDimensions,       ///< Block dimensions to be used with the SAFT kernel.
    float *deviceSpeedOfSoundField,    ///< Pointer to SoSGrid.
    cudaArray *deviceAScansCuArray
    //    cudaStream_t stream ///< Stream to execute the SAFT kernel on.
)
{

    dim3 reducedGridDimensions, reducedBlockDimensions;

    reduceKernelDimensions(windowGridDimensions, blockDimensions, reducedGridDimensions, reducedBlockDimensions);

    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS_ATT, cudaFuncCachePreferL1));
    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex_SOS, cudaFuncCachePreferL1));
    CUDA_CHECK(cudaFuncSetCacheConfig(saftKernelAscanIndex, cudaFuncCachePreferL1));

    //	Texture Memory Adressing-mode 			// http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf  ->  3.2.11.1. Texture Memory S. 42
    // cudaAddressModeClamp 	- Return values at the boarders if out-of range - default
    // cudaAddressModeBorder	- Return 0 if out-of range
    // cudaAddressModeMirror	- Mirror the values    		- For normalized coordinates
    // cudaAddressModeWrap      - Repeating the values    	- For normalized coordinates

    // Texturmemory fuer Ascans
    cudaChannelFormatDesc texChannelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);  // Beschreibung des RueckgabeFormats der Textur
    texRefAscans.addressMode[0] = cudaAddressModeBorder;                                                    // Texturreferenz beschreiben
    texRefAscans.addressMode[1] = cudaAddressModeBorder;

    if (SAFT_VARIANT[SAFT_VARIANT_AscanInterpolation] == 1)
    {
        texRefAscans.filterMode = cudaFilterModeLinear;  // Lineare Interpolation
    }
    else
    {
        texRefAscans.filterMode = cudaFilterModePoint;  // Nearest Neighbor
    }

    texRefAscans.normalized = 0;
    CUDA_CHECK(cudaBindTextureToArray(&texRefAscans, deviceAScansCuArray, &texChannelDesc));

    if (ATTMode_3DVolume == false)
    {  // ========= 3DVolume Mode without ATT-Correction

        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float>();  // Should do the same

        // AscanIndex Path Tables ------------------------------------------------------
        texTableAscanIndexFloat1_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
        texTableAscanIndexFloat1_0.addressMode[1] = cudaAddressModeClamp;
        texTableAscanIndexFloat1_0.addressMode[2] = cudaAddressModeClamp;
        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
        {
        case 0:
            texTableAscanIndexFloat1_0.filterMode = cudaFilterModePoint;
            break;
        case 1:
            texTableAscanIndexFloat1_0.filterMode = cudaFilterModeLinear;
            break;
        }
        texTableAscanIndexFloat1_0.normalized = 0;

        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));

        if (TableAscanIndexAllocationCount > 1)
        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
            texTableAscanIndexFloat1_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat1_1.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat1_1.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat1_1.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat1_1.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat1_1.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
        }

        if (TableAscanIndexAllocationCount > 2)
        {
            texTableAscanIndexFloat1_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat1_2.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat1_2.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat1_2.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat1_2.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat1_2.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
        }

        if (TableAscanIndexAllocationCount > 3)
        {
            texTableAscanIndexFloat1_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat1_3.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat1_3.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat1_3.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat1_3.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat1_3.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat1_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
        }
    }
    else if (ATTMode_3DVolume == true)
    {  // ========= 3DVolume Mode with ATT-Correction

        cudaChannelFormatDesc texChannelDescTableAscanIndexFloat = cudaCreateChannelDesc<float2>();  // Should do the same

        // AscanIndex Path Tables ------------------------------------------------------
        texTableAscanIndexFloat2_0.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
        texTableAscanIndexFloat2_0.addressMode[1] = cudaAddressModeClamp;
        texTableAscanIndexFloat2_0.addressMode[2] = cudaAddressModeClamp;
        switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
        {
        case 0:
            texTableAscanIndexFloat2_0.filterMode = cudaFilterModePoint;
            break;
        case 1:
            texTableAscanIndexFloat2_0.filterMode = cudaFilterModeLinear;
            break;
        }
        texTableAscanIndexFloat2_0.normalized = 0;

        CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_0, deviceTextureAscanIndexFloatCuArray[0], &texChannelDescTableAscanIndexFloat));

        if (TableAscanIndexAllocationCount > 1)
        {                                                                      // TODO: mit Arrays flexibel programmieren!!!
            texTableAscanIndexFloat2_1.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat2_1.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat2_1.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat2_1.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat2_1.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat2_1.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_1, deviceTextureAscanIndexFloatCuArray[1], &texChannelDescTableAscanIndexFloat));
        }

        if (TableAscanIndexAllocationCount > 2)
        {
            texTableAscanIndexFloat2_2.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat2_2.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat2_2.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat2_2.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat2_2.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat2_2.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_2, deviceTextureAscanIndexFloatCuArray[2], &texChannelDescTableAscanIndexFloat));
        }

        if (TableAscanIndexAllocationCount > 3)
        {
            texTableAscanIndexFloat2_3.addressMode[0] = cudaAddressModeClamp;  // Texturreferenz beschreiben
            texTableAscanIndexFloat2_3.addressMode[1] = cudaAddressModeClamp;
            texTableAscanIndexFloat2_3.addressMode[2] = cudaAddressModeClamp;
            switch (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtReconstruction])
            {
            case 0:
                texTableAscanIndexFloat2_3.filterMode = cudaFilterModePoint;
                break;
            case 1:
                texTableAscanIndexFloat2_3.filterMode = cudaFilterModeLinear;
                break;
            }
            texTableAscanIndexFloat2_3.normalized = 0;

            CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
        }
    }
    // Vorberechnung der Koordinaten --> schnelleres Bestimmen der Voxelposition
    float VoxelIncrement = IMAGE_RESOLUTION / SOS_RESOLUTION;
    float3 SosVoxelStartPosition;
    SosVoxelStartPosition.x = (regionOfInterestOffset.x - sosOffset.x) / SOS_RESOLUTION;  // Start des Bildes im SOS-Grid aus Positionsdaten bestimmen
    SosVoxelStartPosition.y = (regionOfInterestOffset.y - sosOffset.y) / SOS_RESOLUTION;
    SosVoxelStartPosition.z = (regionOfInterestOffset.z - sosOffset.z) / SOS_RESOLUTION;
    // printf("\n\n SosVoxelStartPosition [%f %f %f]\n",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);

    // Anzahl der Teiltabellen, bereits hier berechnen oder übergeben
    int TableAscanIndexAllocationCount = ceil((float)aScanWindowSize / (float)maxAscanIndexArraysInTexture);  // float is important due to ceiling
    // printf( "TableAscanIndexAllocationCount = (%i/%i) = %i = %i\n", aScanWindowSize, maxAscanIndexArraysInTexture, TableAscanIndexAllocationCount,
    // ceil(aScanWindowSize/maxAscanIndexArraysInTexture));

    // Call of 3 SAFT Versions - AscanIndex-Varainten
    // #####################################################################################################################################################################
    // ==================================================== Blockmode with SOS-value per Ascan
    // ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
    // ==================================================== 3DVolume Mode with SOS- and ATT-Correction

    if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
    {  // ==================================================== Blockmode with SOS-value per Ascan
        printf("\n\n --- SAFT (AscanIndex) without SOS currently not implemented --- \n");
    }
    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
    {
		// ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
        saftKernelAscanIndex_SOS<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(

            aScanIndex,

            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement, blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions, deviceOutput);
    }
    else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
    {  // ==================================================== 3DVolume Mode with SOS- and ATT-Correction

        saftKernelAscanIndex_SOS_ATT<<<reducedGridDimensions, reducedBlockDimensions>>>(  // , 0, stream>>>(
            aScanIndex,
            (float)aScanWindowSize,          // maxAscanIndexArraysInTexture
            maxAscanIndexArraysInTexture,    //  maxSoSReceiverArrayForTexture
            TableAscanIndexAllocationCount,  // Anzahl der genutzten Teiltabellen
            IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement,
            blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions,
            debugMode, debugModeParameter, deviceSAFT_VARIANT,
            deviceOutput

        );
    }

    // Unbind Textures
    CUDA_CHECK(cudaUnbindTexture(&texRefAscans));

    if (ATTMode_3DVolume == false)
    {  // ========= 3DVolume Mode without ATT-Correction
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_0));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_1));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_2));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat1_3));
    }
    else if (ATTMode_3DVolume == true)
    {  // ========= 3DVolume Mode with ATT-Correction
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_0));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_1));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_2));
        CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_3));
    }
    CUDA_CHECK(cudaGetLastError());
}