Commit source

This commit is contained in:
kradchen
2023-05-18 16:04:27 +08:00
parent 88cf81e4ea
commit c6cd188732
83 changed files with 39921 additions and 0 deletions

20
SAFT_ATT/CMakeLists.txt Normal file
View File

@@ -0,0 +1,20 @@
# Build script for the SaftATT shared library: the MEX-style entry point
# (SAFT_ATT.cpp), the host-side SAFT code and the CUDA kernels in saft.cu.
cmake_minimum_required(VERSION 3.8 FATAL_ERROR)
project(SaftATT)

# Default CUDA compiler location. Only applied when the caller has not already
# chosen one (e.g. -DCMAKE_CUDA_COMPILER=/opt/cuda/bin/nvcc), so the historic
# default is kept but no longer overrides an explicit choice.
if(NOT DEFINED CMAKE_CUDA_COMPILER)
  set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
endif()
enable_language(CUDA)

# Default MATLAB installation used as the search hint for FindMatlab; can be
# overridden with -DMatlab_ROOT_DIR=<path>.
if(NOT DEFINED Matlab_ROOT_DIR)
  set(Matlab_ROOT_DIR /usr/local/Polyspace/R2019b)
endif()
# REQUIRED: without it a missing MATLAB installation is not reported at
# configure time and Matlab_MEX_LIBRARY / Matlab_MX_LIBRARY below would expand
# to nothing usable, deferring the failure to the link step.
find_package(Matlab REQUIRED)

add_library(SaftATT SHARED
  ./src/SAFT_ATT.cpp
  ./src/saft.cu
  ./src/processAScans.cpp
  ./src/saft.cpp
)

# MATLAB headers come from the FindMatlab result instead of a second hard-coded
# copy of the installation path, so they always match the libraries linked below.
target_include_directories(SaftATT PRIVATE
  ./src
  /usr/local/cuda/include
  ${Matlab_INCLUDE_DIRS}
)

set_target_properties(SaftATT PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

# Flags passed to nvcc only (the generator expression filters on the CUDA
# language). -arch/-code pin the build to compute capability 3.0.
target_compile_options(SaftATT PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
  -O3
  --compiler-options -fPIC
  --use_fast_math
  --ptxas-options=-v
  -arch compute_30 -code compute_30,sm_30
  >)

# NOTE(review): CUDA_RUNTIME_LIBRARY is not set anywhere in this file; unless it
# is provided from the cache or a parent project it expands to nothing and the
# CUDA runtime is linked only through CMake's CUDA language support - confirm.
target_link_libraries(SaftATT PRIVATE
  ${CUDA_RUNTIME_LIBRARY}
  ${Matlab_MEX_LIBRARY}
  ${Matlab_MX_LIBRARY}
)

set_target_properties(SaftATT PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_LIST_DIR}/src/SAFT_ATT.h)

2325
SAFT_ATT/src/SAFT_ATT.cpp Normal file

File diff suppressed because it is too large Load Diff

8
SAFT_ATT/src/SAFT_ATT.h Normal file
View File

@@ -0,0 +1,8 @@
/**
 * Public header of the SaftATT library.
 *
 * Declares the library entry point with C linkage. The linkage specification
 * is wrapped in __cplusplus guards so the header can be included from both
 * C and C++ translation units.
 */
#ifndef __SAFT_ATT_H__
#define __SAFT_ATT_H__
#include <mex.h>

#ifdef __cplusplus
extern "C"{
#endif

/**
 * Entry point of the SAFT reconstruction.
 *
 * The parameter list has the same shape as MATLAB's mexFunction gateway;
 * the meanings below are presumed from that convention - confirm against
 * SAFT_ATT.cpp.
 *
 * @param nlhs  number of expected output arrays
 * @param plhs  array of pointers to the output arrays
 * @param nrhs  number of input arrays
 * @param prhs  array of pointers to the (read-only) input arrays
 */
void SAFT_ATT(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]);

#ifdef __cplusplus
}
#endif
#endif // __SAFT_ATT_H__

View File

@@ -0,0 +1,76 @@
//// printf() is only supported
//// for devices of compute capability 2.0 and above
//#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
// #define printf(f, ...) ((void)(f, __VA_ARGS__),0)
//#endif
//
///**
// This kernel is responsible for determining valid combinations of emitters and receivers, based on the angle between their transducer vectors.
// Reference: Thesis 3.3
// - Dieser Kernel ist verantwortlich dafür, gültige Kombinationen von Emitter und Receiver zu bestimmen, basierend auf dem Winkel zwischen den TransducerVektoren.
// - Reference: Thesis 3.3
//*/
//__global__ void analyseTransducerVectorsKernel(
// bool * validEmitterReceiverCombinations, ///< Boolean array in which valid combinations of emitters and receivers are held (true for valid ones, false for invalid ones). The data are stored in the following order, arranged from the fastest moving index to the slowest one: receivers, emitters.
// int * analysisOfTransducerVectorsDistributionCounters ///< This is a pointer to an int [2] in which the number of valid and invalid combinations is stored, for later analysis.
// )
//{
// int
// threadFieldOffset,
// threadFieldCount;
// getWorkLoad(emitterReceiverCombinations, threadFieldOffset, threadFieldCount);
//
// int
// emitterIndex = threadFieldOffset / receiverCount,
// receiverIndex = threadFieldOffset % receiverCount;
//
// float3 currentEmitterVector = getEmitterTransducerVector(emitterIndex);
//
// int
// invalidCombinationCount = 0,
// validCombinationCount = 0;
//
// for(int i = threadFieldOffset, limit = threadFieldOffset + threadFieldCount; i < limit; i++)
// {
// float3 currentReceiverVector = getReceiverTransducerVector(receiverIndex);
// float angle = determineAngle(currentEmitterVector, currentReceiverVector);
// bool isValidCombination = angle <= maximumAngleBetweenEmitterAndReceiverTransducerVectors;
//
// //printf( "Kernelaufruf~~~");
// //printf( "[th %d bl %d] i:%i - isValidCombination: %i\n", threadIdx.x, blockIdx.x, i, isValidCombination);
//
// validEmitterReceiverCombinations[i] = isValidCombination;
// if(normalisePerformanceStatistics)
// {
// if(isValidCombination)
// validCombinationCount++;
// else
// invalidCombinationCount++;
// }
// receiverIndex++;
// if(receiverIndex == receiverCount)
// {
// receiverIndex = 0;
// emitterIndex++;
// currentEmitterVector = getEmitterTransducerVector(emitterIndex);
// }
// }
// if(normalisePerformanceStatistics)
// {
// atomicAdd(analysisOfTransducerVectorsDistributionCounters + 0, invalidCombinationCount);
// atomicAdd(analysisOfTransducerVectorsDistributionCounters + 1, validCombinationCount);
// }
//}
//
///**
// Proxy function to launch the actual kernel that determines the valid combinations of emitters and receivers.
// - Proxyfunktion um den aktuellen Kernel aufzurufen, der die gültigen Kombinationen von Emitter und Receiver bestimmt
//*/
//void SAFTHandler::analyseTransducerVectors(
// dim3 gridDimensions, ///< Grid dimensions to be used by the kernel.
// dim3 blockDimensions ///< Block dimensions to be used by the kernel.
// )
//{
// analyseTransducerVectorsKernel<<<gridDimensions, blockDimensions>>>(deviceValidEmitterReceiverCombinations, deviceTransducerVectorAnalysisDistributionCounters);
// CUDA_CHECK(cudaGetLastError());
//}

View File

@@ -0,0 +1,54 @@
/*!
Emitter and receiver geometry held in constant memory, available across all functions in saft.cu because all of it is held in the same compilation unit.

Everything below is compiled only when SaftUseConstantMemforGeometry is defined.
Exactly one of three variants is expected to be active; each declares a pair of
__constant__ float3 arrays (emitter / receiver positions) of
MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY elements plus two file-scope pointers
(constEmitterPtr / constReceiverPtr, declared without a CUDA memory-space
qualifier) initialised with the address of the first array element:
 - SaftCalcSoSInKernel              -> emitterPOSsosInKernel / receiverPOSsosInKernel
 - otherwise SaftUseArithmeticMean  -> emitterPOSarith / receiverPOSarith
 - otherwise SaftUseHarmonicMean    -> emitterPOSharmon / receiverPOSharmon

NOTE(review): the arithmetic and harmonic branches are independent #ifdef blocks,
not #if/#elif. If both macros are defined, constEmitterPtr and constReceiverPtr
are defined twice; if SaftCalcSoSInKernel is undefined and neither mean macro is
defined, they are not defined at all.
NOTE(review): presumably the pointers are consumed by host code to upload the
geometry - confirm how they are used, since they hold the address of a
__constant__ symbol taken at file scope.
*/
#include "saft.hpp"
#ifdef SaftUseConstantMemforGeometry
#ifdef SaftCalcSoSInKernel
// Variant 1: speed of sound is calculated inside the kernel.
__constant__ float3 emitterPOSsosInKernel[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
__constant__ float3 receiverPOSsosInKernel[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
// Earlier fixed-size declarations, kept for reference:
//__constant__ float3 emitterPOSsosInKernel[157 * 4];
//__constant__ float3 receiverPOSsosInKernel[157 * 9];
float3* constEmitterPtr = &emitterPOSsosInKernel[0];
float3* constReceiverPtr = &receiverPOSsosInKernel[0];
#else
#ifdef SaftUseArithmeticMean // Separate symbol names are needed because of the duplicate declaration :-(
// Variant 2: arithmetic-mean build.
__constant__ float3 emitterPOSarith[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
__constant__ float3 receiverPOSarith[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
// Earlier fixed-size declarations, kept for reference:
//__constant__ float3 emitterPOSarith[157 * 4];
//__constant__ float3 receiverPOSarith[157 * 9];
float3* constEmitterPtr = &emitterPOSarith[0];
float3* constReceiverPtr = &receiverPOSarith[0];
#endif
#ifdef SaftUseHarmonicMean
// Variant 3: harmonic-mean build.
__constant__ float3 emitterPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
__constant__ float3 receiverPOSharmon[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
// Earlier fixed-size declarations, kept for reference:
//__constant__ float3 emitterPOSharmon[157 * 4];
//__constant__ float3 receiverPOSharmon[157 * 9];
float3* constEmitterPtr = &emitterPOSharmon[0];
float3* constReceiverPtr = &receiverPOSharmon[0];
#endif
#endif
// Look-up tables mapping an entry of the geometry list to its memory position.
// They are declared for every variant above; the pointers below are defined the
// same way as constEmitterPtr / constReceiverPtr.
__constant__ unsigned short lookUpGeometryMemoryListEmitter [MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
__constant__ unsigned short lookUpGeometryMemoryListReceiver[MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY];
// Earlier fixed-size declarations, kept for reference:
//__constant__ unsigned short lookUpGeometryMemoryListEmitter [157 * 4];
//__constant__ unsigned short lookUpGeometryMemoryListReceiver[157 * 9];
unsigned short* constLookUpGeometryMemoryListEmitterPtr = &lookUpGeometryMemoryListEmitter[0];
unsigned short* constLookUpGeometryMemoryListReceiverPtr = &lookUpGeometryMemoryListReceiver[0];
#endif

View File

@@ -0,0 +1,684 @@
#include <stdio.h>
#include "saft.hpp"
//#include <mex.h>
// printf() is only supported
// for devices of compute capability 2.0 and above.
// When compiling device code for an older architecture, printf is replaced by
// an expression that still evaluates its arguments and yields 0.
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 200)
#define printf(f, ...) ((void)(f, __VA_ARGS__),0)
#endif
// Coordinates of the single speed-of-sound voxel for which the pre-calculation
// kernel prints its debug output.
// NOTE(review): these are defined only under debug_CudaPrecalculateKernel, but
// the kernel also references them when only debug_OutputSOSPaths is defined -
// confirm they are provided elsewhere (e.g. saft.hpp) for that configuration.
#ifdef debug_CudaPrecalculateKernel
#define DebugSosVoxelX 5
#define DebugSosVoxelY 5
#define DebugSosVoxelZ 5
#endif
// Alternative debug voxel, currently disabled:
// #define DebugSosVoxelX 64
// #define DebugSosVoxelY 64
// #define DebugSosVoxelZ 64
// Surface references the pre-calculation kernel writes its per-voxel path
// tables to (via surf3Dwrite).
// Float1 layout: speed-of-sound sum and voxel count live in separate surfaces.
// Surfaces for the emitter SoS path tables
#ifdef SaftTextureForEmRecSosPathsTablesFloat1
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmitterPathSosSum;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmitterPathCount;
// Surfaces for the receiver SoS path tables. There are three of each (0..2);
// the kernel picks one from lookUpGeometryIndex / maxSoSReceiverArrayForTexture.
//surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSumTest;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum0;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum1;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSum2;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount0;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount1;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathCount2;
#endif
// Float2 / Float4 layout: one combined surface per table; the kernel writes
// the speed-of-sound sum and the voxel count (and, for Float4, the attenuation)
// as a single float2 / float4 element.
#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToEmPathSosBoth;
// Surfaces for the receiver SoS path tables (three, selected as above)
//surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToReceiverPathSosSumTest;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth0;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth1;
surface <void, cudaSurfaceType3D> outSurfRefTableVoxelToRecPathSosBoth2;
#endif
__global__ void precalculateAverageSpeedOfSoundKernel(
#ifndef SaftTextureForBresenhamSosPaths
float const * deviceSpeedOfSoundField, ///< Array of speed of sound samples. Dimensions ordered by speed of indices, commencing with the fastest moving one: 1. x 2. y 3. z
#else
#ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
cudaArray *deviceSpeedOfSoundFieldCuArray, ///< CuArray fuer SOSFieldTextur
#endif
#ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
cudaArray *deviceSosAttFieldCuArray, ///< CuArray fuer SosAttFieldTextur
#endif
#endif
int firstZLayer, ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
int sosZLayerCount, ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
#ifdef SaftUseConstantMemforGeometry
int geometry, ///< emitters=0 or receivers=1.
#else
float3 const * geometry, ///< Vector array describing the positions of emitters or receivers.
#endif
int geometryElementCount, ///< Number of elements in the geometry array.
int maxSoSReceiverArrayForTexture, ///< max amount of elements in the receiver CUDA array.
// VoxelCountType * deviceVoxelCountOutput, ///< fuer Count im Integerformat gedacht fuer Texturmemory.
float * deviceVoxelCountOutputFloat, ///< fuer Count im Floatformat gedacht fuer Texturmemory.
float * speedOfSoundSumOutput, ///< fuer SoS im Floatformat gedacht fuer Texturmemory.
// float3 regionOfInterestOffset,
int3 SOSGrid_XYZ,
float3 sosOffset,
float3 regionOfInterestOffset,
float IMAGE_RESOLUTION,
float SOS_RESOLUTION,
float debugMode,
float debugModeParameter
)
{
dim3 SosVoxel
(
threadIdx.x , // SoS-Voxel X ? Threads fangen an bei 0 an
blockIdx.x , // SoS-Voxel Y
blockIdx.y + firstZLayer // SoS-Voxel Z + Offset
);
#ifdef debug_CudaPrecalculateKernel
//printf(" SosVoxel.x,y,z = [%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z); // Herausfinden welche berechnet werden
if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
{
int threadCountAll = gridDim.z * gridDim.x * blockDim.x; // = Anzahl aller Threads X*Y*Z
int threadIndex = blockDim.x * (blockIdx.y * gridDim.x + blockIdx.x) + threadIdx.x;
printf("==================================================================\n");
printf(" threadCountAll = %i\n", threadCountAll); // Anzahl aller Threads //Brauche ich wahrscheinlich gar nicht.
printf(" threadIndex = %i\n", threadIndex); // Threadindex von aktuellem Kernel
printf(" SosVoxel.x,y,z = [%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z); // In welchem SoS-Voxel befinde ich mich?
printf(" geometryElementCount = %i\n", geometryElementCount); // Wie viele Elemente gibt es in der Emitter/receiverListe?
printf("==================================================================\n");
}
#endif
// if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
// {
// printf(" PrecalculateKernel: debugMode [%i] for geometry[%i]\n", debugMode, geometry);
// }
int voxelCount; // Anzahl der Voxel auf einem SoS-Pfad
float totalSpeed = 0.0; // SoSSumme auf einem SoS-Pfad
float totalAttenuation = 0.0; // AttSumme auf einem Attenuation-Pfad
dim3 SosGeometryVoxel; // SoSVoxel von Emitter/Receiver
float3 SosGeometryVoxelFloat; // SoSVoxel von Emitter/Receiver in Float
float SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Aufluesung im SoS-Grid
//int tableIndex; // Index innerhalb TableVoxelToEmitter/ReceiverPath
// Speicher in Texturformat
// int xmax = SOSGrid_XYZ.x;
// int ymax = SOSGrid_XYZ.y;
// int zmax = sosZLayerCount; //SOSGrid_XYZ.z;
int i_x = SosVoxel.x;
int i_y = SosVoxel.y;
int i_z = (SosVoxel.z-firstZLayer); // float SosVoxelTextureZ = (SosVoxelf.z - speedOfSoundZLayer);
//int Index;
int TexturGeometryIndexZ;
float3 currentGeometry;
for(int geometryIndexCounter = 0; geometryIndexCounter < geometryElementCount; geometryIndexCounter++) // Alle Emitter oder Receiver in der Liste von Matlab durchgehen
{
int lookUpGeometryIndex = 0;
// Lade lookUpGeometryMemoryList-Eintrag, um Position im Memory zu bestimmen
if (geometry == 0) // => Emitter
{
lookUpGeometryIndex = lookUpGeometryMemoryListEmitter[geometryIndexCounter]; // Load from Constant Memory
}
else //if (geometry == 1) => Receiver
{
lookUpGeometryIndex = lookUpGeometryMemoryListReceiver[geometryIndexCounter]; // Load from Constant Memory
}
//if (currentGeometry.x != 255) // currentGeometry.x = 255 ist außerhalb des Wertebereichs und zeigt an, das Geometrie nicht genutzt wird. Darum muss nicht berechnet werden.
if (lookUpGeometryIndex != 65535) // currentGeometry.x = 65535 ist außerhalb des Wertebereichs und zeigt an, das Geometrie nicht genutzt wird. Darum muss nicht berechnet werden.
{
#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
{
printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i): lookUpGeometryIndex(%i)\n", geometry, geometryIndexCounter, lookUpGeometryIndex); // In welche Speicherstelle wird geschrieben
printf(" SOSGrid_XYZ.x,y,z = [%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); // In welchem SoS-Voxel befinde ich mich?
printf(" geometryElementCount = %i\n", geometryElementCount); // Wie viele Elemente gibt es in der Emitter/receiverListe?
printf("-------------------------------------------------------------------\n");
printf(" SosVoxel.x,y,z = [%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z); // In welchem SoS-Voxel befinde ich mich?
printf(" geometryIndexCounter = %i\n", geometryIndexCounter); // Wie viele Elemente gibt es in der Emitter/receiverListe?
printf(" firstZLayer = %i\n", firstZLayer); // zLayer Offset, welcher wird zur Zeit berechnet?
printf(" sosZLayerCount = %i\n", sosZLayerCount); // Anzhal der zu berechnenden zLayer?
printf("-------------------------------------------------------------------\n");
printf(" speedOfSoundSumOutput_Index= %i\n", SOSGrid_XYZ.x*(SOSGrid_XYZ.y*SOSGrid_XYZ.y*geometryIndexCounter+SOSGrid_XYZ.y*(SosVoxel.z-firstZLayer)+SosVoxel.y)+SosVoxel.x); // In welche Speicherstelle wird geschrieben
printf(" totalSpeed = %f\n", totalSpeed); // Berechnete Geschwindigkeit
printf(" write i_x,i_y,i_z = [%i %i %i]\n", i_x, i_y, i_z); // In welchem SoS-Voxel schreibe ich?
printf("==================================================================\n");
}
#endif
// Wenn Emitter/Receiver genutzt werden Koordinaten laden
#ifdef SaftUseConstantMemforGeometry
if (geometry == 0) // => Emitter
{
#ifdef SaftCalcSoSInKernel
currentGeometry = emitterPOSsosInKernel[geometryIndexCounter]; // Positionsdaten von Emitter lesen
#else
#ifdef SaftUseArithmeticMean // Nötig wegen Doppelnennung :-(
currentGeometry = emitterPOSarith[geometryIndexCounter]; // Positionsdaten von Emitter lesen
#endif
#ifdef SaftUseHarmonicMean
currentGeometry = emitterPOSharmon[geometryIndexCounter]; // Positionsdaten von Emitter lesen
#endif
#endif
//syncthreads();
}
else //if (geometry == 1) => Receiver
{
#ifdef SaftCalcSoSInKernel
currentGeometry = receiverPOSsosInKernel[geometryIndexCounter]; // Positionsdaten von Receiver lesen
#else
#ifdef SaftUseArithmeticMean // Nötig wegen Doppelnennung :-(
currentGeometry = receiverPOSarith[geometryIndexCounter]; // Positionsdaten von Receiver lesen
#endif
#ifdef SaftUseHarmonicMean
currentGeometry = receiverPOSharmon[geometryIndexCounter]; // Positionsdaten von Receiver lesen
#endif
#endif
//syncthreads();
}
#endif
#ifndef SaftUseConstantMemforGeometry
currentGeometry = geometry[geometryIndexCounter]; // Positionsdaten von Emitter/Receiver lesen
#endif
// Versuche mit Geometrie d.h. E/R-Kooridinaten um einen halben Voxel zu verschieben ==> muesste eigentlich bei beiden, S/E-Koordinaten und Voxel, gemacht werden
//determineSpeedOfSoundFieldVoxel(currentGeometry , SosGeometryVoxel, SOS_RESOLUTION_FACTOR); // SoSVoxel von Emitter/Receiver bestimmen
// currentGeometry_plushalf.x = currentGeometry.x + IMAGE_RESOLUTION/2; // Emitter/Receiver-Position in SoS-Koordinaten Umwandeln
// currentGeometry_plushalf.y = currentGeometry.y + IMAGE_RESOLUTION/2; // halbe Koordinaten hier nicht benuetigt, da ja die genaue Positionsdaten da sind
// currentGeometry_plushalf.z = currentGeometry.z + IMAGE_RESOLUTION/2;
// // Versuche im Integerformat
// //determineSpeedOfSoundFieldVoxel (currentGeometry_plushalf , SosGeometryVoxel, sosOffset, SOS_RESOLUTION_FACTOR);// SoSVoxel von E/R bestimmen // currentGeometry + 1/2--> SosGeometryVoxel
// determineSpeedOfSoundFieldVoxel (currentGeometry , SosGeometryVoxel, sosOffset, SOS_RESOLUTION_FACTOR); // SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Integer
// // out out voxel1(E/R) voxel2(SoSVoxel) SoSField Size of SoSField
// performRayTracedSpeedAddition(voxelCount, totalSpeed, SosGeometryVoxel, SosVoxel, deviceSpeedOfSoundField, SOSGrid_XYZ); // SosGeometryVoxel im Integerformat, SoSVoxel als Integer
// Bestimmen der SoS-Koordinaten fuer die Sender/Empfuenger-Koordinaten
determineSpeedOfSoundFieldVoxelFloat(currentGeometry, SosGeometryVoxelFloat, sosOffset, SOS_RESOLUTION_FACTOR); // SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Float
//determineSpeedOfSoundFieldVoxelFloat(currentGeometry_plushalf, SosGeometryVoxelFloat, sosOffset, SOS_RESOLUTION_FACTOR); // SoSVoxel von E/R bestimmen // currentGeometry --> SosGeometryVoxel Float
// Nutzen der Bresenham-Floatvariante
// out out voxel1(E/R) voxel2(SoSVoxel) SoSField Size of SoSField , E=0/R=1
//performRayTracedSpeedAdditionFloat(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel, deviceSpeedOfSoundField, SOSGrid_XYZ , geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
// Nutzen der Bresenham-Floatvariante mit Texturmemory und Interpolation
// out out voxel1(E/R) voxel2(SoSVoxel+0.5) SoSField Size of SoSField , E=0/R=1
//performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel, deviceSpeedOfSoundField, SOSGrid_XYZ ,sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
#ifndef SaftTextureForBresenhamSosPaths // SOS-Volume ueber Array oder normal ansprechen?!
performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel, deviceSpeedOfSoundField, SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
#else
//performRayTracedSpeedAdditionTexture(voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
#ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
performRayTracedSpeedAdditionTexture (voxelCount, totalSpeed, SosGeometryVoxelFloat, SosVoxel, deviceSpeedOfSoundFieldCuArray, SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
#endif
#ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
performRayTracedSpeedAdditionTexture (voxelCount, totalSpeed, totalAttenuation, SosGeometryVoxelFloat, SosVoxel, deviceSosAttFieldCuArray, SOSGrid_XYZ, sosOffset, SOS_RESOLUTION, IMAGE_RESOLUTION, regionOfInterestOffset, geometry); // SosGeometryVoxelFloat im Floatformat, SoSVoxel als Integer
#endif
#endif
#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
//if ((SosVoxel.y == DebugSosVoxelY) && ( (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX))){
if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ)){
//printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f]:[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f), Index[Table,Index] = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed, tableIndex, Index); // In welche Speicherstelle wird geschrieben
#ifdef SaftUseSosAttFloat1 // Nutze getrennte Texturen fuer beide Volumen (Sos+Att)
printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f]:[%+3.6f %+3.6f %+3.6f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f)\n\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed); // In welche Speicherstelle wird geschrieben
#endif
#ifdef SaftUseSosAttFloat2 // Nutze nur eine Textur fuer beide Volumen (Sos+Att)
printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f]:[%+3.6f %+3.6f %+3.6f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), SoStotalSpeed(%3.3f), totalAttenuation(%3.3f)\n\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosGeometryVoxelFloat.x, SosGeometryVoxelFloat.y, SosGeometryVoxelFloat.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)voxelCount, totalSpeed, totalAttenuation); // In welche Speicherstelle wird geschrieben
#endif
}
#endif
if (geometry == 0) // Emitter
{
//speedOfSoundSumOutput[Index] = totalSpeed; // Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
//deviceVoxelCountOutputFloat[Index] = (float)voxelCount;
//speedOfSoundSumOutput[Index] = 0.0f;
//deviceVoxelCountOutputFloat[Index] = 0.0f;
TexturGeometryIndexZ = sosZLayerCount * lookUpGeometryIndex + i_z;
#ifdef SaftTextureForEmRecSosPathsTablesFloat1 // Float1
surf3Dwrite( totalSpeed, outSurfRefTableVoxelToEmitterPathSosSum, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToEmitterPathCount, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathCount
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2 // Float2
float2 VoxelValues;
VoxelValues.x = totalSpeed;
VoxelValues.y = (float)voxelCount;
surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToEmPathSosBoth, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // Float4
float4 VoxelValues;
VoxelValues.x = totalSpeed;
VoxelValues.y = (float)voxelCount;
// if (totalAttenuation>debugModeParameter) // Max Border for Attenuation Correction
// VoxelValues.z = debugModeParameter; // Average Attenuation on this Path
// else
VoxelValues.z = totalAttenuation; // Average Attenuation on this Path
VoxelValues.w = 0.0f;
surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToEmPathSosBoth, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
#endif
}
else
{
//speedOfSoundSumOutput[Index] = totalSpeed; // Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
//deviceVoxelCountOutputFloat[Index] = (float)voxelCount;
//speedOfSoundSumOutput[Index] = 0.0f;
//deviceVoxelCountOutputFloat[Index] = 0.0f;
TexturGeometryIndexZ = sosZLayerCount * ((lookUpGeometryIndex) % maxSoSReceiverArrayForTexture) + i_z;
#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
//if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
// printf(">>>> %i >>>> Precalc: geomIdxCounter(%4i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] firstZLayer(%i) ==> TexturNr.[%3i], TexturGeometryIndexZ(%3i), lookUpGeometryIndex(%4i)\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) , TexturGeometryIndexZ, lookUpGeometryIndex); // In welche Speicherstelle wird geschrieben
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat1 // Float1
if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
surf3Dwrite((float)totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum0, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount0, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
surf3Dwrite( totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum1, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount1, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
surf3Dwrite( totalSpeed, outSurfRefTableVoxelToReceiverPathSosSum2, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
surf3Dwrite((float)voxelCount, outSurfRefTableVoxelToReceiverPathCount2, i_x*sizeof(float), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben: TableVoxelToEmitterPathSosSum
}
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2 // Float2
float2 VoxelValues;
VoxelValues.x = totalSpeed;
VoxelValues.y = (float)voxelCount;
if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth0, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth1, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
surf3Dwrite<float2>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth2, i_x*sizeof(float2), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // Float4
float4 VoxelValues;
VoxelValues.x = totalSpeed; // Average SoS on this Path
VoxelValues.y = (float)voxelCount; // Amount of visited voxel
// if (totalAttenuation>debugModeParameter) // Max Border for Attenuation Correction
// VoxelValues.z = debugModeParameter; // Average Attenuation on this Path
// else
VoxelValues.z = totalAttenuation; // Average Attenuation on this Path
VoxelValues.w = 0.0f; // Amount of visited voxel
if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 0){
surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth0, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 1) {
surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth1, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
else if ( (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture) == 2){
surf3Dwrite<float4>(VoxelValues, outSurfRefTableVoxelToRecPathSosBoth2, i_x*sizeof(float4), i_y, TexturGeometryIndexZ ); // Direkt in CUDA Array schreiben
}
#endif
//speedOfSoundSumOutput[Index] = totalSpeed; // Fuellen der TableVoxelToEmitter/ReceiverPathSosSum
//deviceVoxelCountOutput[tableIndex] = typedVoxelCount; // Fuellen der TableVoxelToEmitter/ReceiverPathCount
}
#if defined(debug_CudaPrecalculateKernel) || defined(debug_OutputSOSPaths)
// //printf(" SosVoxel.x,y,z = [%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z); // Herausfinden welche berechnet werden
if ((SosVoxel.x == DebugSosVoxelX) && (SosVoxel.y == DebugSosVoxelY) && (SosVoxel.z == DebugSosVoxelZ))
{
//printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] firstZLayer(%i)\n>>>>>>>>>>>> VoxelCnt(%i), SoSSum(%3.3f), SoSSum_Index = %i\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, (int)typedVoxelCount, totalSpeed, Index); // In welche Speicherstelle wird geschrieben
printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.6f %+3.6f %+3.6f] - SOSVoxel [%3i %3i %3i] firstZLayer(%i)\n>>>>>>>>>>>> surf3Dwrite Textur[%3i %3i %3i], TexturGeometryIndexZ(%3i) = VoxelCnt(%3.6f), SoSSum(%3.6f) = avgSpeed(%3.6f) \n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, firstZLayer, i_x,i_y,i_z, TexturGeometryIndexZ, (float)voxelCount, totalSpeed, (1/(totalSpeed/(float)voxelCount))); // In welche Speicherstelle wird geschrieben
//printf("======%i %i %i============================================================\n", geometry,geometry,geometry);
printf(" SOSGrid_XYZ.x,y,z = [%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z); // In welchem SoS-Voxel befinde ich mich?
printf(" geometryElementCount = %i\n", geometryElementCount); // Wie viele Elemente gibt es in der Emitter/receiverListe?
printf("-------------------------------------------------------------------\n");
printf(" SosVoxel.x,y,z = [%i %i %i]\n", SosVoxel.x, SosVoxel.y, SosVoxel.z); // In welchem SoS-Voxel befinde ich mich?
printf(" firstZLayer = %i\n", firstZLayer); // zLayer Offset, welcher wird zur Zeit berechnet?
printf(" geometryIndexCounter = %i\n", geometryIndexCounter); // Welches Elemente aus der Emitter/receiverListe?
//printf(" TexturGeometryIndexZ = %i\n", TexturGeometryIndexZ); // zLayer Offset, welcher wird zur Zeit berechnet?
printf(" lookUpGeometryIndex = %i => ### %i in [%i] ###\n", lookUpGeometryIndex, TexturGeometryIndexZ, (int)floor((float)lookUpGeometryIndex / (float)maxSoSReceiverArrayForTexture)); // Welcher Index hat Emitter/receiver?
printf(" i_z = (SosVxl.z-firstZLay) = %i\n", i_z); // zLayer Offset, welcher wird zur Zeit berechnet?
//printf("-------------------------------------------------------------------\n");
//printf(" speedOfSoundSumOutput_Index= %i\n", SOSGrid_XYZ.x*(SOSGrid_XYZ.y*SOSGrid_XYZ.y*geometryIndexCounter+SOSGrid_XYZ.y*(SosVoxel.z-firstZLayer)+SosVoxel.y)+SosVoxel.x); // In welche Speicherstelle wird geschrieben
// printf(" totalSpeed = %f\n", totalSpeed); // Berechnete Geschwindigkeit
// printf("==================================================================\n");
}
#endif
// //#ifdef debug_CudaPrecalculateKernel
// //if ((SosVoxel.y == DebugSosVoxelY) && ( (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX) || (SosVoxel.x == DebugSosVoxelX))){
// //printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), 1/SoStotalSpeed(%3.3f), Index[Table,Index] = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (int)typedVoxelCount, totalSpeed, tableIndex, Index); // In welche Speicherstelle wird geschrieben ?
// printf(">>>> %i >>>> Precalc: geometryIndexCounter(%i):[%+3.4f %+3.4f %+3.4f] - SOSVoxel [%3i %3i %3i] \n>>>>>>>>>>>> VoxelCnt(%i), 1/SoStotalSpeed(%3.3f) = [%i %i]\n", geometry, geometryIndexCounter, currentGeometry.x, currentGeometry.y, currentGeometry.z, SosVoxel.x,SosVoxel.y,SosVoxel.z, (float)voxelCount, totalSpeed); // In welche Speicherstelle wird geschrieben ?
// // // Speicher in Texturformat
// // // Indexberechnung für Einsatz des Texturmemorys
// // float xmax = SOSGrid_XYZ.x;
// // float ymax = SOSGrid_XYZ.y;
// // float zmax = (float)maxFeasibleSosZLayerCount;
// // float i_x = SosVoxel.x;
// // float i_y = SosVoxel.y;
// // float i_z = (float)(int)(SosVoxelTextureZ); // float SosVoxelTextureZ = (SosVoxelf.z - speedOfSoundZLayer);
//
// //Index = xmax*(ymax*(zmax*geometryIndexCounter+i_z)+i_y)+i_x; // ohne lookUpGeometryIndex-Liste linear im Speicher liegend
// //Index = xmax*(ymax*(zmax*lookUpGeometryIndex+i_z)+i_y)+i_x; // mit lookUpGeometryIndex-Liste
//
// //printf(">>>> %i >>>> Index = xmax(%i)*(ymax(%i)*(zmax(%i)*geometryIndexCounter(%i)+i_z(%i))+i_y(%i))+i_x(%i) = [%i]\n", geometry, (int)xmax, (int)ymax, (int)zmax, geometryIndexCounter, (SosVoxel.z-firstZLayer), SosVoxel.y, SosVoxel.x,Index); // In welche Speicherstelle wird geschrieben ?
// printf(">>>> %i >>>> Index = xmax(%i)*(ymax(%i)*(zmax(%i)*lookUpGeometryIndex(%i)+i_z(%i))+i_y(%i))+i_x(%i) = [%i]\n", geometry, (int)xmax, (int)ymax, (int)zmax, lookUpGeometryIndex, (SosVoxel.z-firstZLayer), SosVoxel.y, SosVoxel.x,Index); // In welche Speicherstelle wird geschrieben ?
// //}
// //#endif
// Alle berechneten SOS-Voxel ausgeben mit Index
//printf(" SosVoxel.x,y,z = [%i %i %i] => Index (%i)\n", SosVoxel.x, SosVoxel.y, SosVoxel.z, Index); // In welchem SoS-Voxel befinde ich mich?
}
}
}
/**
 Host-side proxy that configures and launches precalculateAverageSpeedOfSoundKernel.

 Steps performed:
  1. Builds the launch configuration: one block per (y, z-layer) pair of the
     speed-of-sound grid, with SOSGrid_XYZ.x threads per block.
  2. If SaftTextureForBresenhamSosPaths is defined, binds the SoS (or combined
     SoS+Att) CUDA array to its texture reference.
  3. If SaftTextureForEmRecSosPathsTables is defined, binds the output surfaces for
     the emitter tables (deviceListGeometry == 0) or receiver tables
     (deviceListGeometry == 1).
  4. Launches the kernel and checks for launch errors.
  5. Unbinds the textures bound in step 2.

 All CUDA runtime calls, including the surface binds, are wrapped in CUDA_CHECK so
 that a failed bind is reported where it happens instead of surfacing later as
 corrupted path tables.

 Example call (taken from the original source comment):
   precalculateAverageSpeedOfSound(
       currentSpeedOfSoundZLayer,
       maxFeasibleSosZLayerCount,
       0,
       emitter_list_Size,
       deviceTableVoxelToEmitterPathCountFloat,
       deviceTableVoxelToEmitterPathSosSum);
*/
void SAFTHandler::precalculateAverageSpeedOfSound
(
    int firstZLayer,                     ///< First z-layer in the speed of sound grid the pre-calculation is performed for.
    int sosZLayerCount,                  ///< Number of z-layers in the speed of sound grid the pre-calculation is performed for.
#ifdef SaftUseConstantMemforGeometry
    int deviceListGeometry,              ///< emitters=0 or receivers=1.
#else
    float3 const * deviceListGeometry,   ///< Vector array describing the positions of emitters or receivers.
#endif
    int geometryElementCount,            ///< Number of elements in the geometry array got from Matlab
    //VoxelCountType * deviceVoxelCountOutput, ///< Out: # of voxels in the path from a transducer element to a voxel.
    float * deviceVoxelCountOutputFloat, ///< Out: # of voxels in the path from a transducer element to a voxel in Float format.
    float * deviceSpeedOfSoundSumOutput  ///< Out: Sum of SoS samples in the path from transducer to voxel.
    // int blocksPerGrid,    ///< Number of blocks per grid to be used to execute the kernel.
    // int threadsPerBlock,  ///< Number of threads per block to be used to execute the kernel.
    // cudaStream_t stream,  ///< Stream to be used for the execution of the kernel.
)
{
#ifdef debug_OutputFunctions
    printf( "==> SAFTHandler::precalculateAverageSpeedOfSound - Start\n");
#endif

    // One thread per x-position of the SoS grid. The hardware limits the number of
    // threads per block (512 or 1024 according to the original note); a larger
    // SOSGrid_XYZ.x makes the launch fail, which the cudaGetLastError() check
    // after the launch reports.
    dim3 threadsPerBlock (SOSGrid_XYZ.x,1,1);
    //dim3 threadsPerBlock (SOSGrid_XYZ.x,SOSGrid_XYZ.y,1);

    // One block per (y, z-layer) pair. The original note mentions a limit of
    // 65,535 blocks per grid dimension.
    dim3 blocksPerGrid (SOSGrid_XYZ.y, sosZLayerCount, 1);

#ifdef debug_CudaPrecalculateKernel
    int sosZLayerVoxelCountToProcess = sosZLayerVoxelCount * sosZLayerCount; // Number of voxels to be processed
    printf("===========================================================================================\n");
    printf(" deviceListGeometry: %i (0=Em/1=Rec)\n", deviceListGeometry);
    printf(" geometryElementCount: %i\n", geometryElementCount);
    printf(" sosZLayerVoxelCountToProcess = sosZLayerVoxelCount(%i) * sosZLayerCount(%i) = %i\n", sosZLayerVoxelCount, sosZLayerCount, sosZLayerVoxelCountToProcess);
    printf(" threadsPerBlock x,y,z: [%i %i %i]\n", threadsPerBlock.x, threadsPerBlock.y, threadsPerBlock.z);
    printf(" blocksPerGrid x,y,z: [%i %i %i]\n", blocksPerGrid.x, blocksPerGrid.y, blocksPerGrid.z);
    printf(" firstZLayer (Start z): %i\n", firstZLayer);
    //printf(" SOSGrid_XYZ x,y,z: [%i %i %i]\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z);
    printf("===========================================================================================\n");
#endif

#ifdef SaftTextureForBresenhamSosPaths
    // Prepare the texture for the speed-of-sound field.
#ifdef SaftUseSosAttFloat1 // Use separate textures for the two volumes (SoS + Att)
    // Describes the return format of the SpeedOfSoundField texture (one 32-bit float channel).
    cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    texRefSpeedOfSoundField.addressMode[0] = cudaAddressModeClamp; // Configure the texture reference
    texRefSpeedOfSoundField.addressMode[1] = cudaAddressModeClamp;
    texRefSpeedOfSoundField.addressMode[2] = cudaAddressModeClamp;
    if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1){
        texRefSpeedOfSoundField.filterMode = cudaFilterModeLinear; // Linear interpolation
    }
    else{
        texRefSpeedOfSoundField.filterMode = cudaFilterModePoint; // Nearest neighbor
    }
    texRefSpeedOfSoundField.normalized = 0;
    // Bind the 3D array to texture memory.
    CUDA_CHECK(cudaBindTextureToArray ( &texRefSpeedOfSoundField, deviceSpeedOfSoundFieldCuArray, &texChannelDescSpeedOfSoundField ));
#endif
#ifdef SaftUseSosAttFloat2 // Use a single texture for both volumes (SoS + Att)
    // Create and describe the output channel (two 32-bit float channels).
    cudaChannelFormatDesc texChannelDescSosAttField = cudaCreateChannelDesc(32, 32, 0, 0, cudaChannelFormatKindFloat);
    texRefSosAttField.addressMode[0] = cudaAddressModeClamp; // Configure the texture reference
    texRefSosAttField.addressMode[1] = cudaAddressModeClamp;
    texRefSosAttField.addressMode[2] = cudaAddressModeClamp;
    if (SAFT_VARIANT[SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing] == 1){
        texRefSosAttField.filterMode = cudaFilterModeLinear; // Linear interpolation
    }
    else{
        texRefSosAttField.filterMode = cudaFilterModePoint; // Nearest neighbor
    }
    texRefSosAttField.normalized = 0;
    // Bind the 3D array to texture memory.
    CUDA_CHECK(cudaBindTextureToArray ( &texRefSosAttField, deviceSosAttFieldCuArray, &texChannelDescSosAttField ));
#endif
#endif

#ifdef SaftTextureForEmRecSosPathsTables
    // Emitter tables: bind the output surfaces the kernel writes to.
    if (deviceListGeometry == 0){
#ifdef SaftTextureForEmRecSosPathsTablesFloat1
        CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmitterPathSosSum, deviceTableVoxelToEmitterPathSosSumCuArray));
        CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmitterPathCount, deviceTableVoxelToEmitterPathCountCuArray));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2
        CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // TODO: rename here once attenuation is included
        CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToEmPathSosBoth, deviceTableVoxelToEmPathSosBothCuArray));
#endif
    }

    // Receiver tables: up to three CUDA arrays may be allocated; bind one surface
    // (or surface pair) per allocated array.
    if (deviceListGeometry == 1){
        //printf( "#################(int)floor((float)geometryElementCount / (float)maxSoSReceiverArrayForTexture) == %i\n", (int)floor((float)geometryElementCount / (float)maxSoSReceiverArrayForTexture));
        //printf( "#################TableVoxelToReceiverPathSosAllocationCount == %i\n", TableVoxelToReceiverPathSosAllocationCount);
        if ( TableVoxelToReceiverPathSosAllocationCount > 0){
#ifdef SaftTextureForEmRecSosPathsTablesFloat1
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[0](%X) deviceTableVoxelToReceiverPathCountCuArray[0](%X)\n", deviceTableVoxelToReceiverPathSosSumCuArray[0], deviceTableVoxelToReceiverPathCountCuArray[0]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum0, deviceTableVoxelToReceiverPathSosSumCuArray[0]));
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount0, deviceTableVoxelToReceiverPathCountCuArray[0]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[0](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[0]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4 // TODO: rename here once attenuation is included
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[0](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[0]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth0, deviceTableVoxelToRecPathSosBothCuArray[0]));
#endif
        }
        if ( TableVoxelToReceiverPathSosAllocationCount > 1) {
#ifdef SaftTextureForEmRecSosPathsTablesFloat1
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[1](%X) deviceTableVoxelToReceiverPathCountCuArray[1](%X)\n", deviceTableVoxelToReceiverPathSosSumCuArray[1], deviceTableVoxelToReceiverPathCountCuArray[1]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum1, deviceTableVoxelToReceiverPathSosSumCuArray[1]));
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount1, deviceTableVoxelToReceiverPathCountCuArray[1]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[1](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[1]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[1](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[1]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth1, deviceTableVoxelToRecPathSosBothCuArray[1]));
#endif
        }
        if ( TableVoxelToReceiverPathSosAllocationCount > 2){
#ifdef SaftTextureForEmRecSosPathsTablesFloat1
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToReceiverPathSosSumCuArray[2](%X) deviceTableVoxelToReceiverPathCountCuArray[2](%X)\n", deviceTableVoxelToReceiverPathSosSumCuArray[2], deviceTableVoxelToReceiverPathCountCuArray[2]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathSosSum2, deviceTableVoxelToReceiverPathSosSumCuArray[2]));
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToReceiverPathCount2, deviceTableVoxelToReceiverPathCountCuArray[2]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat2
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[2](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[2]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]));
#endif
#ifdef SaftTextureForEmRecSosPathsTablesFloat4
#ifdef debug_CudaPrecalculateKernel
            printf( "cudaBindSurfaceToArray: deviceTableVoxelToRecPathSosBothCuArray[2](%X)\n", deviceTableVoxelToRecPathSosBothCuArray[2]);
#endif
            CUDA_CHECK(cudaBindSurfaceToArray(outSurfRefTableVoxelToRecPathSosBoth2, deviceTableVoxelToRecPathSosBothCuArray[2]));
#endif
        }
    }
#endif

    precalculateAverageSpeedOfSoundKernel <<< blocksPerGrid, threadsPerBlock >>>
    (
#ifndef SaftTextureForBresenhamSosPaths
        deviceSpeedOfSoundField,
#else
#ifdef SaftUseSosAttFloat1 // Use separate textures for the two volumes (SoS + Att)
        deviceSpeedOfSoundFieldCuArray,
#endif
#ifdef SaftUseSosAttFloat2 // Use a single texture for both volumes (SoS + Att)
        deviceSosAttFieldCuArray,
#endif
#endif
        firstZLayer,
        sosZLayerCount,
        deviceListGeometry,
        geometryElementCount,
        maxSoSReceiverArrayForTexture, // Maximum number of receivers in one CUDA array
        //deviceVoxelCountOutput,
        deviceVoxelCountOutputFloat,
        deviceSpeedOfSoundSumOutput,
        // regionOfInterestOffset,
        SOSGrid_XYZ,
        sosOffset,
        regionOfInterestOffset,
        IMAGE_RESOLUTION,
        SOS_RESOLUTION,
        debugMode,
        debugModeParameter
    );
    // Kernel launches do not return an error code; fetch launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

#ifdef SaftTextureForBresenhamSosPaths
#ifdef SaftUseSosAttFloat1 // Use separate textures for the two volumes (SoS + Att)
    CUDA_CHECK(cudaUnbindTexture( &texRefSpeedOfSoundField ));
#endif
#ifdef SaftUseSosAttFloat2 // Use a single texture for both volumes (SoS + Att)
    CUDA_CHECK(cudaUnbindTexture( &texRefSosAttField ));
#endif
#endif

#ifdef debug_OutputFunctions
    printf( "<== SAFTHandler::precalculateAverageSpeedOfSound - End\n");
#endif
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

650
SAFT_ATT/src/saft.cpp Normal file
View File

@@ -0,0 +1,650 @@
#include <mex.h>
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>
#include <cmath>
//#include <sys/time.h>
//#include <ail/file.hpp>
//#include <ail/string.hpp>
//#include <ail/time.hpp>
//#include "configuration.hpp"
#include "saft.hpp"
/**
Constructor of the core reconstruction class.
Copies every argument into the member of the same name via the initializer list and
derives the inverse resolutions (IMAGE_RESOLUTION_FACTOR, SOS_RESOLUTION_FACTOR).
No CUDA calls are made here.
*/
SAFTHandler::SAFTHandler(
int deviceId, ///< CUDA ID of the device to be used.
int deviceIndex, ///< Index given by MATLAB (position of this GPU in the list of GPUs).
float *aScan_ptr, ///< Pointer to the A-scan data. //std::string const & aScanSamplesPath, ///< Path to the actual A-scan samples.
double *output_ptr, ///< Pointer to the output data. // std::string const & Path, ///< Path to a file in which the output of the image reconstruction is to be stored.
double *Duration_ptr, ///< Pointer to the value returned to Matlab holding the kernel run time.
unsigned short *receiver_index_ptr, ///< Receiver index array (associates each receiver with its coordinates, see performReconstruction).
unsigned short *emitter_index_ptr, ///< Emitter index array (associates each emitter with its coordinates, see performReconstruction).
float *receiver_list_ptr, ///< Lookup table of receiver coordinates.
int receiver_list_Size,
float *emitter_list_ptr, ///< Lookup table of emitter coordinates.
int emitter_list_Size,
float *speed_vec_ptr, ///< Pointer to the SoS data in block/grid mode.
int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Resolution of the SoS grid.
float *att_vec_ptr, ///< Pointer to the attenuation data in grid mode.
int aScanCount,
int aScanLength,
int3 IMAGE_SIZE_XYZ,
float sampleRate,
float3 regionOfInterestOffset,
float IMAGE_RESOLUTION,
dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
int medianWindowSize, ///< define width of used median filter
float debugMode,
float debugModeParameter,
bool SOSMode_3DVolume,
bool ATTMode_3DVolume,
int SAFT_MODE,
int *SAFT_VARIANT
):
deviceId(deviceId), // Initializer list: initializes the class members with the values passed in.
deviceIndex(deviceIndex),
aScan_ptr(aScan_ptr), //aScanSamplesPath(aScanSamplesPath),
output_ptr(output_ptr), //Path(Path),
Duration_ptr(Duration_ptr),
receiver_index_ptr(receiver_index_ptr), //
emitter_index_ptr(emitter_index_ptr), //
receiver_list_ptr(receiver_list_ptr), //
receiver_list_Size(receiver_list_Size),
emitter_list_ptr(emitter_list_ptr), //
emitter_list_Size(emitter_list_Size),
speed_vec_ptr(speed_vec_ptr), ///< SoS data in block mode or as SoS grid
SOSGrid_XYZ(SOSGrid_XYZ), // Size of the SoS grid
sosOffset(sosOffset), ///< Startpoint of SoSGrid
SOS_RESOLUTION(SOS_RESOLUTION), ///< Resolution of the SoS grid
att_vec_ptr(att_vec_ptr), ///< Attenuation data as ATT grid
aScanCount(aScanCount),
aScanLength(aScanLength),
IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
sampleRate(sampleRate),
regionOfInterestOffset(regionOfInterestOffset),
IMAGE_RESOLUTION(IMAGE_RESOLUTION),
fixedBlockDimensions(fixedBlockDimensions),
medianWindowSize(medianWindowSize),
debugMode(debugMode),
debugModeParameter(debugModeParameter),
SOSMode_3DVolume(SOSMode_3DVolume),
ATTMode_3DVolume(ATTMode_3DVolume),
SAFT_MODE(SAFT_MODE),
SAFT_VARIANT(SAFT_VARIANT)
{
#ifdef debug_OutputFunctions
// printf( "==> SAFTHandler::SAFTHandler - Start\n");
#endif
#ifdef debug_OutputInfo
// printf( "SAFTHandler Constructor\n");
#endif
aScanAllocationCount = 1; // Number of A-scan buffers to allocate; one is enough. Two are only needed if streams are used for copying.
IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Inverse of the output-volume resolution
SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Inverse of the SoS-grid resolution
#ifdef debug_OutputVariables
// printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
// printf( "SOS_RESOLUTION_FACTOR = %e\n", SOS_RESOLUTION_FACTOR);
// printf( "Samplerate = %e\n", sampleRate);
#endif
#ifdef debug_OutputFunctions
// printf( "<== SAFTHandler::SAFTHandler - End\n");
#endif
}
/**
 Top level function of the SAFTHandler class that performs the image reconstruction.

 Steps performed:
  1. Queries the properties of the selected CUDA device, stores them in
     deviceProperties[deviceId], and makes the device current.
  2. Derives the generic SAFT grid dimensions from the fixed block dimensions
     (ceiling division of the image size).
  3. Publishes the Matlab-provided buffers (A-scans, geometry, SoS, attenuation,
     output) under their working member names and computes the buffer sizes.
  4. Runs processAScans() and writes the reported duration into
     Duration_ptr[deviceIndex + 1].
*/
void SAFTHandler::performReconstruction()
{
#ifdef debug_OutputFunctions
    // printf( "==> SAFTHandler::performReconstruction - Start\n");
#endif
#ifdef debug_OutputInfo // Print the name of the device together with its ID
    // printf( "Device ID: %i\n", deviceId);
#endif
#ifdef debug_OutputFunctions
    // printf( "==> loadDevices - Start\n");
#endif
    int deviceCount = 0;
    CUDA_CHECK(cudaGetDeviceCount(&deviceCount));

    // TODO (from original author): restructure; the properties could be read only
    // once, but this is low priority.
    //
    // Query into a local first and only then store it in the vector. The previous
    // code called reserve() and then indexed deviceProperties[deviceId]; reserve()
    // does not change size(), so that access was out of bounds, and the subsequent
    // push_back() of a reference into the same vector stored the entry at an
    // unrelated index. reduceKernelDimensions() reads deviceProperties[deviceId],
    // so that exact slot has to exist and hold the queried properties.
    cudaDeviceProp device;
    CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));
    if (deviceId >= 0 && deviceId < deviceCount)
    {
        if (deviceProperties.size() < static_cast<std::size_t>(deviceCount))
        {
            deviceProperties.resize(static_cast<std::size_t>(deviceCount));
        }
        deviceProperties[deviceId] = device;
    }
    //// printf("%i. %s\n", deviceId, device.name);
    //// printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);
#ifdef debug_OutputInfo
    // printf("%i. %s\n", deviceId, device.name);
    // printf(" Byte Total Global Mem: %lld \n", device.totalGlobalMem);
    // printf(" Compute Capability: %i.%i\n", device.major,device.minor);
    // printf(" Name: %s\n", device.name);
    // printf(" Major revision number: %d\n", device.major);
    // printf(" Minor revision number: %d\n", device.minor);
    // printf(" Total global memory: %lld\n", device.totalGlobalMem);
    // printf(" Total shared memory per block: %u\n", device.sharedMemPerBlock);
    // printf(" Total registers per block: %d\n", device.regsPerBlock);
    // printf(" Warp size: %d\n", device.warpSize);
    // printf(" Maximum memory pitch: %lld\n", device.memPitch);
    // printf(" Maximum threads per block: %d\n", device.maxThreadsPerBlock);
    for (int i = 0; i < 3; ++i) {
        // printf(" Maximum dimension %d of block: %lld\n", i, device.maxThreadsDim[i]);
    }
    for (int i = 0; i < 3; ++i) {
        // printf(" Maximum dimension %d of grid: %lld\n", i, device.maxGridSize[i]);
    }
    // printf(" Clock rate: %d\n", device.clockRate);
    // printf(" Total constant memory: %u\n", device.totalConstMem);
    // printf(" Texture alignment: %u\n", device.textureAlignment);
    // printf(" Concurrent copy and execution: %s\n", (device.deviceOverlap ? "Yes" : "No"));
    // printf(" Number of multiprocessors: %d\n", device.multiProcessorCount);
    // printf(" Kernel execution timeout: %s\n\n", (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
#endif
#ifdef debug_OutputFunctions
    // printf( "<== loadDevices - End\n");
#endif
    //#ifdef debug_OutputInfo // Print the name of the device together with its ID
    // printf( "Device used: %18s (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex);
    //#endif
    CUDA_CHECK(cudaSetDevice(deviceId));
#ifdef debug_OutputInfo // Reset Device
    // printf("Reset Device\n");
#endif
    //CUDA_CHECK(cudaDeviceReset());
    // std::string errorMessage = cudaGetErrorString(cudaPeekAtLastError());
    // std::cout << errorMessage << std::endl;
    //memoryCheck(); // Free memory at the beginning

    // Check and set Block and Grid-Dimensions
    genericSAFTBlockDimensions = fixedBlockDimensions;
    // Ceiling division: if the image size is not an exact multiple of the block
    // size, one additional block is needed. Surplus threads are sorted out in the
    // kernel (per the original comment).
    genericSAFTGridDimensions = dim3(
        (IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x,
        (IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y,
        (IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z
    );
#if defined(debug_OutputVariables) || defined(debug_OutputZSteps)
    if (deviceIndex == DebugOutputGPUIdx){
        // printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
        // printf( "genericSAFTGridDimensions X,Y,Z = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
    }
#endif

    // Hand over the pointers to the A-scan, geometry and output data from Matlab.
#ifdef debug_OutputInfo
    // printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
    // printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
#endif
    aScanSamples = (float*)aScan_ptr;
#ifdef debug_OutputInfo
    // printf( "Uebergebene Geometry Pointer fuer Index sowie der Zuordnungs-Tabelle aus Matlab\n");
#endif
    emitter_index = (unsigned short*) emitter_index_ptr;   // Index for associating emitter to corresponding coordinates
    receiver_index = (unsigned short*) receiver_index_ptr; // Index for associating receiver to corresponding coordinates
    emitter_list = (float3*) emitter_list_ptr;             // Lookuptable for emitter coordinates
    receiver_list = (float3*) receiver_list_ptr;           // Lookuptable for receiver coordinates
#ifdef debug_OutputInfo
    // printf( "Uebergebener Pointer output fuer Ausgabe-Daten aus Matlab\n");
#endif
    output = (double *)output_ptr;

    speedOfSoundFieldVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
    speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount * sizeof(float);
#ifdef debug_OutputVariables
    // printf(" speedOfSoundFieldVoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, speedOfSoundFieldVoxelCount);
    // printf(" speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount(%i) x sizeof(float = 4)] = %i\n", speedOfSoundFieldVoxelCount, speedOfSoundFieldBytes);
#endif
#ifdef debug_OutputInfo
    // printf( "Uebergebener Pointer speedOfSoundField fuer SoS-Daten aus Matlab\n");
    // printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
    // printf( "Uebergebener Pointer attenuationField fuer ATT-Daten aus Matlab\n");
#endif
    speedOfSoundField = (float*)speed_vec_ptr; // For SoS-grid mode: used for the speed-of-sound correction
    SoSData = (float*)speed_vec_ptr;           // For block mode
    attenuationField = (float*)att_vec_ptr;    // For SoS-grid mode: used for the attenuation correction

    // Take over the output sizes from Matlab.
    regionOfInterestVoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Number of voxels in the volume
    outputSize = regionOfInterestVoxelCount * sizeof(double); // Memory required for all voxels in the volume
#ifdef debug_OutputVariables
    // printf(" regionOfInterestVoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, regionOfInterestVoxelCount);
    // printf(" outputSize [%lld x sizeof(double = 8)] = %lld\n", regionOfInterestVoxelCount, outputSize);
#endif
    // Check against the maximum output size addressable on a 32-bit system. If
    // 32-bit systems cause problems, implement a proper check and abort here.
    // The threshold is 2^32 / sizeof(double) = 536870912 voxels. It has to be
    // written as a shift: in C++ '^' is bitwise XOR, so the former "2^32"
    // evaluated to 34.
    if (regionOfInterestVoxelCount > ((static_cast<uint64_t>(1) << 32) / sizeof(double)) ){
        // printf("outputSize > 2^32 !!! => works only in 64-Bit System\n");
    }

    // The size of the data blocks for block processing is given by aScanCount.
    // The same count is expected for the geometry data.
#ifdef debug_OutputVariables
    // printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
#endif
    aScanSize = aScanLength * sizeof(float);
    batchSize = aScanCount;                 // Number of A-scans processed together; equals the block size passed in from Matlab
    aScanBatchSize = batchSize * aScanSize; // Size of one A-scan batch in bytes
#ifdef debug_OutputVariables
    // printf( "aScanSize = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScanSize);
    // printf( "batchSize = aScanCount = %i\n", batchSize);
    // printf( "aScanBatchSize = batchSize * aScanSize ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatchSize);
#endif
    // if(batchSize > aScanCount) // This check is pointless now that batchSize = aScanCount;
    // {
    // mexErrMsgTxt("A-scan window size cannot be larger than the total number of A-scans");
    // //throw ail::exception("A-scan window size cannot be larger than the total number of A-scans");
    // }
#ifdef debug_OutputInfo
    // printf("\nParameter for Image Reconstruction\n");
    // printf( "========================================================================\n");
    //std::cout << "ROI dimensions: " << regionOfInterestResolutionX << " x " << regionOfInterestResolutionY << " x " << regionOfInterestResolutionZ << std::endl;
    std::cout << "IMAGE_SIZE_XYZ: [" << IMAGE_SIZE_XYZ.x << " x " << IMAGE_SIZE_XYZ.y << " x " << IMAGE_SIZE_XYZ.z << "]" <<std::endl;
    std::cout << "Voxel count in Volume: " << regionOfInterestVoxelCount << std::endl;
    //std::cout << "Increment vector: (" << regionOfInterestIncrementVector.x << ", " << regionOfInterestIncrementVector.y << ", " << regionOfInterestIncrementVector.z << ")" << std::endl;
    std::cout << "Increment vector/Resolution: (" << IMAGE_RESOLUTION << ")" << std::endl;
    std::cout << "IMAGE_STARTPOINT in meters: " << regionOfInterestOffset.x << " " << regionOfInterestOffset.y << " " << regionOfInterestOffset.z << std::endl;
    regionOfInterestSize.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
    regionOfInterestSize.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
    regionOfInterestSize.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
    std::cout << "ROI size in metres: " << regionOfInterestSize.x << " " << regionOfInterestSize.y << " " << regionOfInterestSize.z << std::endl;
    std::cout << "Batch size/Blocks(Ascan, R/E-Combi): " << batchSize << std::endl;
    // printf( "========================================================================\n\n");
#endif
    // #ifdef debug_OutputPerformance
    // struct timeval startPerformCoreReconstruction, stopPerformCoreReconstruction;
    // gettimeofday(&startPerformCoreReconstruction, NULL);
    // #endif

    // Perform the processing of the A-scan data.
    //===========================================================================================================
    ullong duration;
    processAScans(duration);
    //===========================================================================================================
#ifdef debug_OutputPerformance
    double numerator = static_cast<double>(aScanCount) * regionOfInterestVoxelCount; // Performance [A-scans * GVoxel/s]
    double performance = numerator / duration;
    // NOTE(review): the original comment describes a conversion from voxels per
    // millisecond to gigavoxels per second (factor 10^-6), but the code divides by
    // 1e9 and other comments give the duration in microseconds - confirm the
    // intended unit. The value is currently not printed or stored.
    performance /= 1e9;
    //std::cout << "# Device ("<< (int)deviceId <<"): Duration of main processing: " << (int)duration << " us" << std::endl;
    //std::cout << "# Device ("<< (int)deviceId <<"): Performance: " << performance << " AScan * GVoxel/s" << std::endl;
#endif
    // Report one run-time value per GPU (in microseconds according to the original
    // comment). The slot is selected by the position of the GPU in the list of
    // requested GPU IDs (deviceIndex), not by the hardware ID.
    //Duration_ptr[(deviceId+1)] = (double)duration;
    Duration_ptr[(deviceIndex+1)] = (double)duration;
#ifdef debug_OutputVariables
    //// printf( "Duration_ptr[%i] = duration(%i) = %f\n", (deviceId+1), duration, Duration_ptr[(deviceId+1)]);
    // printf( " GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
#endif
    // #ifdef debug_OutputVariables
    // // printf( "Duration_ptr[0] = duration(%i) = %f\n", duration, Duration_ptr[0]);
    // #endif
    // Reset Device
    // #ifdef debug_OutputInfo
    // // printf( "Device was used: %s (%i)\n", deviceProperties[deviceId].name , deviceId);
    // #endif
    // CUDA_CHECK(cudaSetDevice(deviceId));
#ifdef debug_OutputInfo // Reset Device
    // printf("Reset Device\n");
#endif
    //CUDA_CHECK(cudaDeviceReset());
#ifdef debug_OutputFunctions
    // printf( "<== SAFTHandler::performReconstruction - End\n");
#endif
}
/**
	Collapse a launch configuration into the shape used to launch the SAFT kernel.

	Grid: if the selected device reports a third grid dimension larger than one
	(maxGridSize[2] > 1) the grid is handed through unchanged. Otherwise x and y are
	folded into the x dimension and z is moved into the y dimension, which gives a
	two-dimensional grid.
	Block: always flattened to a single dimension holding x * y * z threads.
*/
void SAFTHandler::reduceKernelDimensions(
	dim3 const & gridDimensions,	///< Input grid dimensions.
	dim3 const & blockDimensions,	///< Input block dimensions.
	dim3 & reducedGridDimensions,	///< Receives the reduced grid dimensions.
	dim3 & reducedBlockDimensions	///< Receives the reduced (one-dimensional) block dimensions.
)
{
	// A device whose z grid extent is limited to 1 cannot launch a three-dimensional grid.
	bool const gridNeedsFolding = deviceProperties[deviceId].maxGridSize[2] <= 1;

	if(gridNeedsFolding)
	{
		unsigned int const foldedXY = gridDimensions.x * gridDimensions.y;
		reducedGridDimensions = dim3(foldedXY, gridDimensions.z, 1);
	}
	else
	{
		reducedGridDimensions = gridDimensions;
	}

	// The block is always linearised, independent of the device.
	unsigned int const threadsPerBlock = blockDimensions.x * blockDimensions.y * blockDimensions.z;
	reducedBlockDimensions = dim3(threadsPerBlock);
}
/**
	Integer division that rounds the quotient up instead of down.
	The divisor is not checked here and must be non-zero.
	@return Quotient of the division, rounded up.
*/
std::size_t ceilingDivision(
	std::size_t dividend,	///< Value to be divided.
	std::size_t divisor		///< Value to divide by.
)
{
	std::size_t const quotient = dividend / divisor;
	std::size_t const remainder = dividend % divisor;
	// Any non-zero remainder means the exact result lies above the truncated quotient.
	return remainder == 0 ? quotient : quotient + 1;
}
/**
	Converts an offset between two different resolutions.
	The offset is scaled by lowerResolution / greaterResolution and the result is rounded up.
	This is a utility function used to deal with the number of z-layers in the speed of sound grid.
	@return Result of the conversion.
*/
std::size_t SAFTHandler::resolutionConversion(
	std::size_t input,				///< Offset.
	std::size_t greaterResolution,	///< Greater resolution (used as divisor).
	std::size_t lowerResolution		///< Lower resolution (used as multiplier).
)
{
	// Multiply first so that the rounding is applied once, to the final quotient.
	std::size_t const scaledOffset = input * lowerResolution;
	return ceilingDivision(scaledOffset, greaterResolution);
}
/**
Perform calculations pertaining to the execution of the speed of sound pre-calculations.
NOTE: every statement in the body is currently commented out, so calling this function has no effect.
The disabled implementation is kept below for reference only.
*/
void SAFTHandler::determineSpeedOfSoundData(
std::size_t regionOfInterestZLayers ///< Number of z-layers within the region of interest that are currently being processed. This number is often smaller than the total number of z-layers.
)
{
#ifdef debug_OutputFunctions
// printf( "==> SAFTHandler::determineSpeedOfSoundData - Start\n");
#endif
// --- disabled reference implementation -------------------------------------------------------------
// //Determine the maximum number of z-layers to be pre-calculated within the speed of sound grid
// //std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, regionOfInterestResolutionZ, regionOfInterestGridSizeZ);
// std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, IMAGE_SIZE_XYZ.z, regionOfInterestGridSizeZ);
//
// partialSpeedOfSoundVoxelCount = maximumSpeedOfSoundPartialZLayerCount * regionOfInterestGridSizeX * regionOfInterestGridSizeY;
//
//// deviceTableVoxelToEmitterPathCountSize = sosZLayerVoxelCount * emitter_list_Size * partialSoSZLayerCount * sizeof(VoxelCountType); // Size of the path-count storage * the number of simultaneously used z-layers, for all emitters
//// deviceTableVoxelToEmitterPathSosSumSize = sosZLayerVoxelCount * emitter_list_Size * partialSoSZLayerCount * sizeof(float);
//// deviceTableVoxelToReceiverPathCountSize = sosZLayerVoxelCount * receiver_list_Size * partialSoSZLayerCount * sizeof(VoxelCountType); // Size of the path-count storage * the number of simultaneously used z-layers, for all receivers
//// deviceTableVoxelToReceiverPathSosSumSize = sosZLayerVoxelCount * receiver_list_Size * partialSoSZLayerCount * sizeof(float);
//
// std::size_t
// emitterSpeedOfSoundVoxelCombinations = emitterCount * partialSpeedOfSoundVoxelCount,
// receiverSpeedOfSoundVoxelCombinations = receiverCount * partialSpeedOfSoundVoxelCount;
//
// emitterToVoxelPathVoxelDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
// emitterToVoxelPathSpeedDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(float);
//
// voxelToReceiverPathVoxelDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
// voxelToReceiverPathSpeedDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(float);
// ----------------------------------------------------------------------------------------------------
#ifdef debug_OutputFunctions
// printf( "<== SAFTHandler::determineSpeedOfSoundData - End\n");
#endif
}
/**
Perform initialisations for the partial reconstructions for both the speed of sound pre-calculation and the actual reconstruction.
NOTE: every statement in the body is currently commented out, so calling this function has no effect.
The disabled implementation is kept below for reference only.
*/
void SAFTHandler::partialReconstructionInitialisation()
{
#ifdef debug_OutputFunctions
// printf( "==> SAFTHandler::partialReconstructionInitialisation - Start\n");
#endif
// --- disabled reference implementation -------------------------------------------------------------
//
// if(!partialReconstructionInitialised)
// {
// std::cout << "Initialising partial reconstruction data" << std::endl;
//
// //zLayerVoxelCount = regionOfInterestResolutionX * regionOfInterestResolutionY;
// zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y; // The number of x-y voxels determines the step into the next layer.
//
// partialOutputVoxelCount = partialOutputSize / sizeof(double);
//// if(partialOutputVoxelCount % zLayerVoxelCount != 0) // Safety check now lives in the kernel
//// mexErrMsgTxt("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
// //throw ail::exception("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
// partialOutputZLayerCount = partialOutputVoxelCount / zLayerVoxelCount;
//
//// if(partialOutputZLayerCount % genericSAFTBlockDimensions.z != 0) // Safety check now lives in the kernel
//// mexErrMsgTxt("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
// //throw ail::exception("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
//
// //Make dynamically sized allocations for the pre-calculated speed of sound data.
// //The size depends on the number of z-layers in the output window.
// //These particular pre-calculations are no longer performed only once for all voxels.
// //Instead, they are performed partially, prior to each launch of the SAFT kernel.
// //This lowers the pressure on GPU global memory.
//
// determineSpeedOfSoundData(partialOutputZLayerCount);
//
// // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathVoxelCounts der Groesse:%i\n", emitterToVoxelPathVoxelDataSize);
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathVoxelCounts), emitterToVoxelPathVoxelDataSize));
// // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathSpeedOfSoundSum der Groesse:%i\n", emitterToVoxelPathSpeedDataSize);
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathSpeedOfSoundSum), emitterToVoxelPathSpeedDataSize));
//
// // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathVoxelCounts der Groesse:%i\n", voxelToReceiverPathVoxelDataSize);
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathVoxelCounts), voxelToReceiverPathVoxelDataSize));
// // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathSpeedOfSoundSum der Groesse:%i\n", voxelToReceiverPathSpeedDataSize);
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathSpeedOfSoundSum), voxelToReceiverPathSpeedDataSize));
//
// partialReconstructionInitialised = true;
// }
// ----------------------------------------------------------------------------------------------------
#ifdef debug_OutputFunctions
// printf( "<== SAFTHandler::partialReconstructionInitialisation - End\n");
#endif
}
/**
	Query the free and total memory of the current CUDA device.
	The query goes through CUDA_CHECK, so a failing cudaMemGetInfo call is reported as an error.
	The code that printed the total / free / used figures is disabled; the values are
	currently only retrieved, not shown.
*/
void memoryCheck()
{
	std::size_t freeBytes = 0;
	std::size_t totalBytes = 0;
	CUDA_CHECK(cudaMemGetInfo(&freeBytes, &totalBytes));
}
/**
	Generic CUDA call wrapper.
	Checks the result of a CUDA operation and raises a MATLAB error if the operation failed.
	The error text names the source file, the line and the CUDA error description, so the
	failing call can be located from the MATLAB prompt.
	This is used in combination with the CUDA_CHECK macro in saft.hpp.

	Not declared inline: the function is shared between several source files through an
	extern declaration, and inline does not work together with that.
*/
void performCUDAResultCheck(
	cudaError_t result,			///< Result of the CUDA operation.
	std::string const & file,	///< Path to the source code file.
	int line					///< Line within the source code.
)
{
	if(result == cudaSuccess)
		return;

	// The message is built in a fixed stack buffer rather than a std::string:
	// mexErrMsgTxt is documented to terminate the MEX function, so destructors of
	// local objects are not guaranteed to run afterwards. snprintf truncates
	// over-long paths instead of overflowing the buffer.
	char errorMessage[1024];
	snprintf(
		errorMessage,
		sizeof(errorMessage),
		"A CUDA operation failed in file \"%s\" (line %d): %s",
		file.c_str(),
		line,
		cudaGetErrorString(result)
	);
	mexErrMsgTxt(errorMessage);
}

15
SAFT_ATT/src/saft.cu Normal file
View File

@@ -0,0 +1,15 @@
#include <iostream>
#include "saft.hpp"
/*!
This is the central CUDA file which really just includes the other modules.
This is done because CUDA does not support external references for referencing data from other compilation units.
- Dies ist das zentrale CUDA-File welches nur die anderen Module einbindet
- Das wird gemacht, weil CUDA keine externen Referenzen unterstützt, um Daten von anderen Kompilierungseinheiten zu referenzieren.
*/
#include "kernel/constantMemory.hcu"
#include "kernel/rayTracing.hcu"
#include "kernel/precalculateSpeedOfSoundKernel.hcu"
#include "kernel/saftKernel.hcu"

594
SAFT_ATT/src/saft.hpp Normal file
View File

@@ -0,0 +1,594 @@
// 1. Compilieren mit make
// -> es wird folgende Datei erstellt: output/saft_sos.mexa64
// 2. Kopieren in Arbeitsordner
// cp /home/kretzek/fser/sandbox/SAFT-GPU/output/saft_sos.mexa64 /home/kretzek/fser/USCT_SW/3DReconstruction/Reconstruction/Reflection/trunk/saft_sos_compute2_debugSoS.mexa64
#pragma once
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include <stdio.h> // standard input/output
#include <vector> // stl vector header
// Short aliases for the unsigned integer types.
typedef unsigned char uchar;
typedef unsigned short ushort;
typedef unsigned long ulong;
typedef unsigned long long ullong;
// Debug output switches. Uncommenting a line defines the corresponding macro.
//============================
//#define debug_OutputFunctions // Print function calls
//#define debug_OutputVariables // Print the values of variables
//#define debug_OutputParameter // Show an overview of the input data as well as info blocks for the individual steps
//#define debug_OutputMemory // Memory management: malloc, free, sizes
//#define debug_OutputMaxMemory // Report the current memory usage whenever memoryCheck is called
//#define debug_OutputInfo // Print information about steps, variables, ...
//#define debug_OutputPerformance // Print the run times and the individual multi-GPU performance values of processAScans (MemAlloc, PerformCoreReconstruction, Duration, FreeMem)
//#define debug_OutputStepsPerformance // Print the run times of the individual steps in performCoreReconstruction (copy A-scans, precalc, PerfCoreReconstruction, copy back)
//#define debug_OutputZSteps // Print the partitioning in z direction
#define DebugOutputGPUIdx 0
//#define debug_OutputHostStepsPerformance // Print the run times of the individual steps on the HOST (pre-integrated A-scans)
//#define debug_OutputSAFTHandlerThreadPerformance // Print the total run times of the individual multi-GPU threads
//#define debug_OutputMultiGpu // Print the partitioning of the volume across several GPUs
//#define debug_OutputStreams // Print the steps of the stream computation
//#define debug_OutputSOSPaths // Print the steps and values of the SOS path computation
//#define debug_OutputSOSStepsParameter // Partitioning of the z-layers into SOS z-layers
//#define debug_OutputLookUpGeometryMemoryList // Debug output for the LookUpGeometryMemoryList (constant memory)
//#define OutputVolume // Output of the volume
// Debugging CUDA Kernels
//================================================
//#define debug_CudaSAFTKernel
//#define debug_CudaSAFTKernel_Median
//#define debug_CudaPrecalculateKernel
//#define debug_CudaRayTraceKernel
//#define debug_CudaRayTraceKernelLive
//#define DebugSetMemoryToZero // Set SOSPathMemory to Zero as Initialisation
// Define specific Hardware-Versions
// Exactly one GTX_* card macro is expected to be active; GTX_Fermi / GTX_Kepler are derived from it below.
#define GTX_590
//#define GTX_690
//#define GTX_TITAN
#if defined(GTX_590)
#define GTX_Fermi
#endif
#if defined(GTX_690) || defined(GTX_TITAN)
#define GTX_Kepler
#endif
// GPU memory management and error detection
//================================================
//#define SaftNoTexture
//#define SaftCorrectSumOneAscan // 9.7-9.9 GVA/s // Skip wrong Numbers
#define SaftCorrectSumAllAscan // 8.2 GVA/s // Recalculation if too high numbers are calculated
#define SaftEmitterCache // Caching for Emitter Coordinates and Distance
//#define SaftEmitterCacheTernery // Caching for Emitter Coordinates and Distance
// SAFT-SOS implementations
//================================================
//#define SaftSoSNoCache
//#define SaftSoSEmitterCache
//#define SaftSoSCombineTasCache // not implemented yet
//#define SaftSoSCombineInSoSVoxelCache
#define SaftSoSWithPrecalculateSoSZLayer
#define SaftMedian
#define BRANCHLESS_MEDIAN // Crashes without it!
//#define SaftMedian_withMean3 // Mean of 3 Values
//#define SaftMedian_withMean5 // Mean of 5 Values
//#define SaftMedian_CalcOnlyMean // Mean of all buffered Values in Window
#define maxMedianWindowSize 96
#ifndef FLT_MAX //is not defined in cuda kernel?
#define FLT_MAX 0x1.fffffep127f
#endif
// Integrate the A-scans up front in order to match the sample width to the resolution that is reconstructed
#define preAscanIntegrationToMatchSamplerateToResolution // Integrate the A-scans over the window width
//#define debug_preAscanIntegration
#define DebugSammleMin 2990
#define DebugSammleMax 3000
//#define preAscanIntegrationVersion1Michael // Version taken over unchanged from Michael
#define preAscanIntegrationVersion2Ernst // Corrected variant with a more precise window width
// Parameters for the SAFT kernel
#define SaftLinearInterpolation // Perform linear interpolation when accessing A-scans
#define SaftUseConstantMemforGeometry // Use the geometry data in constant memory
//#define SaftTextureForERIndexBlock // Use texture memory for loading the emitter and receiver indices of the corresponding A-scan
#define debug_CudaSAFTKernelModes // Use variable debugMode for different calculation methods and output
//#define debug_CudaSAFTKernel_EnableAnalyticAverageSpeedCalculation // For error calculations
//#define SaftTextureForEmRecSosPathsTablesFloat1 // Use Float1-Textur for loading SOS-Paths -> Sum, Count separated
//#define SaftTextureForEmRecSosPathsTablesFloat2 // Use Float2-Textur for loading SOS-Paths -> Sum + Count for SOS for one position
#define SaftTextureForEmRecSosPathsTablesFloat4 // Use Float4-Textur for loading SOS-Paths -> Sum as well Count for SOS and ATT for one position
#if defined(SaftTextureForEmRecSosPathsTablesFloat1) || defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
#define SaftTextureForEmRecSosPathsTables // Use Textur for loading SOS-Paths, -> Interpolation between SoSVoxelnPaths is possible
#endif
// Several SAFT_VARIANTs
#define SAFT_VARIANT_AscanPreintegration 0
#define SAFT_VARIANT_AscanInterpolation 1
#define SAFT_VARIANT_3DVolumeInterpolationAtPreprocessing 2 // Use interpolation while Preprocessing
#define SAFT_VARIANT_3DVolumeInterpolationAtReconstruction 3 // Use interpolation while Reconstruction
#define SAFT_VARIANT_CalcStandardDeviation 4
#define SAFT_VARIANT_SumUpOverBoarderIndices 5
// Cache <-> shared Memory
//#define SaftPreferSharedMem // cudaFuncCachePreferShared: shared memory is 48 KB
#define SaftPreferL1SharedMem // cudaFuncCachePreferL1: shared memory is 16
//#define SaftPreferNone // cudaFuncCachePreferNone: no preference
// Receiver cache in shared memory (only for small block sizes)
//#define SaftReceiverSharedMemCacheReceiverDistance
//#define SaftCacheReceiverSOS
//#define SaftReceiverSharedMemCacheReceiverSOS // Use Shared Memory for Caching
//#define SaftRegisterCacheReceiverSOS // Use Register for Caching
// Calculation of the mean speed of sound
//================================================
//#define SaftUseArithmeticMean // arithmetic Mean
#define SaftUseHarmonicMean // harmonic Mean // the correct one!
//#define SaftCalcSoSInKernel // Bresenham is evaluated again separately for every voxel and path!
// ! Remove SOS_Version2, otherwise it does not work!
#define SaftTextureForBresenhamSosPaths // Use texture memory for the SOS volume
//#define SaftTextureForBresenhamInterpolated //iSOS version --> is now passed in via a parameter
//#define SaftUseFastMath // FastMath for faster computation, but with errors at the border; a correction is needed for that.
//#define SaftUseSosAttFloat1 // Use separate textures for the two volumes (Sos+Att) // currently not implemented
#define SaftUseSosAttFloat2 // Use only one texture for both volumes (Sos+Att)
#define SOS_Version2 // Correct version with definitions at the centre point
//#define SOS_Version3 // With the end points given explicitly
// MultiGPU
//================================================
// #define debug_SetNumGPU // Fix the number of GPUs
// //#undef debug_SetNumGPU
//
// #ifdef debug_SetNumGPU
// #define NUM_GPUS 1
// #define NUM_DEVICEGPU 1 // Everything is shifted by this amount, e.g. by +1
// #endif
const int MAX_EMITTER_RECEIVE_IN_CONSTANT_MEMORY = 2340;
// Selection of the distance calculation variant (figures are the author's measurements)
#define Distanz_Standard //172 MV/s //14,5 GVA/s
//#define Distanz_Heron2
//#define Distanz_Memory 100 // With a 100-value LUT memory //11,53 GVA/s //Diff [0 .. 0.0828] very bad!
//#define Distanz_Memory 1000 // With a 1000-value LUT memory //12,6 GVA/s //Diff [0 .. 0.0096]
//#define Distanz_Memory 1000_Heron //281 MV/s //Diff [0 .. 2.3176e-004]
//#define Distanz_Memory 10000 // With a 10000-value LUT memory //11,58 GVA/s //Diff [0 .. 9.6333e-004]
//#define Distanz_Memory 100000 // With a 100000-value LUT memory (the original comment said 10000, presumably a copy-paste slip) //375 MV/s
//#define Use_Distanz_SharedMemory
//Macro used to perform CUDA calls. The result of the operation is handed to performCUDAResultCheck
//together with the file and line of the call, so a failure can be attributed to its call site.
//NOTE(review): the expansion already ends with ';'. `CUDA_CHECK(x);` therefore produces an extra empty
//statement, and `if(c) CUDA_CHECK(x); else ...` does not compile. Wrap such uses in braces.
#define CUDA_CHECK(operation) performCUDAResultCheck(operation, __FILE__, __LINE__);
//Macro used to see when a particular line of code is executed on the host.
//Writes the file and line to std::cout; the including file must provide <iostream>.
#define DEBUG_MARK std::cout << "[DEBUG] file " << __FILE__ << ", line " << __LINE__ << std::endl
//Convenient typedefs for containers
typedef std::vector<cudaDeviceProp> DeviceProperties;
typedef std::vector<dim3> Dimensions;
/**
Most important class in the application.
It is responsible for all of the image reconstruction.
The public interface consists of the constructor, which receives the device selection, the
pointers to the input/output data and the reconstruction parameters, and performReconstruction().
NOTE(review): in C++ the declaration order of the data members determines the order in which the
constructor initialises them; do not reorder members without checking the constructor.
*/
class SAFTHandler
{
public:
SAFTHandler(int deviceId,
int deviceIndex,
float *aScan_ptr, ///< Pointer to the A-scan data //std::string const & aScanSamplesPath,
double *output_ptr, ///< Pointer to the output data //std::string const & outputPath,
double *Duration_ptr, ///< Pointer to the output values for the run time; performReconstruction stores the processing duration at index deviceIndex+1
unsigned short *receiver_index_ptr, ///< presumably the receiver number of each A-scan - TODO confirm
unsigned short *emitter_index_ptr, ///< presumably the emitter number of each A-scan - TODO confirm
float *receiver_list_ptr, ///< presumably the receiver coordinates - TODO confirm
int receiver_list_Size, ///<
float *emitter_list_ptr, ///< presumably the emitter coordinates - TODO confirm
int emitter_list_Size, ///<
float *speed_vec_ptr,
int3 SOSGrid_XYZ,
float3 sosOffset, ///< Startpoint of SoSGrid
float SOS_RESOLUTION, ///< Resolution of the SoS grid
float *att_vec_ptr, //att_vec_ptr
int aScanCount,
int aScanLength,
int3 IMAGE_SIZE_XYZ,
float sampleRate,
float3 regionOfInterestOffset,
float IMAGE_RESOLUTION,
dim3 const & fixedBlockDimensions,
int medianWindowSize, ///< define width of used median filter
float debugMode,
float debugModeParameter,
//bool useFixedPartialOutputWindow,
bool SOSMode_3DVolume,
bool ATTMode_3DVolume,
int SAFT_MODE,
int *SAFT_VARIANT
);
void performReconstruction();
private:
bool SOSMode_3DVolume,
ATTMode_3DVolume;
int SAFT_MODE;
int *SAFT_VARIANT;
int *deviceSAFT_VARIANT;
#ifdef Distanz_Memory
float *deviceWurzelApprox;
#endif
int deviceId;
int deviceIndex;
float debugMode;
float debugModeParameter;
DeviceProperties deviceProperties;
float
*aScan_ptr;
// float
// *rec_vec_ptr,
// *send_vec_ptr;
unsigned short
*emitter_index_ptr,
*receiver_index_ptr;
float
*emitter_list_ptr,
*receiver_list_ptr;
int
receiver_list_Size,
emitter_list_Size;
double
*output_ptr;
double
*Duration_ptr;
float
Sos,
*speed_vec_ptr,
*att_vec_ptr;
int3
SOSGrid_XYZ;
float3
sosOffset; ///< Startpoint of SoSGrid
int
aScanCount,
aScanLength;
int3
IMAGE_SIZE_XYZ;
float3 regionOfInterestSize; // ROI size in metres
float3
regionOfInterestOffset; //imageStartpoint; TODO: rename!
float
IMAGE_RESOLUTION, ///< Resolution of the output volume
IMAGE_RESOLUTION_FACTOR, ///< 1/resolution of the output volume
SOS_RESOLUTION, ///< Resolution of the SoS grid
SOS_RESOLUTION_FACTOR; ///< 1/resolution of the SoS grid
std::string
emitterGeometryPath,
receiverGeometryPath,
aScanSamplesPath,
outputPath;
// bool
// printPerformanceAnalysis,
// printSortedAutoTuningResults;
float *aScanSamples;
double *output;
//int aScanCount;
int
aScanSize,
batchSize,
aScanBatchSize;
float voxelSize;
float sampleRate;
//size_t
uint64_t
regionOfInterestVoxelCount,
outputSize;
uint64_t
partialOutputZLayerOffset;
int
partialOutputZLayerOffsetCount,
partialOutputSoSZLayerCount,
currentZLayerCount,
partialSoSZLayerCount;
double *currentHostOutputAdress;
// Pointer of Inputdata in memory of Ascanblock
float3
*receiver_list, // LookUpTable receiverNr -> coordinates
*emitter_list; // LookUpTable emitterNr -> coordinates
unsigned short
*receiver_index, // Input Ascanblockdata: corresponding receiverNr
*emitter_index; // Input Ascanblockdata: corresponding emitterNr
float
*SoSData; // Input Ascanblockdata: Corresponding SOS value
float *speedOfSoundField; // Input Ascanblockdata: Corresponding SOS value as volume TODO: ==> rename to speedOfSoundGrid
float *attenuationField; // Input Ascanblockdata: Corresponding ATT value as volume TODO: ==> rename to attenuationGrid
#ifdef SaftUseSosAttFloat2
float2 *hostSosAttField;
#endif
// Memorysizes
//std::size_t
int
speedOfSoundFieldVoxelCount, //
speedOfSoundFieldBytes, //
speedOfSoundEmitterVoxelPathCountByteSize, // Memory size for the number of voxels that lie on a path
speedOfSoundEmitterVoxelPathSumByteSize; // Memory size for the sum of the speeds of sound on the path to a voxel
dim3
fixedBlockDimensions, // can probably be replaced by genericSAFTBlockDimensions
genericSAFTBlockDimensions,
genericSAFTGridDimensions,
windowGridDimensions;
int medianWindowSize; // define width of used median filter
#ifdef SaftNoTexture
float ** deviceAScans;
#else
cudaArray **deviceAScansCuArray;
#endif
#ifdef SaftTextureForBresenhamSosPaths
#ifdef SaftUseSosAttFloat1 // Use separate textures for the two volumes (Sos+Att)
cudaArray *deviceSpeedOfSoundFieldCuArray; // SOS volume
cudaArray *deviceAttenuationFieldCuArray; // ATT volume
#endif
#ifdef SaftUseSosAttFloat2 // Use only one texture for both volumes (Sos+Att)
cudaArray *deviceSosAttFieldCuArray;
#endif
#endif
int maxSoSReceiverArrayForTexture;
int TableVoxelToReceiverPathSosAllocationCount;
std::size_t receiver_list_Size_deviceMemory;
#ifdef SaftTextureForEmRecSosPathsTables
// For emitters ----- declared as plain pointers
cudaArray *deviceTableVoxelToEmitterPathSosSumCuArray; //SoSSum
//cudaPitchedPtr pitchedTableVoxelToEmitterPathSosSumDevPtr;
cudaArray *deviceTableVoxelToEmitterPathCountCuArray; //Count
//cudaPitchedPtr pitchedTableVoxelToEmitterPathCountDevPtr;
// For receivers ----- declared as arrays
cudaArray **deviceTableVoxelToReceiverPathSosSumCuArray; //SoSSum
//cudaPitchedPtr * pitchedTableVoxelToReceiverPathSosSumDevPtr;
cudaArray **deviceTableVoxelToReceiverPathCountCuArray; //Count
//cudaPitchedPtr * pitchedTableVoxelToReceiverPathCountDevPtr;
#endif
#if defined(SaftTextureForEmRecSosPathsTablesFloat2) || defined(SaftTextureForEmRecSosPathsTablesFloat4)
cudaArray *deviceTableVoxelToEmPathSosBothCuArray; //Emitter SoSSum + Count
cudaArray **deviceTableVoxelToRecPathSosBothCuArray; //Receiver SoSSum + Count
#endif
#ifdef SaftTextureForERIndexBlock
cudaArray * deviceEmIndexBlockCuArray;
cudaArray * deviceRecIndexBlockCuArray;
#endif
// Speed-of-sound correction mode
float *deviceSpeedOfSoundField; // Address of the memory for the speed-of-sound grid on the GPU
// Block-Mode
unsigned short *deviceEmitterIndex_block; // Addresses of the memory for the index of the geometry data on the GPU
unsigned short *deviceReceiverIndex_block;
float3 *deviceListEmitterGeometry; // Addresses of the memory for the mapping index <-> geometry data on the GPU
float3 *deviceListReceiverGeometry;
float *deviceSoSData_block; // Address of the memory for the speed-of-sound data on the GPU
// VoxelCountType // Addresses of the memory for the SoS paths on the GPU
// * deviceTableVoxelToEmitterPathCount,
// * deviceTableVoxelToReceiverPathCount;
float
*deviceTableVoxelToEmitterPathCountFloat,
*deviceTableVoxelToReceiverPathCountFloat,
*deviceTableVoxelToEmitterPathSosSum,
*deviceTableVoxelToReceiverPathSosSum;
bool *deviceValidEmitterReceiverCombinations;
int *deviceTransducerVectorAnalysisDistributionCounters;
// float3
// * deviceEmitterGeometry,
// * deviceReceiverGeometry;
int usedAmountOfEmitter, // amount of used emitter
usedAmountOfReceiver; // amount of used receiver
// Output volume
double *deviceOutput;
//Streams used for synchronisation
cudaStream_t
copyStream,
calculationStream;
//This variable describes the number of allocations used by the current SAFT mode
std::size_t aScanAllocationCount;
int
invalidEmitterReceiverCombinationsCount,
validEmitterReceiverCombinationsCount;
Dimensions validBlockDimensions;
bool useAutoTuning;
// AutoTuningConfiguration autoTuningConfiguration;
size_t
partialOutputSize,
partialVolumeSize, // Memory (output volume) that would be needed for the corresponding number of z-layers
partialSosPathSize, // Memory (output volume) that would be needed for the corresponding number of SoS z-layers
maxFeasibleZLayerCount, // Maximum possible number of z-layers; initially set to the number that fits into one SOS z-layer.
maxFeasibleSosZLayerCount; // Maximum possible number of SoS z-layers; initially set to the number of SoS z-layers required for the output data.
int
minimumAutoTuningThreadCount,
maximumAutoTuningThreadCount;
//New partial reconstruction data
std::size_t partialSpeedOfSoundVoxelCount;
std::size_t partialOutputZLayerCount;
std::size_t zLayerVoxelCount;
std::size_t sosZLayerVoxelCount; // Number of x-y SOS voxels in one SoS layer. //saft.hpp
std::size_t partialOutputVoxelCount;
std::size_t
//deviceTableVoxelToEmitterPathCountSize,
deviceTableVoxelToEmitterPathCountFloatSize,
deviceTableVoxelToEmitterPathSosSumSize,
//deviceTableVoxelToReceiverPathCountSize,
deviceTableVoxelToReceiverPathCountFloatSize,
deviceTableVoxelToReceiverPathSosSumSize;
double diff_time; // For Time Measurement
float transferRate; // For DataTransferrate Measurement
float performRate; // For PerformSAFTrate Measurement
cudaDeviceProp deviceProp; // For output of the frequency
//Core reconstruction
// processAScans returns the measured duration through its reference parameter.
void processAScans(ullong & duration);
void performCoreReconstruction();
//Pre-calculation
void precalculateAverageSpeedOfSound(int zLayer, int zLayerCount);
// void analysisOfTransducerVectors();
// void normalisePerformanceStatisticsOutput();
// void printTransducerVectorStatistics();
//Auto-tuning
bool determineGridDimensions(dim3 const & blockDimensions, dim3 & gridDimensions);
void determineValidBlockDimensions();
void reduceKernelDimensions(dim3 const & gridDimensions, dim3 const & blockDimensions, dim3 & reducedGridDimensions, dim3 & reducedBlockDimensions);
//Pre-calculation kernels
// The signature of this overload depends on whether the geometry is taken from constant memory.
#ifdef SaftUseConstantMemforGeometry
//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, int deviceGeometry, int geometryElementCount, float * deviceVoxelCountOutputFloat, float * deviceSpeedOfSoundSumOutput);
#else
//void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, VoxelCountType * deviceVoxelCountOutput, float * deviceSpeedOfSoundSumOutput);
void precalculateAverageSpeedOfSound(int firstZLayer, int sosZLayerCount, float3 const * deviceGeometry, int geometryElementCount, float * deviceSpeedOfSoundSumOutput);
#endif
// void analyseTransducerVectors(dim3 gridDimensions, dim3 blockDimensions);
//SAFT kernels
//void performInterpolation(float * deviceAScans, float * deviceOutput, dim3 gridDimensions, dim3 blockDimensions, cudaStream_t stream);
//void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float const * deviceAScans); //, cudaStream_t stream);
// The two performSAFT variants differ in how the A-scans are passed and in the extra maxFeasibleSosZLayerCount parameter of the texture variant.
#ifdef SaftNoTexture
void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, float * deviceAScans ); // A-scans in device memory
#else
void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, int maxFeasibleSosZLayerCount, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceAScansCuArray); // A-scans in a cudaArray for texture memory
//void performSAFT(int aScanIndex, int aScanWindowSize, int3 IMAGE_SIZE_XYZ, int3 SOSGrid_XYZ, int blockIndexOffset, int outputWindowVoxelCount, int speedOfSoundZLayer, int speedOfSoundVoxelsWithinZLayers, dim3 const & windowGridDimensions, dim3 const & gridDimensions, dim3 const & blockDimensions, float * deviceSpeedOfSoundField, cudaArray * deviceSpeedOfSoundFieldCuArray, cudaArray * deviceAScansCuArray); // A-scans in a cudaArray for texture memory
#endif
//Utility functions
bool setGenericDimensions();
std::size_t resolutionConversion(std::size_t input, std::size_t greaterResolution, std::size_t lowerResolution);
void partialReconstructionInitialisation();
std::size_t getCurrentZLayerCount(std::size_t zOffset);
void getCurrentSpeedOfSoundVariables(std::size_t zOffset, std::size_t currentZLayerCount, std::size_t & currentSpeedOfSoundZLayer, std::size_t & currentSpeedOfSoundPartialZLayerCount);
void determineSpeedOfSoundData(std::size_t regionOfInterestZLayers);
};
//std::string vectorToString(float3 const & vector);
//std::string voxelToString(dim3 const & voxel);
// Queries the free and total device memory through cudaMemGetInfo (wrapped in CUDA_CHECK).
extern void memoryCheck();
// Backend of the CUDA_CHECK macro: raises a MATLAB error through mexErrMsgTxt when result is not cudaSuccess.
extern void performCUDAResultCheck(cudaError_t result, std::string const & file, int line);