651 lines
30 KiB
C++
651 lines
30 KiB
C++
|
|
#include <mex.h>
|
|||
|
|
|
|||
|
|
#include <iostream>
|
|||
|
|
#include <vector>
|
|||
|
|
|
|||
|
|
#include <cstdlib>
|
|||
|
|
#include <ctime>
|
|||
|
|
#include <cmath>
|
|||
|
|
|
|||
|
|
//#include <sys/time.h>
|
|||
|
|
|
|||
|
|
//#include <ail/file.hpp>
|
|||
|
|
//#include <ail/string.hpp>
|
|||
|
|
//#include <ail/time.hpp>
|
|||
|
|
|
|||
|
|
//#include "configuration.hpp"
|
|||
|
|
#include "saft.hpp"
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Clumsy constructor of the core reconstruction class.
|
|||
|
|
- Unbeholfener Konstruktor der Kern Rekonstuktionsklasse
|
|||
|
|
*/
|
|||
|
|
SAFTHandler::SAFTHandler(
|
|||
|
|
int deviceId, ///< CUDA ID of the device to be used.
|
|||
|
|
int deviceIndex, ///< Index given by MATLAB (An welcher Position steht die GPU in der Liste?)
|
|||
|
|
float *aScan_ptr, ///< Zeiger zu den AScandaten //std::string const & aScanSamplesPath, ///< Path to the actual A-scan samples.
|
|||
|
|
double *output_ptr, ///< Zeiger zu den daten // std::string const & Path, ///< Path to a file in which the output of the image reconstruction is to be stored.
|
|||
|
|
double *Duration_ptr, ///< Zeiger auf R<>ckgabewert fuer Matlab fuer Laufzeit des Kernels
|
|||
|
|
unsigned short *receiver_index_ptr, ///<
|
|||
|
|
unsigned short *emitter_index_ptr, ///<
|
|||
|
|
float *receiver_list_ptr, ///<
|
|||
|
|
int receiver_list_Size,
|
|||
|
|
float *emitter_list_ptr, ///<
|
|||
|
|
int emitter_list_Size,
|
|||
|
|
float *speed_vec_ptr, ///< Zeiger auf die SoS-Daten in Block-/Gridmode
|
|||
|
|
int3 SOSGrid_XYZ,
|
|||
|
|
float3 sosOffset, ///< Startpoint of SoSGrid
|
|||
|
|
float SOS_RESOLUTION, ///< Aufloesung des SoSGrid
|
|||
|
|
float *att_vec_ptr, ///< Zeiger auf die Att-Daten inm Gridmode
|
|||
|
|
|
|||
|
|
int aScanCount,
|
|||
|
|
int aScanLength,
|
|||
|
|
int3 IMAGE_SIZE_XYZ,
|
|||
|
|
float sampleRate,
|
|||
|
|
float3 regionOfInterestOffset,
|
|||
|
|
float IMAGE_RESOLUTION,
|
|||
|
|
dim3 const & fixedBlockDimensions, ///< If fixed block dimensions are enabled, they will be used over the ones determined by auto-tuning.
|
|||
|
|
int medianWindowSize, ///< define width of used median filter
|
|||
|
|
float debugMode,
|
|||
|
|
float debugModeParameter,
|
|||
|
|
bool SOSMode_3DVolume,
|
|||
|
|
bool ATTMode_3DVolume,
|
|||
|
|
|
|||
|
|
int SAFT_MODE,
|
|||
|
|
int *SAFT_VARIANT
|
|||
|
|
):
|
|||
|
|
deviceId(deviceId), // Das hier ist eine Initialisation der Klassenvariablen mit den <20>bergebenen Werten aehnlich Konstruktor, called Initializer list
|
|||
|
|
deviceIndex(deviceIndex),
|
|||
|
|
|
|||
|
|
aScan_ptr(aScan_ptr), //aScanSamplesPath(aScanSamplesPath),
|
|||
|
|
|
|||
|
|
output_ptr(output_ptr), //Path(Path),
|
|||
|
|
Duration_ptr(Duration_ptr),
|
|||
|
|
|
|||
|
|
receiver_index_ptr(receiver_index_ptr), //
|
|||
|
|
emitter_index_ptr(emitter_index_ptr), //
|
|||
|
|
receiver_list_ptr(receiver_list_ptr), //
|
|||
|
|
receiver_list_Size(receiver_list_Size),
|
|||
|
|
emitter_list_ptr(emitter_list_ptr), //
|
|||
|
|
emitter_list_Size(emitter_list_Size),
|
|||
|
|
speed_vec_ptr(speed_vec_ptr), ///< SoS-Daten im Blockmode oder SoSGrid
|
|||
|
|
SOSGrid_XYZ(SOSGrid_XYZ), // Groesse des SoSGrids
|
|||
|
|
sosOffset(sosOffset), ///< Startpoint of SoSGrid
|
|||
|
|
SOS_RESOLUTION(SOS_RESOLUTION), ///< Aufloesung des SoSGrid
|
|||
|
|
|
|||
|
|
att_vec_ptr(att_vec_ptr), ///< Att-Daten als ATTGrid
|
|||
|
|
|
|||
|
|
aScanCount(aScanCount),
|
|||
|
|
aScanLength(aScanLength),
|
|||
|
|
IMAGE_SIZE_XYZ(IMAGE_SIZE_XYZ),
|
|||
|
|
sampleRate(sampleRate),
|
|||
|
|
regionOfInterestOffset(regionOfInterestOffset),
|
|||
|
|
IMAGE_RESOLUTION(IMAGE_RESOLUTION),
|
|||
|
|
|
|||
|
|
fixedBlockDimensions(fixedBlockDimensions),
|
|||
|
|
medianWindowSize(medianWindowSize),
|
|||
|
|
debugMode(debugMode),
|
|||
|
|
debugModeParameter(debugModeParameter),
|
|||
|
|
SOSMode_3DVolume(SOSMode_3DVolume),
|
|||
|
|
ATTMode_3DVolume(ATTMode_3DVolume),
|
|||
|
|
|
|||
|
|
SAFT_MODE(SAFT_MODE),
|
|||
|
|
SAFT_VARIANT(SAFT_VARIANT)
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::SAFTHandler - Start\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf( "SAFTHandler Constructor\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
aScanAllocationCount = 1; // Speicher der Allokiert wird, es reicht einer statt 2! 2 nur wenn Streams fuer Copy genutzt werden sollen.
|
|||
|
|
|
|||
|
|
IMAGE_RESOLUTION_FACTOR = 1 / IMAGE_RESOLUTION; // Auflösung im OutputVolumen
|
|||
|
|
SOS_RESOLUTION_FACTOR = 1 / SOS_RESOLUTION; // Auflösung im SoS-Grid
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
// printf( "IMAGE_RESOLUTION_FACTOR = %e\n", IMAGE_RESOLUTION_FACTOR);
|
|||
|
|
// printf( "SOS_RESOLUTION_FACTOR = %e\n", SOS_RESOLUTION_FACTOR);
|
|||
|
|
// printf( "Samplerate = %e\n", sampleRate);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== SAFTHandler::SAFTHandler - End\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Top level function of the SAFTHandler class that performs the image reconstruction.
|
|||
|
|
- Top Level Funktion der SAFTHandler Klasse die die Bildrekonstruktion durchf<EFBFBD>hrt.
|
|||
|
|
*/
|
|||
|
|
void SAFTHandler::performReconstruction()
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::performReconstruction - Start\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo // Name des Device mit ID ausgeben
|
|||
|
|
// printf( "Device ID: %i\n", deviceId);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> loadDevices - Start\n");
|
|||
|
|
#endif
|
|||
|
|
int deviceCount;
|
|||
|
|
CUDA_CHECK(cudaGetDeviceCount(&deviceCount));
|
|||
|
|
|
|||
|
|
|
|||
|
|
// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, könnte auch nur einmal ausgelesen werden aber zweitrangig.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|||
|
|
//DeviceProperties & outputProb = deviceProperties; // lokalen Zeiger auf Vektor erstellen der auf Klassenvektor zeigt.
|
|||
|
|
|
|||
|
|
//// printf("1: size(%i) capacity(%i) max_size(%i)\n", outputProb.size(), outputProb.capacity(), outputProb.max_size());
|
|||
|
|
|
|||
|
|
//outputProb.reserve(static_cast<std::size_t>(deviceCount)); // Request Vector with size deviceCount
|
|||
|
|
deviceProperties.reserve(static_cast<std::size_t>(deviceCount)); // Request Vector with size deviceCount
|
|||
|
|
|
|||
|
|
|
|||
|
|
//cudaDeviceProp & device = outputProb[deviceId]; //
|
|||
|
|
cudaDeviceProp & device = deviceProperties[deviceId]; //
|
|||
|
|
CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId));
|
|||
|
|
//// printf("%i. %s\n", deviceId, device.name);
|
|||
|
|
//// printf("%i. %s\n", deviceId, deviceProperties[deviceId].name);
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf("%i. %s\n", deviceId, device.name);
|
|||
|
|
// printf(" Byte Total Global Mem: %lld \n", device.totalGlobalMem);
|
|||
|
|
// printf(" Compute Capability: %i.%i\n", device.major,device.minor);
|
|||
|
|
|
|||
|
|
// printf(" Name: %s\n", device.name);
|
|||
|
|
// printf(" Major revision number: %d\n", device.major);
|
|||
|
|
// printf(" Minor revision number: %d\n", device.minor);
|
|||
|
|
// printf(" Total global memory: %lld\n", device.totalGlobalMem);
|
|||
|
|
// printf(" Total shared memory per block: %u\n", device.sharedMemPerBlock);
|
|||
|
|
// printf(" Total registers per block: %d\n", device.regsPerBlock);
|
|||
|
|
// printf(" Warp size: %d\n", device.warpSize);
|
|||
|
|
// printf(" Maximum memory pitch: %lld\n", device.memPitch);
|
|||
|
|
// printf(" Maximum threads per block: %d\n", device.maxThreadsPerBlock);
|
|||
|
|
for (int i = 0; i < 3; ++i) {
|
|||
|
|
// printf(" Maximum dimension %d of block: %lld\n", i, device.maxThreadsDim[i]);
|
|||
|
|
}
|
|||
|
|
for (int i = 0; i < 3; ++i) {
|
|||
|
|
// printf(" Maximum dimension %d of grid: %lld\n", i, device.maxGridSize[i]);
|
|||
|
|
}
|
|||
|
|
// printf(" Clock rate: %d\n", device.clockRate);
|
|||
|
|
// printf(" Total constant memory: %u\n", device.totalConstMem);
|
|||
|
|
// printf(" Texture alignment: %u\n", device.textureAlignment);
|
|||
|
|
// printf(" Concurrent copy and execution: %s\n", (device.deviceOverlap ? "Yes" : "No"));
|
|||
|
|
// printf(" Number of multiprocessors: %d\n", device.multiProcessorCount);
|
|||
|
|
// printf(" Kernel execution timeout: %s\n\n", (device.kernelExecTimeoutEnabled ? "Yes" : "No"));
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
//outputProb.push_back(device); // Add element at the end of the vector outputProb
|
|||
|
|
deviceProperties.push_back(device); // Add element at the end of the vector outputProb
|
|||
|
|
|
|||
|
|
//// printf("2: size(%i) capacity(%i) max_size(%i)\n", outputProb.size(), outputProb.capacity(), outputProb.max_size());
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== loadDevices - End\n");
|
|||
|
|
#endif
|
|||
|
|
// Noch mal umstrukturieren!!!!! DA das so nicht sein muss, aber erstmal egal.~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|||
|
|
// siehe http://gpucoder.livejournal.com/1064.html
|
|||
|
|
|
|||
|
|
// int devCount;
|
|||
|
|
// cudaGetDeviceCount(&devCount);
|
|||
|
|
// // printf("CUDA Device Query...\n");
|
|||
|
|
// // printf("There are %d CUDA devices.\n", devCount);
|
|||
|
|
//
|
|||
|
|
// // Iterate through devices
|
|||
|
|
// for (int i = 0; i < devCount; ++i)
|
|||
|
|
// {
|
|||
|
|
// // Get device properties
|
|||
|
|
// // printf("\nCUDA Device #%d\n", i);
|
|||
|
|
// cudaDeviceProp devProp;
|
|||
|
|
// cudaGetDeviceProperties(&devProp, i);
|
|||
|
|
// printDevProp(devProp);
|
|||
|
|
// }
|
|||
|
|
//
|
|||
|
|
// // printf("\nPress any key to exit...");
|
|||
|
|
// char c;
|
|||
|
|
// scanf("%c", &c);
|
|||
|
|
|
|||
|
|
|
|||
|
|
//cudaDeviceProp & device = deviceProperties[deviceId];
|
|||
|
|
//CUDA_CHECK(cudaGetDeviceProperties(&device, deviceId)); // Eingenschaften des Devices auslesen
|
|||
|
|
|
|||
|
|
//#ifdef debug_OutputInfo // Name des Device mit ID ausgeben
|
|||
|
|
// printf( "Device used: %18s (HW-ID %i) (Idx %i)\n", device.name , deviceId, deviceIndex);
|
|||
|
|
//#endif
|
|||
|
|
CUDA_CHECK(cudaSetDevice(deviceId));
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo // Reset Device
|
|||
|
|
// printf("Reset Device\n");
|
|||
|
|
#endif
|
|||
|
|
//CUDA_CHECK(cudaDeviceReset());
|
|||
|
|
|
|||
|
|
// std::string errorMessage = cudaGetErrorString(cudaPeekAtLastError());
|
|||
|
|
// std::cout << errorMessage << std::endl;
|
|||
|
|
|
|||
|
|
//memoryCheck(); // Freier Speicher am Anfang
|
|||
|
|
|
|||
|
|
|
|||
|
|
// Check and set Block and Grid-Dimensions
|
|||
|
|
genericSAFTBlockDimensions = fixedBlockDimensions;
|
|||
|
|
genericSAFTGridDimensions = dim3(
|
|||
|
|
(IMAGE_SIZE_XYZ.x + genericSAFTBlockDimensions.x-1)/ genericSAFTBlockDimensions.x, // hier wird aufgerundet! Wenn ungerade Aufloesung nicht genau
|
|||
|
|
(IMAGE_SIZE_XYZ.y + genericSAFTBlockDimensions.y-1)/ genericSAFTBlockDimensions.y, // in Blockgroesse geteilt werden kann, muss ein weiterer
|
|||
|
|
(IMAGE_SIZE_XYZ.z + genericSAFTBlockDimensions.z-1)/ genericSAFTBlockDimensions.z // Block berechnet werden. Zu Viele werden im Kernel aussortiert.
|
|||
|
|
);
|
|||
|
|
|
|||
|
|
#if defined(debug_OutputVariables) || defined(debug_OutputZSteps)
|
|||
|
|
if (deviceIndex == DebugOutputGPUIdx){
|
|||
|
|
// printf( "genericSAFTBlockDimensions X,Y,Z = (%i %i %i)\n",genericSAFTBlockDimensions.x, genericSAFTBlockDimensions.y, genericSAFTBlockDimensions.z);
|
|||
|
|
// printf( "genericSAFTGridDimensions X,Y,Z = (%i %i %i)\n",genericSAFTGridDimensions.x, genericSAFTGridDimensions.y, genericSAFTGridDimensions.z);
|
|||
|
|
}
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
//Pointeruebergabe der AScan-Daten Geometrie-Daten und Output-Daten von Matlab
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf( "Give Pointer Names for AScan, Geometry, Output and SoS-Data from Matlab\n");
|
|||
|
|
// printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
|
|||
|
|
#endif
|
|||
|
|
aScanSamples = (float*)aScan_ptr;
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf( "Uebergebene Geometry Pointer fuer Index sowie der Zuordnungs-Tabelle aus Matlab\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
emitter_index = (unsigned short*) emitter_index_ptr; // Index for associating emitter to corresponding coordinates
|
|||
|
|
receiver_index = (unsigned short*) receiver_index_ptr; // Index for associating receiver to corresponding coordinates
|
|||
|
|
emitter_list = (float3*) emitter_list_ptr; // Lookuptable for emitter coordinates
|
|||
|
|
receiver_list = (float3*) receiver_list_ptr; // Lookuptable for receiver coordinates
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf( "Uebergebener Pointer output fuer Ausgabe-Daten aus Matlab\n");
|
|||
|
|
#endif
|
|||
|
|
output = (double *)output_ptr;
|
|||
|
|
|
|||
|
|
|
|||
|
|
speedOfSoundFieldVoxelCount = SOSGrid_XYZ.x * SOSGrid_XYZ.y * SOSGrid_XYZ.z;
|
|||
|
|
speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount * sizeof(float);
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
// printf(" speedOfSoundFieldVoxelCount [%ix%ix%i] = %i\n", SOSGrid_XYZ.x, SOSGrid_XYZ.y, SOSGrid_XYZ.z, speedOfSoundFieldVoxelCount);
|
|||
|
|
// printf(" speedOfSoundFieldBytes = speedOfSoundFieldVoxelCount(%i) x sizeof(float = 4)] = %i\n", speedOfSoundFieldVoxelCount, speedOfSoundFieldBytes);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
// printf( "Uebergebener Pointer speedOfSoundField fuer SoS-Daten aus Matlab\n");
|
|||
|
|
// printf( "Uebergebener Pointer SoSData fuer SoS-Daten aus Matlab\n");
|
|||
|
|
// printf( "Uebergebener Pointer attenuationField fuer ATT-Daten aus Matlab\n");
|
|||
|
|
#endif
|
|||
|
|
speedOfSoundField = (float*)speed_vec_ptr; // Fuer SoSGrid-Mode fuer korrekte Schallgeschwindigkeitskorrektur
|
|||
|
|
SoSData = (float*)speed_vec_ptr; // Fuer Blockmode
|
|||
|
|
attenuationField = (float*)att_vec_ptr; // Fuer SoSGrid-Mode fuer Daempfungskorrektur
|
|||
|
|
|
|||
|
|
|
|||
|
|
// Uebergabe der Outputgroessen aus Matlab.
|
|||
|
|
regionOfInterestVoxelCount = (uint64_t)IMAGE_SIZE_XYZ.x * (uint64_t)IMAGE_SIZE_XYZ.y * (uint64_t)IMAGE_SIZE_XYZ.z; // Anzahl der Voxel im Volumen
|
|||
|
|
outputSize = regionOfInterestVoxelCount * sizeof(double); // Speicherbedarf fuer alle Voxel im Volumen
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
// printf(" regionOfInterestVoxelCount [%ix%ix%i]= %lld\n",IMAGE_SIZE_XYZ.x, IMAGE_SIZE_XYZ.y, IMAGE_SIZE_XYZ.z, regionOfInterestVoxelCount);
|
|||
|
|
// printf(" outputSize [%lld x sizeof(double = 8)] = %lld\n", regionOfInterestVoxelCount, outputSize);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
//Hier auf maximale Outputgroesse von 32-BitSystem ueberpruefen --> falls Probleme mit 32-Bitsystemen hier noch Abfrage und Abbruch implementieren
|
|||
|
|
if (regionOfInterestVoxelCount > (uint64_t)(2^32 / sizeof(double)) ){ // 2^32 / sizeof(double) = 536870912
|
|||
|
|
// printf("outputSize > 2^32 !!! => works only in 64-Bit System\n");
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
//Groesse der Datenbloecke fuer die Blockverarbeitung wird mit aScanCount angegeben
|
|||
|
|
//Die selbe Anzahl wird auch fuer die Geometriedaten erwartet
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
// printf( "AScan Blockgroesse (aScanCount)= %i\n", aScanCount);
|
|||
|
|
#endif
|
|||
|
|
aScanSize = aScanLength * sizeof(float);
|
|||
|
|
batchSize = aScanCount; // Anzahl der Blockgroesse d.h. wie viele AScans gleichzeitig verarbeitet werden. Batchgroesse ist gleich der Anzahl der uebergebenen Blockgroesse aus Matlab
|
|||
|
|
aScanBatchSize = batchSize * aScanSize; // Batchgroesse der AScans (* 3000 * sizeof(float)) in Byte
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
// printf( "aScanSize = aScanLength(%i) * sizeof(float=4) = %i\n", aScanLength, aScanSize);
|
|||
|
|
// printf( "batchSize = aScanCount = %i\n", batchSize);
|
|||
|
|
// printf( "aScanBatchSize = batchSize * aScanSize ( = %i * sizeof(float)) = %i\n", aScanLength, aScanBatchSize);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// if(batchSize > aScanCount) // Abfrage macht keinen Sinn mehr wenn batchSize = aScanCount;
|
|||
|
|
// {
|
|||
|
|
// mexErrMsgTxt("A-scan window size cannot be larger than the total number of A-scans");
|
|||
|
|
// //throw ail::exception("A-scan window size cannot be larger than the total number of A-scans");
|
|||
|
|
// }
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo
|
|||
|
|
|
|||
|
|
// printf("\nParameter for Image Reconstruction\n");
|
|||
|
|
// printf( "========================================================================\n");
|
|||
|
|
//std::cout << "ROI dimensions: " << regionOfInterestResolutionX << " x " << regionOfInterestResolutionY << " x " << regionOfInterestResolutionZ << std::endl;
|
|||
|
|
std::cout << "IMAGE_SIZE_XYZ: [" << IMAGE_SIZE_XYZ.x << " x " << IMAGE_SIZE_XYZ.y << " x " << IMAGE_SIZE_XYZ.z << "]" <<std::endl;
|
|||
|
|
std::cout << "Voxel count in Volume: " << regionOfInterestVoxelCount << std::endl;
|
|||
|
|
//std::cout << "Increment vector: (" << regionOfInterestIncrementVector.x << ", " << regionOfInterestIncrementVector.y << ", " << regionOfInterestIncrementVector.z << ")" << std::endl;
|
|||
|
|
std::cout << "Increment vector/Resolution: (" << IMAGE_RESOLUTION << ")" << std::endl;
|
|||
|
|
|
|||
|
|
std::cout << "IMAGE_STARTPOINT in meters: " << regionOfInterestOffset.x << " " << regionOfInterestOffset.y << " " << regionOfInterestOffset.z << std::endl;
|
|||
|
|
|
|||
|
|
regionOfInterestSize.x = IMAGE_SIZE_XYZ.x * IMAGE_RESOLUTION;
|
|||
|
|
regionOfInterestSize.y = IMAGE_SIZE_XYZ.y * IMAGE_RESOLUTION;
|
|||
|
|
regionOfInterestSize.z = IMAGE_SIZE_XYZ.z * IMAGE_RESOLUTION;
|
|||
|
|
std::cout << "ROI size in metres: " << regionOfInterestSize.x << " " << regionOfInterestSize.y << " " << regionOfInterestSize.z << std::endl;
|
|||
|
|
std::cout << "Batch size/Blocks(Ascan, R/E-Combi): " << batchSize << std::endl;
|
|||
|
|
// printf( "========================================================================\n\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
// #ifdef debug_OutputPerformance
|
|||
|
|
// struct timeval startPerformCoreReconstruction, stopPerformCoreReconstruction;
|
|||
|
|
// gettimeofday(&startPerformCoreReconstruction, NULL);
|
|||
|
|
// #endif
|
|||
|
|
|
|||
|
|
//perform processing with AScan-Data
|
|||
|
|
//===========================================================================================================
|
|||
|
|
ullong duration;
|
|||
|
|
processAScans(duration);
|
|||
|
|
//===========================================================================================================
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputPerformance
|
|||
|
|
double numerator = static_cast<double>(aScanCount) * regionOfInterestVoxelCount; // Performanz [Ascans * GVoxel/s]
|
|||
|
|
|
|||
|
|
double performance = numerator / duration;
|
|||
|
|
//adjust for the change from voxels per millisecond to gigavoxels per second (=> 10^3 * 10^-9 = 10^-6)
|
|||
|
|
performance /= 1e9;
|
|||
|
|
|
|||
|
|
//std::cout << "# Device ("<< (int)deviceId <<"): Duration of main processing: " << (int)duration << " us" << std::endl;
|
|||
|
|
//std::cout << "# Device ("<< (int)deviceId <<"): Performance: " << performance << " AScan * GVoxel/s" << std::endl;
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
//Duration_ptr[(deviceId+1)] = (double)duration; // Für jede GPU einen Laufzeitwert in µs übermitteln // Angabe von ID der GPU abhaengig
|
|||
|
|
Duration_ptr[(deviceIndex+1)] = (double)duration; // Angabe von Reihenfolge der angegebenen GPU-IDs abhaengig
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputVariables
|
|||
|
|
//// printf( "Duration_ptr[%i] = duration(%i) = %f\n", (deviceId+1), duration, Duration_ptr[(deviceId+1)]);
|
|||
|
|
// printf( " GPU (%s:ID %i,Index %i): => Duration_ptr[%i] = duration(%i µs) = %.2f s\n", device.name, deviceId, deviceIndex, (deviceIndex+1), duration, Duration_ptr[(deviceIndex+1)]/1000/1000);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// #ifdef debug_OutputVariables
|
|||
|
|
// // printf( "Duration_ptr[0] = duration(%i) = %f\n", duration, Duration_ptr[0]);
|
|||
|
|
// #endif
|
|||
|
|
|
|||
|
|
// Reset Device
|
|||
|
|
// #ifdef debug_OutputInfo
|
|||
|
|
// // printf( "Device was used: %s (%i)\n", deviceProperties[deviceId].name , deviceId);
|
|||
|
|
// #endif
|
|||
|
|
// CUDA_CHECK(cudaSetDevice(deviceId));
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputInfo // Reset Device
|
|||
|
|
// printf("Reset Device\n");
|
|||
|
|
#endif
|
|||
|
|
//CUDA_CHECK(cudaDeviceReset());
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== SAFTHandler::performReconstruction - End\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
The SAFT kernel expects arguments in which the grid dimensions have been reduced to less than three dimensions and the block dimensions are reduced to only one dimension.
|
|||
|
|
This also depends on the properties of the hardware available (shader model).
|
|||
|
|
- Der SAFT Kernel erwartet Argumente in den die Grid Dimension auf drei Dimensionen reduziert wurde und die Block-Dimensionen auf nur eine Dimension reduziert ist.
|
|||
|
|
- Das haengt auch von den Eigenschaften der verfuegbaren HW ab (shader model)
|
|||
|
|
*/
|
|||
|
|
void SAFTHandler::reduceKernelDimensions(
|
|||
|
|
dim3 const & gridDimensions, ///< Input grid dimensions.
|
|||
|
|
dim3 const & blockDimensions, ///< Input block dimensions.
|
|||
|
|
dim3 & reducedGridDimensions, ///< Reduced output grid dimensions.
|
|||
|
|
dim3 & reducedBlockDimensions ///< Reduced output block dimensions.
|
|||
|
|
)
|
|||
|
|
{
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::reduceKernelDimensions - Start\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
|
|||
|
|
if(deviceProperties[deviceId].maxGridSize[2] > 1)
|
|||
|
|
{
|
|||
|
|
reducedGridDimensions = gridDimensions;
|
|||
|
|
#ifdef debug_OutputParameter
|
|||
|
|
// printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
else
|
|||
|
|
{
|
|||
|
|
reducedGridDimensions = dim3(
|
|||
|
|
gridDimensions.x * gridDimensions.y,
|
|||
|
|
gridDimensions.z,
|
|||
|
|
1
|
|||
|
|
);
|
|||
|
|
#ifdef debug_OutputParameter
|
|||
|
|
// printf( "reducedGridDimensions X,Y,Z = (%i %i %i)\n",reducedGridDimensions.x, reducedGridDimensions.y, reducedGridDimensions.z);
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
reducedBlockDimensions = dim3(blockDimensions.x * blockDimensions.y * blockDimensions.z);
|
|||
|
|
#ifdef debug_OutputParameter
|
|||
|
|
// printf( "reducedBlockDimensions X,Y,Z = (%i %i %i)\n", reducedBlockDimensions.x, reducedBlockDimensions.y, reducedBlockDimensions.z);
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== SAFTHandler::reduceKernelDimensions - End\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Utility function to perform integer based divison which rounds up instead of down.
|
|||
|
|
- N<EFBFBD>tzliche Funktion: eine Integerbasierte Division die aufrundet und nicht abrundet
|
|||
|
|
@return Quotient of the divison, rounded up.
|
|||
|
|
*/
|
|||
|
|
std::size_t ceilingDivision(
|
|||
|
|
std::size_t dividend, ///< Dividend of the division.
|
|||
|
|
std::size_t divisor ///< Divisor of the division.
|
|||
|
|
)
|
|||
|
|
{
|
|||
|
|
std::size_t output = dividend / divisor;
|
|||
|
|
if(dividend % divisor)
|
|||
|
|
output ++;
|
|||
|
|
return output;
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Converts an offset based on two different resolutions.
|
|||
|
|
This is a utility function used to deal with the number of z-layers in the speed of sound grid.
|
|||
|
|
- Konvertiert einen Offset, basierend auf zwei verschiedenen Aufloesungen
|
|||
|
|
- Diese n<EFBFBD>tzliche Funktion wird genutzt um mit der Anzahl der z-Layer in dem Spallgeschwindigkeits-Grid umzugehen.
|
|||
|
|
@return Result of the conversion.
|
|||
|
|
*/
|
|||
|
|
std::size_t SAFTHandler::resolutionConversion(
|
|||
|
|
std::size_t input, ///< Offset.
|
|||
|
|
std::size_t greaterResolution, ///< Greater resolution.
|
|||
|
|
std::size_t lowerResolution ///< Lower resolution.
|
|||
|
|
)
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::resolutionConversion - Start\n");
|
|||
|
|
// printf( "<== SAFTHandler::resolutionConversion - End\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
return ceilingDivision(input * lowerResolution, greaterResolution);
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Perform calculations pertaining to the execution of the speed of sound pre-calculations.
|
|||
|
|
- F<EFBFBD>hre Berechnungen der Schallgeschwindigkeit-Vorberechnung aus
|
|||
|
|
*/
|
|||
|
|
void SAFTHandler::determineSpeedOfSoundData(
|
|||
|
|
std::size_t regionOfInterestZLayers ///< Number of z-layers within the region of interest that are currently being processed. This number is often smaller than the total number of z-layers.
|
|||
|
|
)
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::determineSpeedOfSoundData - Start\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
// //Determine the maximum number of z-layers to be pre-calculated within the speed of sound grid
|
|||
|
|
// //Bestimme die maximale Anzahl an Z-layer, die in dem SoS-Grid Vorberechnet werden.
|
|||
|
|
// //std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, regionOfInterestResolutionZ, regionOfInterestGridSizeZ);
|
|||
|
|
// std::size_t maximumSpeedOfSoundPartialZLayerCount = resolutionConversion(regionOfInterestZLayers, IMAGE_SIZE_XYZ.z, regionOfInterestGridSizeZ);
|
|||
|
|
//
|
|||
|
|
// partialSpeedOfSoundVoxelCount = maximumSpeedOfSoundPartialZLayerCount * regionOfInterestGridSizeX * regionOfInterestGridSizeY;
|
|||
|
|
//
|
|||
|
|
//// deviceTableVoxelToEmitterPathCountSize = sosZLayerVoxelCount * emitter_list_Size * partialSoSZLayerCount * sizeof(VoxelCountType); // Gr<47><72>e f<>r Speicher der Pfadanzahl * die Anzahl der gleichzeitig genutzten Z-Layer f<>r alle Emitter
|
|||
|
|
//// deviceTableVoxelToEmitterPathSosSumSize = sosZLayerVoxelCount * emitter_list_Size * partialSoSZLayerCount * sizeof(float);
|
|||
|
|
//// deviceTableVoxelToReceiverPathCountSize = sosZLayerVoxelCount * receiver_list_Size * partialSoSZLayerCount * sizeof(VoxelCountType); // Gr<47><72>e f<>r Speicher der Pfadanzahl * die Anzahl der gleichzeitig genutzten Z-Layer f<>r alle Receiver
|
|||
|
|
//// deviceTableVoxelToReceiverPathSosSumSize = sosZLayerVoxelCount * receiver_list_Size * partialSoSZLayerCount * sizeof(float);
|
|||
|
|
//
|
|||
|
|
// std::size_t
|
|||
|
|
// emitterSpeedOfSoundVoxelCombinations = emitterCount * partialSpeedOfSoundVoxelCount,
|
|||
|
|
// receiverSpeedOfSoundVoxelCombinations = receiverCount * partialSpeedOfSoundVoxelCount;
|
|||
|
|
//
|
|||
|
|
// emitterToVoxelPathVoxelDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
|
|||
|
|
// emitterToVoxelPathSpeedDataSize = emitterSpeedOfSoundVoxelCombinations * sizeof(float);
|
|||
|
|
//
|
|||
|
|
// voxelToReceiverPathVoxelDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(VoxelCountType);
|
|||
|
|
// voxelToReceiverPathSpeedDataSize = receiverSpeedOfSoundVoxelCombinations * sizeof(float);
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== SAFTHandler::determineSpeedOfSoundData - End\n");
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Perform initialisations for the partial reconstructions for both the speed of sound pre-calculation and the actual reconstruction.
|
|||
|
|
- F<EFBFBD>hre Initialisierungen fuer eine Teilrekonstruktion von beiden durch: Der Schallgeschwindigkeit und der aktuellen Rekonstruktion
|
|||
|
|
*/
|
|||
|
|
void SAFTHandler::partialReconstructionInitialisation()
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> SAFTHandler::partialReconstructionInitialisation - Start\n");
|
|||
|
|
#endif
|
|||
|
|
//
|
|||
|
|
// if(!partialReconstructionInitialised)
|
|||
|
|
// {
|
|||
|
|
// std::cout << "Initialising partial reconstruction data" << std::endl;
|
|||
|
|
//
|
|||
|
|
// //zLayerVoxelCount = regionOfInterestResolutionX * regionOfInterestResolutionY;
|
|||
|
|
// zLayerVoxelCount = IMAGE_SIZE_XYZ.x * IMAGE_SIZE_XYZ.y; // Anzahl der X-Y-Voxel bestimmen den Schritt in das naechste Layer.
|
|||
|
|
//
|
|||
|
|
// partialOutputVoxelCount = partialOutputSize / sizeof(double);
|
|||
|
|
//// if(partialOutputVoxelCount % zLayerVoxelCount != 0) //Sicherheitsabfrage nun im kernel
|
|||
|
|
//// mexErrMsgTxt("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
|
|||
|
|
// //throw ail::exception("The partial output size must consist of a discrete number of z-layers for the chosen resolution");
|
|||
|
|
// partialOutputZLayerCount = partialOutputVoxelCount / zLayerVoxelCount;
|
|||
|
|
//
|
|||
|
|
//// if(partialOutputZLayerCount % genericSAFTBlockDimensions.z != 0) //Sicherheitsabfrage nun im kernel
|
|||
|
|
//// mexErrMsgTxt("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
|
|||
|
|
// //throw ail::exception("The number of Z-layers in the output window must be a multiple of the reconstruction block dimensions");
|
|||
|
|
//
|
|||
|
|
// //Make dynamically sized allocations for the pre-calculated speed of sound data.
|
|||
|
|
// //The size depends on the number of z-layers in the output window.
|
|||
|
|
// //These particular pre-calculations are no longer performed only once for all voxels.
|
|||
|
|
// //Instead, they are performed partially, prior to each launch of the SAFT kernel.
|
|||
|
|
// //This lowers the pressure on GPU global memory.
|
|||
|
|
//
|
|||
|
|
// //F<>hre Allokationen mit dynamischer Groesse aus fuer die Vor-Verarbeitung der SoS-Daten
|
|||
|
|
// //Die Groesse haengt von der Anzahl der z-Layer in dem -Fenster ab.
|
|||
|
|
// //Diese partielle-Vorberechnung muss nur einmal fuer alle Voxel durchgef<65>hrt werden.
|
|||
|
|
// //Stattdessen werden sie immer partiell durchgef<65>hrt, vor jedem Start des SAFT-Kernels.
|
|||
|
|
// //Das entlastet den globalen GPU-Speicher.
|
|||
|
|
//
|
|||
|
|
// determineSpeedOfSoundData(partialOutputZLayerCount);
|
|||
|
|
//
|
|||
|
|
// // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathVoxelCounts der Groesse:%i\n", emitterToVoxelPathVoxelDataSize);
|
|||
|
|
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathVoxelCounts), emitterToVoxelPathVoxelDataSize));
|
|||
|
|
// // printf( "CUDA:Memory Allokation: deviceEmitterToVoxelPathSpeedOfSoundSum der Groesse:%i\n", emitterToVoxelPathSpeedDataSize);
|
|||
|
|
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceEmitterToVoxelPathSpeedOfSoundSum), emitterToVoxelPathSpeedDataSize));
|
|||
|
|
//
|
|||
|
|
// // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathVoxelCounts der Groesse:%i\n", voxelToReceiverPathVoxelDataSize);
|
|||
|
|
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathVoxelCounts), voxelToReceiverPathVoxelDataSize));
|
|||
|
|
// // printf( "CUDA:Memory Allokation: deviceVoxelToReceiverPathSpeedOfSoundSum der Groesse:%i\n", voxelToReceiverPathSpeedDataSize);
|
|||
|
|
// CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&deviceVoxelToReceiverPathSpeedOfSoundSum), voxelToReceiverPathSpeedDataSize));
|
|||
|
|
//
|
|||
|
|
// partialReconstructionInitialised = true;
|
|||
|
|
// }
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== SAFTHandler::partialReconstructionInitialisation - End\n");
|
|||
|
|
#endif
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Print free/total memory available on the chosen device.
|
|||
|
|
- Gibt freien/totalen zur verf<EFBFBD>gung stehenden Speicher auf dem gew<EFBFBD>hlten Device aus.
|
|||
|
|
*/
|
|||
|
|
void memoryCheck()
|
|||
|
|
{
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "==> memoryCheck - Start\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
std::size_t
|
|||
|
|
totalMemory,
|
|||
|
|
freeMemory;
|
|||
|
|
CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
|
|||
|
|
|
|||
|
|
#if defined(debug_OutputInfo) || defined(debug_OutputMaxMemory)
|
|||
|
|
//printSize(" Total memory ", totalMemory);
|
|||
|
|
//std::cout << " ( " << totalMemory << " )" << std::endl;
|
|||
|
|
//printSize(" Free memory ", freeMemory);
|
|||
|
|
//std::cout << " ( " << freeMemory << " )" << std::endl;
|
|||
|
|
|
|||
|
|
//printSize(" => Used memory ", (totalMemory-freeMemory));
|
|||
|
|
//std::cout << " ( " << (totalMemory-freeMemory) << " )" << std::endl;
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
#ifdef debug_OutputFunctions
|
|||
|
|
// printf( "<== memoryCheck - End\n");
|
|||
|
|
#endif
|
|||
|
|
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
|
|||
|
|
/**
|
|||
|
|
Generic CUDA call wrapper.
|
|||
|
|
Check the result of a CUDA operation and throw an exception if an error occurred.
|
|||
|
|
This is used in combination with a macro in saft.hpp.
|
|||
|
|
- Generischer CUDA Call Wrapper
|
|||
|
|
- <EFBFBD>berpr<EFBFBD>ft die Ergebnisse einer CUDA Operation und wirft eine Exception wenn ein Fehler auftritt
|
|||
|
|
- Das wird wird mit einer Kombination mit einem Makro in saft.hpp genutzt.
|
|||
|
|
*/
|
|||
|
|
//inline // Da performCUDAResultCheck in allen Files genutzt werden soll funktioniert inline und etern nicht zusammen
|
|||
|
|
void performCUDAResultCheck(
|
|||
|
|
cudaError_t result, ///< Result of the CUDA operation.
|
|||
|
|
std::string const & file, ///< Path to the source code file.
|
|||
|
|
int line ///< Line within the source code
|
|||
|
|
)
|
|||
|
|
{
|
|||
|
|
if(result != cudaSuccess)
|
|||
|
|
{
|
|||
|
|
//// printf("A CUDA operation failed in file \"%s\" (line %i): %s \n", file, line, cudaGetErrorString(result).c_str() );
|
|||
|
|
// printf("%s\n", cudaGetErrorString( cudaGetLastError() ) );
|
|||
|
|
|
|||
|
|
//std::string errorMessage = "A CUDA operation failed in file \"" + file + "\" (line " + ail::number_to_string(line) + "): " + std::string(cudaGetErrorString(result));
|
|||
|
|
//std::cout << errorMessage << std::endl;
|
|||
|
|
mexErrMsgTxt("-> Error occurred");
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|