feat: Add Profile log to SAFT

This commit is contained in:
kradchen
2024-12-03 13:43:38 +08:00
parent c83c161bc6
commit 3f69b3bb98
5 changed files with 50 additions and 5 deletions

View File

@@ -340,8 +340,10 @@ void multithreaded_processing(float *aScan_ptr, ///< AScan-Daten
SPDLOG_INFO("Start GPU execute!"); SPDLOG_INFO("Start GPU execute!");
for (j = 0; j < num_devices_factor; j++) for (j = 0; j < num_devices_factor; j++)
// for (j = 0; j < 1; j++)
{ {
for (k = 0; k < selectedNumberGPUs; k++) for (k = 0; k < selectedNumberGPUs; k++)
// for (k = 0; k < 1; k++)
{ {
// new async threads // new async threads
futures[k] = std::async(std::forward<std::function<void(void *)>>(thread_function), futures[k] = std::async(std::forward<std::function<void(void *)>>(thread_function),
@@ -350,6 +352,7 @@ void multithreaded_processing(float *aScan_ptr, ///< AScan-Daten
// Synchronization and termination ------------------------------------------------------------------------------------------------------------------- // Synchronization and termination -------------------------------------------------------------------------------------------------------------------
for (k = 0; k < selectedNumberGPUs; k++) for (k = 0; k < selectedNumberGPUs; k++)
// for (k = 0; k < 1; k++)
{ {
// new async threads // new async threads
futures[k].wait(); // advantage: async are packaged tasks after c++ with os handling, and consistency handling (if destructor is called, it executes task) futures[k].wait(); // advantage: async are packaged tasks after c++ with os handling, and consistency handling (if destructor is called, it executes task)

View File

@@ -2,6 +2,9 @@
#include "rayTracing.cuh" #include "rayTracing.cuh"
#include "saftKernel.cuh" #include "saftKernel.cuh"
#include "saft.hpp" #include "saft.hpp"
// #include "spdlog/spdlog.h"
#define SPDLOG_INFO(...)
void SAFTHandler::precalculateAscanIndex_usePaths(int ascanIndex_i, int aScanWindowSize, int currentSpeedOfSoundZLayer, int maxFeasibleSosZLayerCount) void SAFTHandler::precalculateAscanIndex_usePaths(int ascanIndex_i, int aScanWindowSize, int currentSpeedOfSoundZLayer, int maxFeasibleSosZLayerCount)
{ {
@@ -546,6 +549,7 @@ void SAFTHandler::performSAFT(
CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat)); CUDA_CHECK(cudaBindTextureToArray(&texTableAscanIndexFloat2_3, deviceTextureAscanIndexFloatCuArray[3], &texChannelDescTableAscanIndexFloat));
} }
} }
SPDLOG_INFO("performSAFT init end");
// Vorberechnung der Koordinaten --> schnelleres Bestimmen der Voxelposition // Vorberechnung der Koordinaten --> schnelleres Bestimmen der Voxelposition
float VoxelIncrement = IMAGE_RESOLUTION / SOS_RESOLUTION; float VoxelIncrement = IMAGE_RESOLUTION / SOS_RESOLUTION;
float3 SosVoxelStartPosition; float3 SosVoxelStartPosition;
@@ -564,6 +568,10 @@ void SAFTHandler::performSAFT(
// ==================================================== Blockmode with SOS-value per Ascan // ==================================================== Blockmode with SOS-value per Ascan
// ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction // ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
// ==================================================== 3DVolume Mode with SOS- and ATT-Correction // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
SPDLOG_INFO("perform kernel start");
SPDLOG_INFO("SosVoxelStartPosition {0} {1} {2}",SosVoxelStartPosition.x,SosVoxelStartPosition.y,SosVoxelStartPosition.z);
SPDLOG_INFO("TableAscanIndexAllocationCount = ({0}/{1}) = {2} = {3}", aScanWindowSize, maxAscanIndexArraysInTexture, TableAscanIndexAllocationCount,
ceil(aScanWindowSize/maxAscanIndexArraysInTexture));
if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false)) if ((SOSMode_3DVolume == false) && (ATTMode_3DVolume == false))
{ // ==================================================== Blockmode with SOS-value per Ascan { // ==================================================== Blockmode with SOS-value per Ascan
@@ -571,6 +579,7 @@ void SAFTHandler::performSAFT(
} }
else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false)) else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == false))
{ {
SPDLOG_INFO("perform kernel saftKernelAscanIndex_SOS");
// ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction // ==================================================== 3DVolume Mode with SOS-Correction no ATT-Correction
saftKernelAscanIndex_SOS<<<reducedGridDimensions, reducedBlockDimensions>>>( // , 0, stream>>>( saftKernelAscanIndex_SOS<<<reducedGridDimensions, reducedBlockDimensions>>>( // , 0, stream>>>(
@@ -579,11 +588,13 @@ void SAFTHandler::performSAFT(
(float)aScanWindowSize, // maxAscanIndexArraysInTexture (float)aScanWindowSize, // maxAscanIndexArraysInTexture
maxAscanIndexArraysInTexture, // maxSoSReceiverArrayForTexture maxAscanIndexArraysInTexture, // maxSoSReceiverArrayForTexture
TableAscanIndexAllocationCount, // Anzahl der genutzten Teiltabellen TableAscanIndexAllocationCount, // Anzahl der genutzten Teiltabellen
IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement, blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions, deviceOutput); IMAGE_SIZE_XYZ, SosVoxelStartPosition, IMAGE_RESOLUTION, VoxelIncrement, blockIndexOffset, speedOfSoundZLayer, gridDimensions, blockDimensions, deviceOutput);
SPDLOG_INFO("perform kernel saftKernelAscanIndex_SOS finish");
} }
else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true)) else if ((SOSMode_3DVolume == true) && (ATTMode_3DVolume == true))
{ // ==================================================== 3DVolume Mode with SOS- and ATT-Correction { // ==================================================== 3DVolume Mode with SOS- and ATT-Correction
SPDLOG_INFO("perform kernel saftKernelAscanIndex_SOS_ATT, grid:{0},{1},{2}",reducedGridDimensions.x,reducedGridDimensions.y, reducedGridDimensions.z);
saftKernelAscanIndex_SOS_ATT<<<reducedGridDimensions, reducedBlockDimensions>>>( // , 0, stream>>>( saftKernelAscanIndex_SOS_ATT<<<reducedGridDimensions, reducedBlockDimensions>>>( // , 0, stream>>>(
aScanIndex, aScanIndex,
(float)aScanWindowSize, // maxAscanIndexArraysInTexture (float)aScanWindowSize, // maxAscanIndexArraysInTexture
@@ -595,7 +606,11 @@ void SAFTHandler::performSAFT(
deviceOutput deviceOutput
); );
SPDLOG_INFO("perform kernel saftKernelAscanIndex_SOS_ATT finish");
} }
SPDLOG_INFO("perform kernel finish");
SPDLOG_INFO("perform unbind start");
// Unbind Textures // Unbind Textures
CUDA_CHECK(cudaUnbindTexture(&texRefAscans)); CUDA_CHECK(cudaUnbindTexture(&texRefAscans));
@@ -615,4 +630,5 @@ void SAFTHandler::performSAFT(
CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_3)); CUDA_CHECK(cudaUnbindTexture(&texTableAscanIndexFloat2_3));
} }
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
SPDLOG_INFO("perform unbind finish");
} }

View File

@@ -9,6 +9,8 @@
#include <iostream> #include <iostream>
#include "saft.hpp" #include "saft.hpp"
// #include "spdlog/spdlog.h"
#define SPDLOG_INFO(...)
extern float3 *constEmitterPtr; extern float3 *constEmitterPtr;
extern float3 *constReceiverPtr; extern float3 *constReceiverPtr;
@@ -24,6 +26,7 @@ extern unsigned short *constLookUpGeometryMemoryListReceiverPtr;
*/ */
void SAFTHandler::performCoreReconstruction() void SAFTHandler::performCoreReconstruction()
{ {
// SPDLOG_INFO("performCoreReconstruction start!");
// Mitlaufender Zeiger fuer Speicherbereich der Outputdaten uebergeben. // Mitlaufender Zeiger fuer Speicherbereich der Outputdaten uebergeben.
double *currentHostOutputAdress = output; // Offset des OutputSpeichers am Anfang = 0 double *currentHostOutputAdress = output; // Offset des OutputSpeichers am Anfang = 0
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
@@ -76,7 +79,7 @@ void SAFTHandler::performCoreReconstruction()
} }
currentZLayerCount = maxFeasibleZLayerCount; // Mit maximal moeglicher Anzahl an Z-Layer rechnen und in Schleife abhaengig von Randbedingungen auf aktuelle maximale Anzahl verringern currentZLayerCount = maxFeasibleZLayerCount; // Mit maximal moeglicher Anzahl an Z-Layer rechnen und in Schleife abhaengig von Randbedingungen auf aktuelle maximale Anzahl verringern
SPDLOG_INFO("main loop start!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
// performCoreReconstruction - While-Loop over all Z-Layers~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // performCoreReconstruction - While-Loop over all Z-Layers~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
while ((zOffset < IMAGE_SIZE_XYZ.z)) while ((zOffset < IMAGE_SIZE_XYZ.z))
@@ -129,19 +132,25 @@ void SAFTHandler::performCoreReconstruction()
zSoSOffset_old = zSoSOffset; zSoSOffset_old = zSoSOffset;
// SoS-Pfade für Z-Layer vorberechnen ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // SoS-Pfade für Z-Layer vorberechnen ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SPDLOG_INFO("precalculateAverageSpeedOfSound Emitter start!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
// Emitter ----------------------------------------------------------------------- Emitter // Emitter ----------------------------------------------------------------------- Emitter
precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 0, emitter_list_Size); precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 0, emitter_list_Size);
SPDLOG_INFO("precalculateAverageSpeedOfSound Emitter finish!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
SPDLOG_INFO("precalculateAverageSpeedOfSound Receiver start!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
// Receiver ----------------------------------------------------------------------- Receiver // Receiver ----------------------------------------------------------------------- Receiver
precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 1, receiver_list_Size); precalculateAverageSpeedOfSound(currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount, 1, receiver_list_Size);
SPDLOG_INFO("precalculateAverageSpeedOfSound finish!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
} }
// Go overall A-Scanblocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Go overall A-Scanblocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// PrecalAndPerformSAFTAllAscans - Berechne Performanz fuer Durchlauf von allen Ascans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // PrecalAndPerformSAFTAllAscans - Berechne Performanz fuer Durchlauf von allen Ascans ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// aScanWindowSize auf maximal moegliche Anzahl setzen // aScanWindowSize auf maximal moegliche Anzahl setzen
std::size_t aScanWindowSize = (maxAscanIndexArraysInTexture * maxSupportedTexturesForAscanIndex); // Anzahl maximaler Ascans die auf einmal verarbeitet werden kann. std::size_t aScanWindowSize = (maxAscanIndexArraysInTexture * maxSupportedTexturesForAscanIndex); // Anzahl maximaler Ascans die auf einmal verarbeitet werden kann.
int ascanIndexBatchOffset = 0; int ascanIndexBatchOffset = 0;
while (ascanIndexBatchOffset < aScanCount) while (ascanIndexBatchOffset < aScanCount)
{ // Alle Emitter oder Receiver in der Liste von Matlab durchgehen { // Alle Emitter oder Receiver in der Liste von Matlab durchgehen
@@ -160,15 +169,27 @@ void SAFTHandler::performCoreReconstruction()
// neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen // neededAscanBatchCount = Anzahl an benoetigten Durchlaeufen des SAFTs um alle Ascans abarbeiten zu koennen
// AscanIndex Emitter -> Voxel --> Receiver vorberechnen fuer AscansBatchSize // AscanIndex Emitter -> Voxel --> Receiver vorberechnen fuer AscansBatchSize
precalculateAscanIndex_usePaths(ascanIndexBatchOffset, aScanWindowSize, currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount); //, deviceTextureAscanIndexFloatCuArray); SPDLOG_INFO("precalculateAscanIndex_usePaths start!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
precalculateAscanIndex_usePaths(ascanIndexBatchOffset, aScanWindowSize, currentSpeedOfSoundZLayer, maxFeasibleSosZLayerCount); //, deviceTextureAscanIndexFloatCuArray);
SPDLOG_INFO("precalculateAscanIndex_usePaths finish!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
// if (copyed){
// // SPDLOG_INFO("cudaStreamSynchronize start!");
// cudaStreamSynchronize(stream);
// // SPDLOG_INFO("cudaStreamSynchronize finish!");
// }
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
if (currentZLayerCount > 0) // bugprevention because windows and linux would stop execution for a bug - Issue 100 if (currentZLayerCount > 0) // bugprevention because windows and linux would stop execution for a bug - Issue 100
{
SPDLOG_INFO("performSAFT start!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
performSAFT(ascanIndexBatchOffset, aScanWindowSize, IMAGE_SIZE_XYZ, SOSGrid_XYZ, blockIndexOffset, static_cast<int>(currentOutputZLayerVoxelCount), performSAFT(ascanIndexBatchOffset, aScanWindowSize, IMAGE_SIZE_XYZ, SOSGrid_XYZ, blockIndexOffset, static_cast<int>(currentOutputZLayerVoxelCount),
static_cast<int>(currentSpeedOfSoundZLayer), static_cast<int>(partialSpeedOfSoundVoxelCount), static_cast<int>(maxFeasibleSosZLayerCount), static_cast<int>(currentSpeedOfSoundZLayer), static_cast<int>(partialSpeedOfSoundVoxelCount), static_cast<int>(maxFeasibleSosZLayerCount),
static_cast<int>(currentEmIndexUsedForAscanIndexCalculation), windowGridDimensions, genericSAFTGridDimensions, genericSAFTBlockDimensions, deviceSpeedOfSoundField, static_cast<int>(currentEmIndexUsedForAscanIndexCalculation), windowGridDimensions, genericSAFTGridDimensions, genericSAFTBlockDimensions, deviceSpeedOfSoundField,
deviceAScansCuArray[0]); deviceAScansCuArray[0]);
SPDLOG_INFO("performSAFT finish!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
}
// , calculationStream); // , calculationStream);
ascanIndexBatchOffset += aScanWindowSize; ascanIndexBatchOffset += aScanWindowSize;
@@ -212,6 +233,7 @@ void SAFTHandler::performCoreReconstruction()
currentZLayerCount = maxFeasibleZLayerCount; currentZLayerCount = maxFeasibleZLayerCount;
}; // End While-Loop over all Z-Layer }; // End While-Loop over all Z-Layer
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
SPDLOG_INFO("main loop finish!GPUMemoryFree:{0}",memoryGPUfreeInGiga());
} }
/** /**

View File

@@ -223,6 +223,9 @@ std::size_t memoryGPUfree()
CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory)); CUDA_CHECK(cudaMemGetInfo(&freeMemory, &totalMemory));
return freeMemory; return freeMemory;
} }
float memoryGPUfreeInGiga(){
return (float)memoryGPUfree()/(1024.f*1025.f*1024.f);
}
/** /**
Determine free memory available on the current device. Determine free memory available on the current device.

View File

@@ -397,6 +397,7 @@ class SAFTHandler
extern void memoryCheck(); extern void memoryCheck();
extern std::size_t memoryGPUfree(); extern std::size_t memoryGPUfree();
extern float memoryGPUfreeInGiga();
extern std::size_t memoryGPUtotal(); extern std::size_t memoryGPUtotal();
extern void performCUDAResultCheck(cudaError_t result, std::string const &file, int line); extern void performCUDAResultCheck(cudaError_t result, std::string const &file, int line);