Update buildMatrix and solveParameterIterator speed up.

This commit is contained in:
sunwen
2023-12-26 15:21:27 +08:00
parent 410d657fe7
commit c40dc8e938
5 changed files with 299 additions and 54 deletions

View File

@@ -0,0 +1,246 @@
#include "Function2D.cuh"
#include <cstdio>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <cuda_runtime.h>
#include <math.h>
#include "CudaMatrix.h"
#include "buildMatrix.cuh"
#include "Sparse.h"
namespace {
const int THREADS_PER_BLOCK = 256;
}
__device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc)
{
float temp1;
float temp2;
float temp3;
temp1 = aEndPt[0]- aStartPt[0];
temp2 = aEndPt[1]- aStartPt[1];
temp3 = aEndPt[2]- aStartPt[2];
temp1 *= aRes[0];
temp2 *= aRes[1];
temp3 *= aRes[2];
temp1 = temp1*temp1;
temp2 = temp2*temp2;
temp3 = temp3*temp3;
return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc;
}
__global__ struct b3dStruct{
int max, f, pathlen;
float x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
};
__device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) {
aOut.x = startPoint[0];
aOut.y = startPoint[1];
aOut.z = startPoint[2];
float dx = endPoint[0] - aOut.x;
float dy = endPoint[1] - aOut.y;
float dz = endPoint[2] - aOut.z;
aOut.x_inc = (dx < 0) ? -1 : 1;
float l = abs(dx);
aOut.y_inc = (dy < 0) ? -1 : 1;
float m = abs(dy);
aOut.z_inc = (dz < 0) ? -1 : 1;
float n = abs(dz);
aOut.dx2 = l*2;// << 1;
aOut.dy2 = m*2;// << 1;
aOut.dz2 = n*2;// << 1;
float vmax ;
if ((l >= m) && (l >= n)){
aOut.max =ceilf(l);
vmax = l;
aOut.d = aOut.dx2;
aOut.dx2 = 0;
aOut.f = 0;
}
else if((m >= l) && (m >= n)){
aOut.max = ceilf(m);
vmax = m;
aOut.d = aOut.dy2;
aOut.dy2 = 0;
aOut.f = 1;
}
else{
aOut.max = ceilf(n);
vmax = n;
aOut.d = aOut.dz2;
aOut.dz2 = 0;
aOut.f = 2;
}
aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax);
aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax);
aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax);
}
/**
* @brief 计算traceStraightRayBresenham的pathlen和weight的核函数
*
* @param aStratPt
* @param aEndPt
* @param aSize
* @param aRes
* @param aOuts
* @return _
*/
__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx > aSize)return;
getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]);
aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max);
if (idx == 196){
int v=0;
for (int i =0;i<196; i++) {
v+=aOuts[i].max;
}
}
};
__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){
aPath[0] = aStructs[0].max;
for (int i=1; i<aSize; i++) {
aPath[i] =aStructs[i].max+aPath[i-1];
}
}
__device__ float convertToLinearIndices(const float3& aDims, const float3& aPath){
float matrixSize_1 = aDims.x;
float matrixSize_2 = aDims.y;
float coord_col1 = aPath.x;
float coord_col2Sub1 = aPath.y - 1;
float coord_col3Sub1 = aPath.z - 1;
return coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 - 1 ;
}
__global__ void calcRayBLPathKernel(struct b3dStruct* aInstruct,
float* dims, unsigned int * aOffset,
float* aJOut, unsigned int aSize) {
unsigned int idx = blockIdx.x;
if (idx > aSize)return;
b3dStruct* in = aInstruct+idx;
int f;
float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z;
x = in->x;
y = in->y;
z = in->z;
x_inc = in->x_inc;
y_inc = in->y_inc;
z_inc = in->z_inc;
dx2 = in->dx2;
dy2 = in->dy2;
dz2 = in->dz2;
d = in->d;
err_0 = in->err_0;
err_1 = in->err_1;
err_2 = in->err_2;
max = in->max;
f = in->f;
float3 dims3{dims[0],dims[1],dims[2]};
float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]);
for (float i = 0; i < max; i++) {
output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)});
// output_ptr[0]=i;
output_ptr ++;
if (err_0 > 0) {
x += x_inc;
err_0 -= d;
}
if (err_1 > 0) {
y += y_inc;
err_1 -= d;
}
if (err_2 > 0) {
z += z_inc;
err_2 -= d;
}
err_0 += dx2;
err_1 += dy2;
err_2 += dz2;
if (f == 0) x += x_inc;
if (f == 1) y += y_inc;
if (f == 2) z += z_inc;
}
}
__global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){
b3dStruct& s = aInstructs[blockIdx.x];
__shared__ unsigned int beginIdx;
__shared__ unsigned int length;
__shared__ float weight;
if (threadIdx.x == 0){
beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1];
length = s.max;
weight = s.weight;
}
__syncthreads();
for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) {
aIOut[beginIdx + threadIdx.x + i] = blockIdx.x;
aSOut[beginIdx + threadIdx.x + i] = weight;
}
}
Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList,
const Aurora::CudaMatrix &receiverList,
Aurora::CudaMatrix& res,
const Aurora::CudaMatrix &dims, bool BENT,
const Aurora::CudaMatrix &potentialMap){
Recon::BuildMatrixResult result;
unsigned int numDims = senderList.getDimSize(0);
unsigned int nTotalRays = senderList.getDimSize(1);
b3dStruct* structList= nullptr;
cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays);
int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(
senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList);
cudaDeviceSynchronize();
unsigned int* offset= nullptr;
cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays);
calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays);
cudaDeviceSynchronize();
unsigned int size;
cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost);
float* iData = nullptr;
float* jData = nullptr;
float* sData = nullptr;
cudaMalloc((void**)&iData, sizeof(float)*size);
cudaMalloc((void**)&jData, sizeof(float)*size);
cudaMalloc((void**)&sData, sizeof(float)*size);
calcRayBLPathKernel<<<nTotalRays,1>>>(structList, dims.getData(), offset, jData, nTotalRays);
fillIAndSKernel<<<nTotalRays,THREADS_PER_BLOCK>>>(structList, offset, iData, sData);
cudaDeviceSynchronize();
cudaFree(structList);
auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1);
auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1);
auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1);
result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0));
return result;
}

View File

@@ -0,0 +1,12 @@
#ifndef __BUILDMATRIX_CUH__
#define __BUILDMATRIX_CUH__
#include "buildMatrix.h"
namespace Recon {
BuildMatrixResult buildMatrix(const Aurora::CudaMatrix &senderList,
const Aurora::CudaMatrix &receiverList,
Aurora::CudaMatrix& res,
const Aurora::CudaMatrix &dims, bool BENT,
const Aurora::CudaMatrix &potentialMap);
}
#endif // __BUILDMATRIX_H__

View File

@@ -9,6 +9,7 @@
#include "CudaEnvInit.h" #include "CudaEnvInit.h"
#include "log/notify.h" #include "log/notify.h"
#include "transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.h" #include "transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.h"
#include "transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh"
#include "src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h" #include "src/transmissionReconstruction/reconstruction/solvingEquationSystem/solve.h"
#include <algorithm> #include <algorithm>
@@ -194,8 +195,9 @@ namespace Recon {
BuildMatrixResult buildMatrixR; BuildMatrixResult buildMatrixR;
for(int iter=1; iter<=numIter; ++iter) for(int iter=1; iter<=numIter; ++iter)
{ {
auto resDevice = res.toDeviceMatrix();
//1200ms //1200ms
buildMatrixR = buildMatrix(senderList, receiverList, res, dims, bentRecon && (iter!=1), potentialMap); buildMatrixR = buildMatrix(senderList.toDeviceMatrix(), receiverList.toDeviceMatrix(), resDevice, dims.toDeviceMatrix(), bentRecon && (iter!=1), potentialMap.toDeviceMatrix());
if(!data.isNull() && bentRecon && iter != numIter) if(!data.isNull() && bentRecon && iter != numIter)
{ {
//与默认配置bentRecon不符暂不实现todo //与默认配置bentRecon不符暂不实现todo
@@ -229,51 +231,21 @@ namespace Recon {
allHitMaps.push_back(buildMatrixR.hitmap); allHitMaps.push_back(buildMatrixR.hitmap);
} }
std::promise<Matrix> sosPromise;
std::future<Matrix> sosFuture = sosPromise.get_future();
std::promise<Matrix> attPromise;
std::future<Matrix> attFuture = attPromise.get_future();
solveParameterIteratorFunctionType solveParameterIteratorFunctionPointer = &solveParameterIterator;
slownessToSOSFunctionType slownessToSOSFunctionPointer = &slownessToSOS;
std::thread sosThread;
std::thread attThread;
if(!data.isNull()) if(!data.isNull())
{ {
//1500ms //1500ms
sosThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &b, &dims, &SOS_IN_WATER, &sosPromise]() Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0];
{
//auto bCopy = b.deepCopy();
//auto dimCopy = dims.deepCopy();
Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, b, dims, false, transParams::nonNeg, 0)[0][0];
result = slownessToSOSFunctionPointer(result, SOS_IN_WATER);
sosPromise.set_value(result);
});
//Matrix sosValue = solveParameterIterator(buildMatrixR.M, b, dims, false, transParams::nonNeg)[0][0];
//1ms //1ms
//result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ; result.outSOS = slownessToSOS(sosValue, SOS_IN_WATER) ;
} }
if(!dataAtt.isNull()) if(!dataAtt.isNull())
{ {
attThread = std::thread([solveParameterIteratorFunctionPointer, slownessToSOSFunctionPointer, &buildMatrixR, &bAtt, &dims, &attPromise]()
{
//auto bAttCopy = bAtt.deepCopy();
//auto dimCopy = dims.deepCopy();
Matrix result = solveParameterIteratorFunctionPointer(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg, 1)[0][0];
result = result / 100;
attPromise.set_value(result);
});
//1500ms //1500ms
//Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0]; Matrix attValue = solveParameterIterator(buildMatrixR.M, bAtt, dims, false, transParams::nonNeg)[0][0];
//1ms //1ms
//result.outATT = attValue/100 ; result.outATT = attValue/100 ;
} }
sosThread.join();
attThread.join();
result.outSOS = sosFuture.get();
result.outATT = attFuture.get();
Recon::notifyProgress(10 + 10 * (iter/numIter)); Recon::notifyProgress(10 + 10 * (iter/numIter));
} }

View File

@@ -16,28 +16,33 @@
#include <Spectra/MatOp/SparseGenMatProd.h> #include <Spectra/MatOp/SparseGenMatProd.h>
#include <Eigen/Core> #include <Eigen/Core>
#include <Eigen/SparseCore> #include <Eigen/SparseCore>
#include <Eigen/Eigenvalues>
#include <sys/time.h>
namespace Recon namespace Recon
{ {
bool isEigsFinished = false;
bool isEigsStatus = false;
double eigs(Aurora::Sparse& aM) float eigs(Aurora::Sparse& aM)
{ {
double result = NAN;
size_t size = aM.getM(); size_t size = aM.getM();
if(size < aM.getN()) if(size < aM.getN())
{ {
size = aM.getN(); size = aM.getN();
} }
Eigen::SparseMatrix<double> M(size, size); Eigen::SparseMatrix<double> M(size,size);
std::vector<Eigen::Triplet<double>> triplets;
Aurora::Matrix rows = aM.getRowVector(); Aurora::Matrix rows = aM.getRowVector();
Aurora::Matrix columns = aM.getColVector(); Aurora::Matrix columns = aM.getColVector();
Aurora::Matrix values = aM.getValVector(); Aurora::Matrix values = aM.getValVector();
std::vector<Eigen::Triplet<double>> triplets(rows.getDataSize());
#pragma omp parallel for
for (int i = 0; i < rows.getDataSize(); ++i) for (int i = 0; i < rows.getDataSize(); ++i)
{ {
triplets.push_back(Eigen::Triplet<double>(rows[i], columns[i], values[i])); triplets[i] = Eigen::Triplet<double>(rows[i], columns[i], values[i]);
} }
M.setFromTriplets(triplets.begin(), triplets.end()); M.setFromTriplets(triplets.begin(), triplets.end());
float result;
Spectra::SparseGenMatProd<double> op(M); Spectra::SparseGenMatProd<double> op(M);
Spectra::GenEigsSolver<Spectra::SparseGenMatProd<double>> eigs(op, 1, 6); Spectra::GenEigsSolver<Spectra::SparseGenMatProd<double>> eigs(op, 1, 6);
eigs.init(); eigs.init();
@@ -49,14 +54,24 @@ namespace Recon
std::complex<double> complex = evalues[0]; std::complex<double> complex = evalues[0];
result = complex.real(); result = complex.real();
} }
isEigsFinished = true;
if (result> 1 + 1e-10)
{
isEigsStatus = true;
}
return result; return result;
} }
void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n){ void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n)
bool isreal = M.getValVector().getValueType() == Aurora::Normal; {
auto s2 = eigs(M); float s2 = 0.0;
if (s2> 1 + 1e-10) if(!isEigsFinished)
{
s2 = eigs(M);
return;;
}
if (isEigsStatus)
{ {
b = b / sqrt(s2); b = b / sqrt(s2);
M.getValVector() = M.getValVector() / sqrt( s2); M.getValVector() = M.getValVector() / sqrt( s2);

View File

@@ -7,7 +7,7 @@ namespace Recon {
void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n); void checkAndScale(Aurora::Sparse& M, Aurora::Matrix& b,size_t n);
double eigs(Aurora::Sparse& aM); float eigs(Aurora::Sparse& aM);
Aurora::Matrix callTval3(Aurora::Sparse &M, Aurora::Matrix &b, Aurora::Matrix callTval3(Aurora::Sparse &M, Aurora::Matrix &b,
const Aurora::Matrix &dims, int device, const Aurora::Matrix &dims, int device,