UR/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu

#include "Function2D.cuh"


#include <cstdio>
#include <thrust/reduce.h>
#include <thrust/execution_policy.h>
#include <cuda_runtime.h>
#include <math.h>

#include "CudaMatrix.h"
#include "buildMatrix.cuh"
#include "Sparse.h"


namespace  {
    const int THREADS_PER_BLOCK = 256;
}


__device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc)
{
    float temp1;
    float temp2;
    float temp3;

    temp1 = aEndPt[0]- aStartPt[0];
    temp2 = aEndPt[1]- aStartPt[1];
    temp3 = aEndPt[2]- aStartPt[2];
    temp1 *= aRes[0];
    temp2 *= aRes[1];
    temp3 *= aRes[2];
    temp1 = temp1*temp1;
    temp2 = temp2*temp2;
    temp3 = temp3*temp3;
    return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc;
}

__global__ struct b3dStruct{
    int max, f, pathlen;
    float  x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
};
__device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) {
    aOut.x = startPoint[0];
    aOut.y = startPoint[1];
    aOut.z = startPoint[2];

    float dx = endPoint[0] - aOut.x;
    float dy = endPoint[1] - aOut.y;
    float dz = endPoint[2] - aOut.z;

    aOut.x_inc = (dx < 0) ? -1 : 1;
    float l = abs(dx);
    aOut.y_inc = (dy < 0) ? -1 : 1;
    float m = abs(dy);
    aOut.z_inc = (dz < 0) ? -1 : 1;
    float n = abs(dz);
    aOut.dx2 = l*2;// << 1;
    aOut.dy2 = m*2;// << 1;
    aOut.dz2 = n*2;// << 1;
    float vmax ;
    if ((l >= m) && (l >= n)){
        aOut.max =ceilf(l);
        vmax = l;
        aOut.d = aOut.dx2;
        aOut.dx2 = 0;
        aOut.f = 0;
    }
    else if((m >= l) && (m >= n)){
        aOut.max = ceilf(m);
        vmax = m;
        aOut.d = aOut.dy2;
        aOut.dy2 = 0;
        aOut.f = 1;
    }
    else{
        aOut.max = ceilf(n);
        vmax = n;
        aOut.d = aOut.dz2;
        aOut.dz2 = 0;
        aOut.f = 2;
    }
    aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax);
    aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax);
    aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax);
}

/**
 * @brief 计算traceStraightRayBresenham的pathlen和weight的核函数
 *
 * @param aStratPt
 * @param aEndPt
 * @param aSize
 * @param aRes
 * @param aOuts
 * @return _
 */
__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx > aSize)return;
    getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]);

    aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max);
    if (idx == 196){
        int v=0;
        for (int i =0;i<196; i++) {
            v+=aOuts[i].max;
        }
    }
};

__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){
    aPath[0] = aStructs[0].max;
    for (int i=1; i<aSize; i++) {
        aPath[i] =aStructs[i].max+aPath[i-1];
    }
}

__device__ float convertToLinearIndices(const float3& aDims, const float3& aPath){
    float matrixSize_1 = aDims.x;
    float matrixSize_2 = aDims.y;
    float coord_col1 = aPath.x;
    float coord_col2Sub1 = aPath.y - 1;
    float coord_col3Sub1 = aPath.z - 1;
    return coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 - 1 ;
}

__global__ void calcRayBLPathKernel(struct b3dStruct* aInstruct,
            float* dims, unsigned int * aOffset,
            float* aJOut, unsigned int aSize) {
    unsigned int idx = blockIdx.x;
    if (idx > aSize)return;
    b3dStruct* in = aInstruct+idx;
    int f;
    float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z;
    x = in->x;
    y = in->y;
    z = in->z;
    x_inc = in->x_inc;
    y_inc = in->y_inc;
    z_inc = in->z_inc;

    dx2 = in->dx2;
    dy2 = in->dy2;
    dz2 = in->dz2;
    d = in->d;

    err_0 = in->err_0;
    err_1 = in->err_1;
    err_2 = in->err_2;
    max = in->max;
    f = in->f;

    float3 dims3{dims[0],dims[1],dims[2]};
    float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]);
    for (float i = 0; i < max; i++) {
        output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)});
        // output_ptr[0]=i;

        output_ptr ++;
        if (err_0 > 0) {
          x += x_inc;
          err_0 -= d;
        }
        if (err_1 > 0) {
          y += y_inc;
          err_1 -= d;
        }
        if (err_2 > 0) {
          z += z_inc;
          err_2 -= d;
        }
        err_0 += dx2;
        err_1 += dy2;
        err_2 += dz2;
        if (f == 0) x += x_inc;
        if (f == 1) y += y_inc;
        if (f == 2) z += z_inc;
    }
}

__global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){
    b3dStruct& s = aInstructs[blockIdx.x];
    __shared__ unsigned int beginIdx;
    __shared__ unsigned int length;
    __shared__ float weight;

    if (threadIdx.x == 0){
        beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1];
        length = s.max;
        weight = s.weight;
    }
    __syncthreads();

    for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) {
        aIOut[beginIdx + threadIdx.x + i] = blockIdx.x;
        aSOut[beginIdx + threadIdx.x + i] = weight;
    }
}

Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList,
                                  const Aurora::CudaMatrix &receiverList,
                                  Aurora::CudaMatrix& res,
                                  const Aurora::CudaMatrix &dims, bool BENT,
                                  const Aurora::CudaMatrix &potentialMap){
    Recon::BuildMatrixResult result;
    unsigned int numDims = senderList.getDimSize(0);
    unsigned int nTotalRays = senderList.getDimSize(1);
    b3dStruct* structList= nullptr;
    cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays);
    int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(
        senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList);
    cudaDeviceSynchronize();

    unsigned int* offset= nullptr;
    cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays);
    calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays);
    cudaDeviceSynchronize();

    unsigned int size;
    cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost);

    float* iData = nullptr;
    float* jData = nullptr;
    float* sData = nullptr;

    cudaMalloc((void**)&iData, sizeof(float)*size);
    cudaMalloc((void**)&jData, sizeof(float)*size);
    cudaMalloc((void**)&sData, sizeof(float)*size);

    calcRayBLPathKernel<<<nTotalRays,1>>>(structList, dims.getData(), offset, jData, nTotalRays);
    fillIAndSKernel<<<nTotalRays,THREADS_PER_BLOCK>>>(structList, offset, iData, sData);
    cudaDeviceSynchronize();
    cudaFree(structList);

    auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1);
    auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1);
    auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1);
    result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0));
    return result;

}