#include "Function2D.cuh" #include #include #include #include #include #include "CudaMatrix.h" #include "buildMatrix.cuh" #include "Sparse.h" namespace { const int THREADS_PER_BLOCK = 256; } __device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc) { float temp1; float temp2; float temp3; temp1 = aEndPt[0]- aStartPt[0]; temp2 = aEndPt[1]- aStartPt[1]; temp3 = aEndPt[2]- aStartPt[2]; temp1 *= aRes[0]; temp2 *= aRes[1]; temp3 *= aRes[2]; temp1 = temp1*temp1; temp2 = temp2*temp2; temp3 = temp3*temp3; return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc; } __global__ struct b3dStruct{ int max, f, pathlen; float x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight; }; __device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) { aOut.x = startPoint[0]; aOut.y = startPoint[1]; aOut.z = startPoint[2]; float dx = endPoint[0] - aOut.x; float dy = endPoint[1] - aOut.y; float dz = endPoint[2] - aOut.z; aOut.x_inc = (dx < 0) ? -1 : 1; float l = abs(dx); aOut.y_inc = (dy < 0) ? -1 : 1; float m = abs(dy); aOut.z_inc = (dz < 0) ? -1 : 1; float n = abs(dz); aOut.dx2 = l*2;// << 1; aOut.dy2 = m*2;// << 1; aOut.dz2 = n*2;// << 1; float vmax ; if ((l >= m) && (l >= n)){ aOut.max =ceilf(l); vmax = l; aOut.d = aOut.dx2; aOut.dx2 = 0; aOut.f = 0; } else if((m >= l) && (m >= n)){ aOut.max = ceilf(m); vmax = m; aOut.d = aOut.dy2; aOut.dy2 = 0; aOut.f = 1; } else{ aOut.max = ceilf(n); vmax = n; aOut.d = aOut.dz2; aOut.dz2 = 0; aOut.f = 2; } aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax); aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax); aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax); } /** * @brief 计算traceStraightRayBresenham的pathlen和weight的核函数 * * @param aStratPt * @param aEndPt * @param aSize * @param aRes * @param aOuts * @return _ */ __global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){ unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; if (idx > aSize)return; getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]); aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max); if (idx == 196){ int v=0; for (int i =0;i<196; i++) { v+=aOuts[i].max; } } }; __global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){ aPath[0] = aStructs[0].max; for (int i=1; i aSize)return; b3dStruct* in = aInstruct+idx; int f; float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z; x = in->x; y = in->y; z = in->z; x_inc = in->x_inc; y_inc = in->y_inc; z_inc = in->z_inc; dx2 = in->dx2; dy2 = in->dy2; dz2 = in->dz2; d = in->d; err_0 = in->err_0; err_1 = in->err_1; err_2 = in->err_2; max = in->max; f = in->f; float3 dims3{dims[0],dims[1],dims[2]}; float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]); for (float i = 0; i < max; i++) { output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)}); // output_ptr[0]=i; output_ptr ++; if (err_0 > 0) { x += x_inc; err_0 -= d; } if (err_1 > 0) { y += y_inc; err_1 -= d; } if (err_2 > 0) { z += z_inc; err_2 -= d; } err_0 += dx2; err_1 += dy2; err_2 += dz2; if (f == 0) x += x_inc; if (f == 1) y += y_inc; if (f == 2) z += z_inc; } } __global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){ b3dStruct& s = aInstructs[blockIdx.x]; __shared__ unsigned int beginIdx; __shared__ unsigned int length; __shared__ float weight; if (threadIdx.x == 0){ beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1]; length = s.max; weight = s.weight; } __syncthreads(); for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) { aIOut[beginIdx + threadIdx.x + i] = blockIdx.x; aSOut[beginIdx + threadIdx.x + i] = weight; } } Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList, const Aurora::CudaMatrix &receiverList, Aurora::CudaMatrix& res, const Aurora::CudaMatrix &dims, bool BENT, const Aurora::CudaMatrix &potentialMap){ Recon::BuildMatrixResult result; unsigned int numDims = senderList.getDimSize(0); unsigned int nTotalRays = senderList.getDimSize(1); b3dStruct* structList= nullptr; cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays); int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; calcLenWeightKernel<<>>( senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList); cudaDeviceSynchronize(); unsigned int* offset= nullptr; cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays); calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays); cudaDeviceSynchronize(); unsigned int size; cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost); float* iData = nullptr; float* jData = nullptr; float* sData = nullptr; cudaMalloc((void**)&iData, sizeof(float)*size); cudaMalloc((void**)&jData, sizeof(float)*size); cudaMalloc((void**)&sData, sizeof(float)*size); calcRayBLPathKernel<<>>(structList, dims.getData(), offset, jData, nTotalRays); fillIAndSKernel<<>>(structList, offset, iData, sData); cudaDeviceSynchronize(); cudaFree(structList); auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1); auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1); auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1); result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0)); return result; }