247 lines
7.1 KiB
Plaintext
247 lines
7.1 KiB
Plaintext
#include "Function2D.cuh"
|
|
|
|
|
|
#include <cstdio>
|
|
#include <thrust/reduce.h>
|
|
#include <thrust/execution_policy.h>
|
|
#include <cuda_runtime.h>
|
|
#include <math.h>
|
|
|
|
#include "CudaMatrix.h"
|
|
#include "buildMatrix.cuh"
|
|
#include "Sparse.h"
|
|
|
|
|
|
namespace {
|
|
const int THREADS_PER_BLOCK = 256;
|
|
}
|
|
|
|
|
|
|
|
__device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc)
|
|
{
|
|
float temp1;
|
|
float temp2;
|
|
float temp3;
|
|
|
|
temp1 = aEndPt[0]- aStartPt[0];
|
|
temp2 = aEndPt[1]- aStartPt[1];
|
|
temp3 = aEndPt[2]- aStartPt[2];
|
|
temp1 *= aRes[0];
|
|
temp2 *= aRes[1];
|
|
temp3 *= aRes[2];
|
|
temp1 = temp1*temp1;
|
|
temp2 = temp2*temp2;
|
|
temp3 = temp3*temp3;
|
|
return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc;
|
|
}
|
|
|
|
__global__ struct b3dStruct{
|
|
int max, f, pathlen;
|
|
float x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
|
|
};
|
|
__device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) {
|
|
aOut.x = startPoint[0];
|
|
aOut.y = startPoint[1];
|
|
aOut.z = startPoint[2];
|
|
|
|
float dx = endPoint[0] - aOut.x;
|
|
float dy = endPoint[1] - aOut.y;
|
|
float dz = endPoint[2] - aOut.z;
|
|
|
|
aOut.x_inc = (dx < 0) ? -1 : 1;
|
|
float l = abs(dx);
|
|
aOut.y_inc = (dy < 0) ? -1 : 1;
|
|
float m = abs(dy);
|
|
aOut.z_inc = (dz < 0) ? -1 : 1;
|
|
float n = abs(dz);
|
|
aOut.dx2 = l*2;// << 1;
|
|
aOut.dy2 = m*2;// << 1;
|
|
aOut.dz2 = n*2;// << 1;
|
|
float vmax ;
|
|
if ((l >= m) && (l >= n)){
|
|
aOut.max =ceilf(l);
|
|
vmax = l;
|
|
aOut.d = aOut.dx2;
|
|
aOut.dx2 = 0;
|
|
aOut.f = 0;
|
|
}
|
|
else if((m >= l) && (m >= n)){
|
|
aOut.max = ceilf(m);
|
|
vmax = m;
|
|
aOut.d = aOut.dy2;
|
|
aOut.dy2 = 0;
|
|
aOut.f = 1;
|
|
}
|
|
else{
|
|
aOut.max = ceilf(n);
|
|
vmax = n;
|
|
aOut.d = aOut.dz2;
|
|
aOut.dz2 = 0;
|
|
aOut.f = 2;
|
|
}
|
|
aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax);
|
|
aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax);
|
|
aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax);
|
|
}
|
|
|
|
/**
|
|
* @brief 计算traceStraightRayBresenham的pathlen和weight的核函数
|
|
*
|
|
* @param aStratPt
|
|
* @param aEndPt
|
|
* @param aSize
|
|
* @param aRes
|
|
* @param aOuts
|
|
* @return _
|
|
*/
|
|
__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){
|
|
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
|
|
if (idx > aSize)return;
|
|
getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]);
|
|
|
|
aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max);
|
|
if (idx == 196){
|
|
int v=0;
|
|
for (int i =0;i<196; i++) {
|
|
v+=aOuts[i].max;
|
|
}
|
|
}
|
|
};
|
|
|
|
__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){
|
|
aPath[0] = aStructs[0].max;
|
|
for (int i=1; i<aSize; i++) {
|
|
aPath[i] =aStructs[i].max+aPath[i-1];
|
|
}
|
|
}
|
|
|
|
__device__ float convertToLinearIndices(const float3& aDims, const float3& aPath){
|
|
float matrixSize_1 = aDims.x;
|
|
float matrixSize_2 = aDims.y;
|
|
float coord_col1 = aPath.x;
|
|
float coord_col2Sub1 = aPath.y - 1;
|
|
float coord_col3Sub1 = aPath.z - 1;
|
|
return coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 - 1 ;
|
|
}
|
|
|
|
__global__ void calcRayBLPathKernel(struct b3dStruct* aInstruct,
|
|
float* dims, unsigned int * aOffset,
|
|
float* aJOut, unsigned int aSize) {
|
|
unsigned int idx = blockIdx.x;
|
|
if (idx > aSize)return;
|
|
b3dStruct* in = aInstruct+idx;
|
|
int f;
|
|
float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z;
|
|
x = in->x;
|
|
y = in->y;
|
|
z = in->z;
|
|
x_inc = in->x_inc;
|
|
y_inc = in->y_inc;
|
|
z_inc = in->z_inc;
|
|
|
|
dx2 = in->dx2;
|
|
dy2 = in->dy2;
|
|
dz2 = in->dz2;
|
|
d = in->d;
|
|
|
|
err_0 = in->err_0;
|
|
err_1 = in->err_1;
|
|
err_2 = in->err_2;
|
|
max = in->max;
|
|
f = in->f;
|
|
|
|
float3 dims3{dims[0],dims[1],dims[2]};
|
|
float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]);
|
|
for (float i = 0; i < max; i++) {
|
|
output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)});
|
|
// output_ptr[0]=i;
|
|
|
|
output_ptr ++;
|
|
if (err_0 > 0) {
|
|
x += x_inc;
|
|
err_0 -= d;
|
|
}
|
|
if (err_1 > 0) {
|
|
y += y_inc;
|
|
err_1 -= d;
|
|
}
|
|
if (err_2 > 0) {
|
|
z += z_inc;
|
|
err_2 -= d;
|
|
}
|
|
err_0 += dx2;
|
|
err_1 += dy2;
|
|
err_2 += dz2;
|
|
if (f == 0) x += x_inc;
|
|
if (f == 1) y += y_inc;
|
|
if (f == 2) z += z_inc;
|
|
}
|
|
}
|
|
|
|
__global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){
|
|
b3dStruct& s = aInstructs[blockIdx.x];
|
|
__shared__ unsigned int beginIdx;
|
|
__shared__ unsigned int length;
|
|
__shared__ float weight;
|
|
|
|
if (threadIdx.x == 0){
|
|
beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1];
|
|
length = s.max;
|
|
weight = s.weight;
|
|
}
|
|
__syncthreads();
|
|
|
|
for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) {
|
|
aIOut[beginIdx + threadIdx.x + i] = blockIdx.x;
|
|
aSOut[beginIdx + threadIdx.x + i] = weight;
|
|
}
|
|
}
|
|
|
|
Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList,
|
|
const Aurora::CudaMatrix &receiverList,
|
|
Aurora::CudaMatrix& res,
|
|
const Aurora::CudaMatrix &dims, bool BENT,
|
|
const Aurora::CudaMatrix &potentialMap){
|
|
Recon::BuildMatrixResult result;
|
|
unsigned int numDims = senderList.getDimSize(0);
|
|
unsigned int nTotalRays = senderList.getDimSize(1);
|
|
b3dStruct* structList= nullptr;
|
|
cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays);
|
|
int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(
|
|
senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList);
|
|
cudaDeviceSynchronize();
|
|
|
|
unsigned int* offset= nullptr;
|
|
cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays);
|
|
calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays);
|
|
cudaDeviceSynchronize();
|
|
|
|
unsigned int size;
|
|
cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost);
|
|
|
|
float* iData = nullptr;
|
|
float* jData = nullptr;
|
|
float* sData = nullptr;
|
|
|
|
cudaMalloc((void**)&iData, sizeof(float)*size);
|
|
cudaMalloc((void**)&jData, sizeof(float)*size);
|
|
cudaMalloc((void**)&sData, sizeof(float)*size);
|
|
|
|
calcRayBLPathKernel<<<nTotalRays,1>>>(structList, dims.getData(), offset, jData, nTotalRays);
|
|
fillIAndSKernel<<<nTotalRays,THREADS_PER_BLOCK>>>(structList, offset, iData, sData);
|
|
cudaDeviceSynchronize();
|
|
cudaFree(structList);
|
|
|
|
auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1);
|
|
auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1);
|
|
auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1);
|
|
result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0));
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|