Add cuda build matrix

This commit is contained in:
kradchen
2023-12-26 15:23:41 +08:00
parent 32ad08f8d7
commit 7ad84ff884
3 changed files with 145 additions and 75 deletions

View File

@@ -115,7 +115,6 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
std::vector<float> points; std::vector<float> points;
float i, l, m, n, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z; float i, l, m, n, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
x = startPoint[0]; x = startPoint[0];
y = startPoint[1]; y = startPoint[1];
z = startPoint[2]; z = startPoint[2];
@@ -134,24 +133,28 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
dy2 = m*2;// << 1; dy2 = m*2;// << 1;
dz2 = n*2;// << 1; dz2 = n*2;// << 1;
float maxV; float maxV;
int f = -1;
if ((l >= m) && (l >= n)){ if ((l >= m) && (l >= n)){
maxV = l; maxV = l;
d = dx2; d = dx2;
dx2 = 0; dx2 = 0;
f =0;
} }
else if((m >= l) && (m >= n)){ else if((m >= l) && (m >= n)){
maxV = m; maxV = m;
d = dy2; d = dy2;
dy2 = 0; dy2 = 0;
f = 1;
} }
else{ else{
maxV = n; maxV = n;
d = dz2; d = dz2;
dz2 = 0; dz2 = 0;
f =2;
} }
err_0 = dx2==0?0:(dx2 - maxV); err_0 = f==0?0:(dx2 - maxV);
err_1 = dy2==0?0:(dy2 - maxV); err_1 = f==1?0:(dy2 - maxV);
err_2 = dz2==0?0:(dz2 - maxV); err_2 = f==2?0:(dz2 - maxV);
for (i = 0; i < maxV; i++) { for (i = 0; i < maxV; i++) {
@@ -173,9 +176,9 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
err_0 += dx2; err_0 += dx2;
err_1 += dy2; err_1 += dy2;
err_2 += dz2; err_2 += dz2;
if (err_0 == 0)x += x_inc; if (f == 0)x += x_inc;
if (err_1 == 0)y += y_inc; if (f == 1)y += y_inc;
if (err_2 == 0)z += z_inc; if (f == 2)z += z_inc;
} }
float* output = new float[points.size()]; float* output = new float[points.size()];
@@ -187,8 +190,6 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
Aurora::Matrix b3dMexDouble(const Aurora::Matrix& StartPt, const Aurora::Matrix& endPts) Aurora::Matrix b3dMexDouble(const Aurora::Matrix& StartPt, const Aurora::Matrix& endPts)
{ {
size_t size = 0; size_t size = 0;
size_t size1 = 0;
auto temp = auto temp =
Recon::b3dMexDouble(StartPt.getData(), endPts.getData(), size); Recon::b3dMexDouble(StartPt.getData(), endPts.getData(), size);
return Aurora::Matrix::fromRawData(temp, 3, size / 3); return Aurora::Matrix::fromRawData(temp, 3, size / 3);

View File

@@ -1,9 +1,17 @@
#include "CudaMatrix.h" #include "Function2D.cuh"
#include <cstdio>
#include <thrust/reduce.h> #include <thrust/reduce.h>
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <math.h> #include <math.h>
#include "CudaMatrix.h"
#include "buildMatrix.cuh"
#include "Sparse.h"
namespace { namespace {
const int THREADS_PER_BLOCK = 256; const int THREADS_PER_BLOCK = 256;
} }
@@ -12,59 +20,69 @@ namespace {
__device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc) __device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc)
{ {
float3 temp; float temp1;
temp.x = aEndPt[0]- aStartPt[0]; float temp2;
temp.y = aEndPt[1]- aStartPt[1]; float temp3;
temp.z = aEndPt[2]- aStartPt[2];
temp.x *= aRes[0]; temp1 = aEndPt[0]- aStartPt[0];
temp.y *= aRes[1]; temp2 = aEndPt[1]- aStartPt[1];
temp.z *= aRes[2]; temp3 = aEndPt[2]- aStartPt[2];
temp.x *= temp.x; temp1 *= aRes[0];
temp.y *= temp.y; temp2 *= aRes[1];
temp.z *= temp.x; temp3 *= aRes[2];
return sqrtf(temp.x+temp.y+temp.z)/aPathLenDisc; temp1 = temp1*temp1;
temp2 = temp2*temp2;
temp3 = temp3*temp3;
return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc;
} }
__global__ struct b3dStruct{ __global__ struct b3dStruct{
float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight; int max, f, pathlen;
float x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
}; };
__device__ float * getRayBLen(float *startPoint, float *endPoint, struct b3dStruct* aOut) { __device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) {
aOut.x = startPoint[0];
aOut.y = startPoint[1];
aOut.z = startPoint[2];
aOut->x = startPoint[0]; float dx = endPoint[0] - aOut.x;
aOut->y = startPoint[1]; float dy = endPoint[1] - aOut.y;
aOut->z = startPoint[2]; float dz = endPoint[2] - aOut.z;
float dx = endPoint[0] - aOut->x; aOut.x_inc = (dx < 0) ? -1 : 1;
float dy = endPoint[1] - aOut->y;
float dz = endPoint[2] - aOut->z;
aOut->x_inc = (dx < 0) ? -1 : 1;
float l = abs(dx); float l = abs(dx);
aOut->y_inc = (dy < 0) ? -1 : 1; aOut.y_inc = (dy < 0) ? -1 : 1;
float m = abs(dy); float m = abs(dy);
aOut->z_inc = (dz < 0) ? -1 : 1; aOut.z_inc = (dz < 0) ? -1 : 1;
float n = abs(dz); float n = abs(dz);
aOut->dx2 = l*2;// << 1; aOut.dx2 = l*2;// << 1;
aOut->dy2 = m*2;// << 1; aOut.dy2 = m*2;// << 1;
aOut->dz2 = n*2;// << 1; aOut.dz2 = n*2;// << 1;
float vmax ;
if ((l >= m) && (l >= n)){ if ((l >= m) && (l >= n)){
aOut->max = l; aOut.max =ceilf(l);
aOut->d = aOut->dx2; vmax = l;
aOut->dx2 = 0; aOut.d = aOut.dx2;
aOut.dx2 = 0;
aOut.f = 0;
} }
else if((m >= l) && (m >= n)){ else if((m >= l) && (m >= n)){
aOut->max = m; aOut.max = ceilf(m);
aOut->d = aOut->dy2; vmax = m;
aOut->dy2 = 0; aOut.d = aOut.dy2;
aOut.dy2 = 0;
aOut.f = 1;
} }
else{ else{
aOut->max = n; aOut.max = ceilf(n);
aOut->d = aOut->dz2; vmax = n;
aOut->dz2 = 0; aOut.d = aOut.dz2;
aOut.dz2 = 0;
aOut.f = 2;
} }
aOut->err_0 = aOut->dx2==0?0:(aOut->dx2 - aOut->max); aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax);
aOut->err_1 = aOut->dy2==0?0:(aOut->dy2 - aOut->max); aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax);
aOut->err_2 = aOut->dz2==0?0:(aOut->dz2 - aOut->max); aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax);
} }
/** /**
@@ -77,35 +95,37 @@ __device__ float * getRayBLen(float *startPoint, float *endPoint, struct b3dStru
* @param aOuts * @param aOuts
* @return _ * @return _
*/ */
__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, int aSize,float* aRes, b3dStruct* aOuts ){ __global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx > aSize)return; if (idx > aSize)return;
getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts+idx); getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]);
aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes + idx*3,aOuts[idx].max);
aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max);
}; };
__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,float* aPath, int aSize){ __global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){
aPath[0] = aStructs[0].max; aPath[0] = aStructs[0].max;
for (int i=1; i<aSize; i++) { for (int i=1; i<aSize; i++) {
aPath[i] =aStructs[i].max+aPath[i]-1; aPath[i] =aStructs[i].max+aPath[i-1];
} }
} }
__global__ void convertToLinearIndices(const float3& aDims, const float3& aPath, float* aOut){ __device__ float convertToLinearIndices(const float3& aDims, const float3& aPath){
float matrixSize_1 = aDims.x; float matrixSize_1 = aDims.x;
float matrixSize_2 = aDims.y; float matrixSize_2 = aDims.y;
float coord_col1 = aPath.x; float coord_col1 = aPath.x;
float coord_col2Sub1 = aPath.y - 1; float coord_col2Sub1 = aPath.y - 1;
float coord_col3Sub1 = aPath.z - 1; float coord_col3Sub1 = aPath.z - 1;
*aOut = coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 -1; return coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 - 1 ;
} }
__global__ void calcRayBLPath(struct b3dStruct* aInstruct, __global__ void calcRayBLPathKernel(struct b3dStruct* aInstruct,
float* dims, unsigned int * aOffset, float* aIOut, float* dims, unsigned int * aOffset,
float* aJOut, float* aSOut, unsigned int aSize) { float* aJOut, unsigned int aSize) {
unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; unsigned int idx = blockIdx.x;
if (idx > aSize)return; if (idx > aSize)return;
b3dStruct* in = aInstruct+idx; b3dStruct* in = aInstruct+idx;
int f;
float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z; float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z;
x = in->x; x = in->x;
y = in->y; y = in->y;
@@ -113,15 +133,22 @@ float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
x_inc = in->x_inc; x_inc = in->x_inc;
y_inc = in->y_inc; y_inc = in->y_inc;
z_inc = in->z_inc; z_inc = in->z_inc;
dx2 = in->dx2;
dy2 = in->dy2;
dz2 = in->dz2;
d = in->d;
err_0 = in->err_0; err_0 = in->err_0;
err_1 = in->err_1; err_1 = in->err_1;
err_2 = in->err_2; err_2 = in->err_2;
max = in->max; max = in->max;
f = in->f;
float3 dims3{dims[0],dims[1],dims[2]}; float3 dims3{dims[0],dims[1],dims[2]};
float* output_ptr = aOutput + aOffset[idx]; float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]);
for (float i = 0; i < max; i++) { for (float i = 0; i < max; i++) {
convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)}, output_ptr); output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)});
output_ptr ++; output_ptr ++;
if (err_0 > 0) { if (err_0 > 0) {
x += x_inc; x += x_inc;
@@ -138,32 +165,54 @@ float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
err_0 += dx2; err_0 += dx2;
err_1 += dy2; err_1 += dy2;
err_2 += dz2; err_2 += dz2;
if (err_0 == 0)x += x_inc; if (f == 0) x += x_inc;
if (err_1 == 0)y += y_inc; if (f == 1) y += y_inc;
if (err_2 == 0)z += z_inc; if (f == 2) z += z_inc;
} }
} }
void buildMatrix(const Aurora::CudaMatrix &senderList, __global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){
b3dStruct& s = aInstructs[blockIdx.x];
__shared__ unsigned int beginIdx;
__shared__ unsigned int length;
__shared__ float weight;
if (threadIdx.x == 0){
beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1];
length = s.max;
weight = s.weight;
}
__syncthreads();
for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) {
aIOut[beginIdx + threadIdx.x + i] = blockIdx.x;
aSOut[beginIdx + threadIdx.x + i] = weight;
}
}
Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList,
const Aurora::CudaMatrix &receiverList, const Aurora::CudaMatrix &receiverList,
Aurora::CudaMatrix& res, Aurora::CudaMatrix& res,
const Aurora::CudaMatrix &dims, bool BENT, const Aurora::CudaMatrix &dims, bool BENT,
const Aurora::CudaMatrix &potentialMap){ const Aurora::CudaMatrix &potentialMap){
Recon::BuildMatrixResult result;
unsigned int numDims = senderList.getDimSize(0); unsigned int numDims = senderList.getDimSize(0);
unsigned int nTotalRays = senderList.getDimSize(1); unsigned int nTotalRays = senderList.getDimSize(1);
unsigned int cnt = 0;
unsigned int coeffIndex = 0;
unsigned int rayCount = 0;
b3dStruct* structList= nullptr; b3dStruct* structList= nullptr;
cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays); cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays);
int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>( calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(
senderList.getData(), receiverList.getData(), res.getData(),structList); senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList);
cudaDeviceSynchronize();
unsigned int* offset= nullptr; unsigned int* offset= nullptr;
cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays); cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays);
calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays); calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays);
cudaDeviceSynchronize();
unsigned int size; unsigned int size;
cudaMemcpy(&size, offset+(nTotalRays-1), 1, cudaMemcpyDeviceToHost); cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost);
float* iData = nullptr; float* iData = nullptr;
float* jData = nullptr; float* jData = nullptr;
float* sData = nullptr; float* sData = nullptr;
@@ -172,9 +221,17 @@ void buildMatrix(const Aurora::CudaMatrix &senderList,
cudaMalloc((void**)&jData, sizeof(float)*size); cudaMalloc((void**)&jData, sizeof(float)*size);
cudaMalloc((void**)&sData, sizeof(float)*size); cudaMalloc((void**)&sData, sizeof(float)*size);
calcRayBLPath(structList, dims.getData(), offset, jData); calcRayBLPathKernel<<<nTotalRays,1>>>(structList, dims.getData(), offset, jData, nTotalRays);
fillIAndSKernel<<<nTotalRays,THREADS_PER_BLOCK>>>(structList, offset, iData, sData);
cudaDeviceSynchronize();
cudaFree(structList); cudaFree(structList);
auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1);
auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1);
auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1);
result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0));
return result;
} }

View File

@@ -0,0 +1,12 @@
#ifndef __BUILDMATRIX_CUH__
#define __BUILDMATRIX_CUH__
#include "buildMatrix.h"
namespace Recon {
BuildMatrixResult buildMatrix(const Aurora::CudaMatrix &senderList,
const Aurora::CudaMatrix &receiverList,
Aurora::CudaMatrix& res,
const Aurora::CudaMatrix &dims, bool BENT,
const Aurora::CudaMatrix &potentialMap);
}
#endif // __BUILDMATRIX_H__