diff --git a/src/transmissionReconstruction/reconstruction/buildMatrix/Bresenham.cpp b/src/transmissionReconstruction/reconstruction/buildMatrix/Bresenham.cpp
index 1308ec4..420cf3b 100644
--- a/src/transmissionReconstruction/reconstruction/buildMatrix/Bresenham.cpp
+++ b/src/transmissionReconstruction/reconstruction/buildMatrix/Bresenham.cpp
@@ -115,7 +115,6 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
     std::vector<float> points; 
     
     float i, l, m, n, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
-           
     x = startPoint[0];
     y = startPoint[1];
     z = startPoint[2];
@@ -134,24 +133,28 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
     dy2 = m*2;// << 1;
     dz2 = n*2;// << 1;
     float maxV;
+    int f = -1;
     if ((l >= m) && (l >= n)){
         maxV = l;
         d = dx2;
         dx2 = 0;
+        f =0;
     }
     else if((m >= l) && (m >= n)){
         maxV = m;
         d = dy2;
         dy2 = 0;
+        f = 1;
     }
     else{
         maxV = n;
         d = dz2;
         dz2 = 0;
+        f =2;
     }
-    err_0 = dx2==0?0:(dx2 - maxV);
-    err_1 = dy2==0?0:(dy2 - maxV);
-    err_2 = dz2==0?0:(dz2 - maxV);
+    err_0 = f==0?0:(dx2 - maxV);
+    err_1 = f==1?0:(dy2 - maxV);
+    err_2 = f==2?0:(dz2 - maxV);
 
     for (i = 0; i < maxV; i++) {
 
@@ -173,9 +176,9 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
         err_0 += dx2;
         err_1 += dy2;
         err_2 += dz2;
-        if (err_0 == 0)x += x_inc;
-        if (err_1 == 0)y += y_inc;
-        if (err_2 == 0)z += z_inc;
+        if (f == 0)x += x_inc;
+        if (f == 1)y += y_inc;
+        if (f == 2)z += z_inc;
 
     }
     float* output = new float[points.size()];
@@ -187,11 +190,9 @@ float * b3dMexDouble2(float *startPoint,float *endPoint, size_t& outputSize) {
 Aurora::Matrix b3dMexDouble(const Aurora::Matrix& StartPt, const Aurora::Matrix& endPts)
 {
     size_t size = 0;
-    size_t size1 = 0;
-
     auto temp  =
         Recon::b3dMexDouble(StartPt.getData(), endPts.getData(), size);
-    return Aurora::Matrix::fromRawData(temp, 3, size / 3);
+        return Aurora::Matrix::fromRawData(temp, 3, size / 3);
 }
 
 float * b3dMexDouble(float *startPoint,float *endPoint, size_t& outputSize) {
diff --git a/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu
index b2ac2d7..3c944de 100644
--- a/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu
+++ b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cu
@@ -1,9 +1,17 @@
-#include "CudaMatrix.h"
+#include "Function2D.cuh"
+
+
+#include <cstdio>
 #include <thrust/reduce.h>
 #include <thrust/execution_policy.h>
 #include <cuda_runtime.h>
-
 #include <math.h>
+
+#include "CudaMatrix.h"
+#include "buildMatrix.cuh"
+#include "Sparse.h"
+
+
 namespace  {
     const int THREADS_PER_BLOCK = 256;
 }
@@ -12,59 +20,69 @@ namespace  {
 
 __device__ float getPixelLengthKernel(float* aStartPt, float*aEndPt, float* aRes, int aPathLenDisc)
 {
-    float3 temp;
-    temp.x = aEndPt[0]- aStartPt[0];
-    temp.y = aEndPt[1]- aStartPt[1];
-    temp.z = aEndPt[2]- aStartPt[2];
-    temp.x *= aRes[0];
-    temp.y *= aRes[1];
-    temp.z *= aRes[2];
-    temp.x *= temp.x;
-    temp.y *= temp.y;
-    temp.z *= temp.x;
-    return sqrtf(temp.x+temp.y+temp.z)/aPathLenDisc;
+    float temp1;
+    float temp2;
+    float temp3;
+
+    temp1 = aEndPt[0]- aStartPt[0];
+    temp2 = aEndPt[1]- aStartPt[1];
+    temp3 = aEndPt[2]- aStartPt[2];
+    temp1 *= aRes[0];
+    temp2 *= aRes[1];
+    temp3 *= aRes[2];
+    temp1 = temp1*temp1;
+    temp2 = temp2*temp2;
+    temp3 = temp3*temp3;
+    return (sqrtf(temp3+temp2+temp1))/(float)aPathLenDisc;
 }
 
 __global__ struct b3dStruct{
-    float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
+    int max, f, pathlen;
+    float  x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x, y, z, weight;
 };
-__device__ float * getRayBLen(float *startPoint, float *endPoint, struct b3dStruct* aOut) {
-       
-    aOut->x = startPoint[0];
-    aOut->y = startPoint[1];
-    aOut->z = startPoint[2];
+__device__ void getRayBLen(float *startPoint, float *endPoint, b3dStruct& aOut) {  
+    aOut.x = startPoint[0];
+    aOut.y = startPoint[1];
+    aOut.z = startPoint[2];
     
-    float dx = endPoint[0] - aOut->x;
-    float dy = endPoint[1] - aOut->y;
-    float dz = endPoint[2] - aOut->z;
+    float dx = endPoint[0] - aOut.x;
+    float dy = endPoint[1] - aOut.y;
+    float dz = endPoint[2] - aOut.z;
     
-    aOut->x_inc = (dx < 0) ? -1 : 1;
+    aOut.x_inc = (dx < 0) ? -1 : 1;
     float l = abs(dx);
-    aOut->y_inc = (dy < 0) ? -1 : 1;
+    aOut.y_inc = (dy < 0) ? -1 : 1;
     float m = abs(dy);
-    aOut->z_inc = (dz < 0) ? -1 : 1;
+    aOut.z_inc = (dz < 0) ? -1 : 1;
     float n = abs(dz);
-    aOut->dx2 = l*2;// << 1;
-    aOut->dy2 = m*2;// << 1;
-    aOut->dz2 = n*2;// << 1;
+    aOut.dx2 = l*2;// << 1;
+    aOut.dy2 = m*2;// << 1;
+    aOut.dz2 = n*2;// << 1;
+    float vmax ;
     if ((l >= m) && (l >= n)){
-        aOut->max = l;
-        aOut->d = aOut->dx2;
-        aOut->dx2 = 0;
+        aOut.max =ceilf(l);
+        vmax = l;
+        aOut.d = aOut.dx2;
+        aOut.dx2 = 0;
+        aOut.f = 0;
     }
     else if((m >= l) && (m >= n)){
-        aOut->max = m;
-        aOut->d = aOut->dy2;
-        aOut->dy2 = 0;
+        aOut.max = ceilf(m);
+        vmax = m;
+        aOut.d = aOut.dy2;
+        aOut.dy2 = 0;
+        aOut.f = 1;
     }
     else{
-        aOut->max = n;
-        aOut->d = aOut->dz2;
-        aOut->dz2 = 0;
+        aOut.max = ceilf(n);
+        vmax = n;
+        aOut.d = aOut.dz2;
+        aOut.dz2 = 0;
+        aOut.f = 2;
     }
-    aOut->err_0 = aOut->dx2==0?0:(aOut->dx2 - aOut->max);
-    aOut->err_1 = aOut->dy2==0?0:(aOut->dy2 - aOut->max);
-    aOut->err_2 = aOut->dz2==0?0:(aOut->dz2 - aOut->max);
+    aOut.err_0 = aOut.f==0?0:(aOut.dx2 - vmax);
+    aOut.err_1 = aOut.f==1?0:(aOut.dy2 - vmax);
+    aOut.err_2 = aOut.f==2?0:(aOut.dz2 - vmax);
 }
 
 /**
@@ -77,51 +95,60 @@ __device__ float * getRayBLen(float *startPoint, float *endPoint, struct b3dStru
  * @param aOuts 
  * @return _
  */
-__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, int aSize,float* aRes, b3dStruct* aOuts ){
+__global__ void calcLenWeightKernel(float* aStratPt, float* aEndPt, float* aRes,int aSize, b3dStruct* aOuts ){
     unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
     if (idx > aSize)return;
-    getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts+idx);
-    aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes + idx*3,aOuts[idx].max);
+    getRayBLen(aStratPt + idx*3, aEndPt + idx*3, aOuts[idx]);
+
+    aOuts[idx].weight = getPixelLengthKernel(aStratPt + idx*3, aEndPt + idx*3, aRes,aOuts[idx].max);
 };
 
-__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,float* aPath, int aSize){
+__global__ void calcPathOffsetKernel(b3dStruct* aStructs ,unsigned int* aPath, int aSize){
     aPath[0] = aStructs[0].max;
     for (int i=1; i<aSize; i++) {
-        aPath[i] =aStructs[i].max+aPath[i]-1; 
+        aPath[i] =aStructs[i].max+aPath[i-1]; 
     }
 }
 
-__global__ void convertToLinearIndices(const float3& aDims, const float3& aPath, float* aOut){
+__device__ float convertToLinearIndices(const float3& aDims, const float3& aPath){
     float matrixSize_1 = aDims.x;
     float matrixSize_2 = aDims.y;
     float coord_col1 = aPath.x;
     float coord_col2Sub1 = aPath.y - 1;
     float coord_col3Sub1 = aPath.z - 1;
-    *aOut = coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 -1;
+    return coord_col1 + coord_col2Sub1 * matrixSize_1 + coord_col3Sub1 * matrixSize_1 * matrixSize_2 - 1 ;
 }
 
-__global__ void calcRayBLPath(struct b3dStruct* aInstruct,
-            float* dims, unsigned int * aOffset, float* aIOut,
-            float* aJOut, float* aSOut, unsigned int aSize) { 
-    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
+__global__ void calcRayBLPathKernel(struct b3dStruct* aInstruct,
+            float* dims, unsigned int * aOffset,
+            float* aJOut, unsigned int aSize) { 
+    unsigned int idx = blockIdx.x;
     if (idx > aSize)return;
-        b3dStruct* in = aInstruct+idx;
-float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
+    b3dStruct* in = aInstruct+idx;
+    int f;
+    float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2, d, x,y,z;
     x = in->x;
     y = in->y;
     z = in->z;
     x_inc = in->x_inc;
     y_inc = in->y_inc;
     z_inc = in->z_inc;
+
+    dx2 = in->dx2;
+    dy2 = in->dy2;
+    dz2 = in->dz2;
+    d = in->d;
+
     err_0 = in->err_0;
     err_1 = in->err_1;
     err_2 = in->err_2;
     max = in->max;
+    f = in->f;
 
     float3 dims3{dims[0],dims[1],dims[2]};
-    float* output_ptr = aOutput + aOffset[idx];
+    float* output_ptr = aJOut + (idx ==0?0:aOffset[idx-1]);
     for (float i = 0; i < max; i++) {
-        convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)}, output_ptr);
+        output_ptr[0]=convertToLinearIndices(dims3, {roundf(x),roundf(y),roundf(z)});
         output_ptr ++;
         if (err_0 > 0) {
           x += x_inc;
@@ -138,32 +165,54 @@ float max, x_inc, y_inc, z_inc,err_0, err_1, err_2, dx2, dy2, dz2,d, x,y,z;
         err_0 += dx2;
         err_1 += dy2;
         err_2 += dz2;
-        if (err_0 == 0)x += x_inc;
-        if (err_1 == 0)y += y_inc;
-        if (err_2 == 0)z += z_inc;
+        if (f == 0) x += x_inc;
+        if (f == 1) y += y_inc;
+        if (f == 2) z += z_inc;
     }
 }
 
-void buildMatrix(const Aurora::CudaMatrix &senderList,
+__global__ void fillIAndSKernel(struct b3dStruct* aInstructs,unsigned int * aOffset,float* aIOut,float* aSOut){
+    b3dStruct& s = aInstructs[blockIdx.x];
+    __shared__ unsigned int beginIdx; 
+    __shared__ unsigned int length;
+    __shared__ float weight; 
+
+    if (threadIdx.x == 0){
+        beginIdx = (blockIdx.x == 0) ? 0 : aOffset[blockIdx.x - 1];
+        length = s.max;
+        weight = s.weight;
+    }
+    __syncthreads();
+
+    for (int i = 0; threadIdx.x + i < length; i+=blockDim.x) {
+        aIOut[beginIdx + threadIdx.x + i] = blockIdx.x; 
+        aSOut[beginIdx + threadIdx.x + i] = weight; 
+    }
+}
+
+Recon::BuildMatrixResult Recon::buildMatrix(const Aurora::CudaMatrix &senderList,
                                   const Aurora::CudaMatrix &receiverList,
                                   Aurora::CudaMatrix& res,
                                   const Aurora::CudaMatrix &dims, bool BENT,
                                   const Aurora::CudaMatrix &potentialMap){
+    Recon::BuildMatrixResult result;
     unsigned int numDims = senderList.getDimSize(0);
     unsigned int nTotalRays = senderList.getDimSize(1);
-    unsigned int cnt = 0;
-    unsigned int coeffIndex = 0;
-    unsigned int rayCount = 0; 
     b3dStruct* structList= nullptr;
     cudaMalloc((void**)&structList, sizeof(b3dStruct)*nTotalRays);
     int blocksPerGrid = (nTotalRays + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     calcLenWeightKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(
-        senderList.getData(), receiverList.getData(), res.getData(),structList);
+        senderList.getData(), receiverList.getData(), res.getData(),nTotalRays,structList);
+    cudaDeviceSynchronize();
+
     unsigned int* offset= nullptr;
     cudaMalloc((void**)&offset, sizeof(unsigned int)*nTotalRays);
     calcPathOffsetKernel<<<1,1>>>(structList,offset,nTotalRays);
+    cudaDeviceSynchronize();
+
     unsigned int size;
-    cudaMemcpy(&size, offset+(nTotalRays-1), 1, cudaMemcpyDeviceToHost);
+    cudaMemcpy(&size, offset+(nTotalRays-1), sizeof(unsigned int), cudaMemcpyDeviceToHost);
+
     float* iData = nullptr;
     float* jData = nullptr;
     float* sData = nullptr;
@@ -172,9 +221,17 @@ void buildMatrix(const Aurora::CudaMatrix &senderList,
     cudaMalloc((void**)&jData, sizeof(float)*size);
     cudaMalloc((void**)&sData, sizeof(float)*size);
 
-    calcRayBLPath(structList, dims.getData(), offset, jData);
+    calcRayBLPathKernel<<<nTotalRays,1>>>(structList, dims.getData(), offset, jData, nTotalRays);
+    fillIAndSKernel<<<nTotalRays,THREADS_PER_BLOCK>>>(structList, offset, iData, sData);
+    cudaDeviceSynchronize();
     cudaFree(structList);
 
+    auto mI = Aurora::CudaMatrix::fromRawData(iData, 1, size ,1);
+    auto mJ = Aurora::CudaMatrix::fromRawData(jData, 1, size,1);
+    auto mS = Aurora::CudaMatrix::fromRawData(sData, 1, size,1);
+    result.M = Aurora::Sparse(mI.toHostMatrix(), mJ.toHostMatrix(),mS.toHostMatrix(),nTotalRays, Aurora::prod(dims).getValue(0));
+    return result;   
+
 }
 
 
diff --git a/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh
new file mode 100644
index 0000000..61a2c06
--- /dev/null
+++ b/src/transmissionReconstruction/reconstruction/buildMatrix/buildMatrix.cuh
@@ -0,0 +1,12 @@
+#ifndef __BUILDMATRIX_CUH__
+#define __BUILDMATRIX_CUH__
+#include "buildMatrix.h"
+namespace Recon {
+    
+BuildMatrixResult buildMatrix(const Aurora::CudaMatrix &senderList,
+                                  const Aurora::CudaMatrix &receiverList,
+                                  Aurora::CudaMatrix& res,
+                                  const Aurora::CudaMatrix &dims, bool BENT,
+                                  const Aurora::CudaMatrix &potentialMap);
+}
+#endif // __BUILDMATRIX_H__
\ No newline at end of file