URDepends/eikonalGPU/src/StructuredEikonal.cu

#include "StructuredEikonal.h"
#include "omp.h"

StructuredEikonal::StructuredEikonal(bool verbose) 
:verbose_(verbose), isCudaMemCreated_(false),
width_(256), height_(256), depth_(256),
itersPerBlock_(10), solverType_(0) {
cudaGetDevice(&gpuDevice_);
}

StructuredEikonal::~StructuredEikonal() {}

void StructuredEikonal::writeNRRD(std::string filename) {
  std::fstream out(filename.c_str(), std::ios::out | std::ios::binary);
  out << "NRRD0001\n";
  out << "# Complete NRRD file format specification at:\n";
  out << "# http://teem.sourceforge.net/nrrd/format.html\n";
  out << "type: double\n";
  out << "dimension: 3\n";
  out << "sizes: " << this->width_ << " " << this->height_ << " " << this->depth_ << "\n";
  out << "endian: little\n";
  out << "encoding: raw\n\n";
  for(size_t k = 0; k < this->depth_; k++) {
    for(size_t j = 0; j < this->height_; j++) {
      for(size_t i = 0; i < this->width_; i++) {
        double d = this->answer_[i][j][k];
        out.write(reinterpret_cast<const char*>(&d),sizeof(double));
      }
    }
  }
  out.close();
}

void StructuredEikonal::setDims(size_t x, size_t y, size_t z) {
  this->width_ = x;
  this->height_ = y;
  this->depth_ = z;
}

void StructuredEikonal::error(char* msg) {
  printf("%s\n",msg);
  assert(false);
  exit(0);
}

void StructuredEikonal::CheckCUDAMemory() {
  size_t freeMem, totalMem;
  cudaMemGetInfo(&freeMem, &totalMem);
  if (this->verbose_) {
    std::cout << "Total Memory : " << totalMem / (1024 * 1024) << "MB" << std::endl;
    std::cout << "Free Memory  : " << freeMem / (1024 * 1024) << "MB" << std::endl;
    std::cout << "--" << std::endl;
  }
}

void StructuredEikonal::init_cuda_mem() {
  assert(this->width_ > 0 && this->height_ > 0 && this->depth_ > 0);
  if(this->width_ <= 0 || this->height_ <= 0 || this->depth_ <= 0){ 
    printf("Volume dimension cannot be zero");
    exit(1);
  }

  this->CheckCUDAMemory();

  // 1. Create /initialize GPU memory
  size_t nx, ny, nz;

  nx = this->width_ + (BLOCK_LENGTH-this->width_%BLOCK_LENGTH)%BLOCK_LENGTH;
  ny = this->height_ + (BLOCK_LENGTH-this->height_%BLOCK_LENGTH)%BLOCK_LENGTH;
  nz = this->depth_ + (BLOCK_LENGTH-this->depth_%BLOCK_LENGTH)%BLOCK_LENGTH;
  if (this->verbose_) {
    printf("%d %d %d \n",nx,ny,nz);
    printf("BLOCK_LENGTH = %d\n",BLOCK_LENGTH);
    printf("width/height/depth = %d %d %d \n",this->width_,this->height_,this->depth_);
  }

  auto volSize = nx*ny*nz;
  auto blkSize = BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH;

  auto nBlkX = nx / BLOCK_LENGTH;
  auto nBlkY = ny / BLOCK_LENGTH;
  auto nBlkZ = nz / BLOCK_LENGTH;
  auto blockNum = nBlkX*nBlkY*nBlkZ;

  this->memoryStruct_.xdim = static_cast<int>(nx);
  this->memoryStruct_.ydim = static_cast<int>(ny);
  this->memoryStruct_.zdim = static_cast<int>(nz);
  this->memoryStruct_.volsize = static_cast<uint>(volSize);
  this->memoryStruct_.blksize = static_cast<uint>(blkSize);
  this->memoryStruct_.blklength = BLOCK_LENGTH;
  this->memoryStruct_.blknum = static_cast<uint>(blockNum);
  this->memoryStruct_.nIter = static_cast<int>(this->itersPerBlock_); // iter per block

  if(this->isCudaMemCreated_) // delete previous memory
  {
    free((DOUBLE*)this->memoryStruct_.h_sol);
    free((uint*)this->memoryStruct_.h_list);
    free((bool*)this->memoryStruct_.h_listed);
    free((bool*)this->memoryStruct_.h_listVol);
    free((int*)this->memoryStruct_.blockOrder);
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) );  // temp solution for ping-pong
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) );  // convergence volume
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );
  }
  this->isCudaMemCreated_ = true;

  this->memoryStruct_.h_sol = (DOUBLE*) malloc(volSize*sizeof(DOUBLE)); // initial solution
  this->memoryStruct_.h_list = (uint*) malloc(blockNum*sizeof(uint)); // linear list contains active block indices
  this->memoryStruct_.h_listed = (bool*) malloc(blockNum*sizeof(bool));  // whether block is added to the list
  this->memoryStruct_.h_listVol = (bool*) malloc(blockNum*sizeof(bool)); // volume list shows active/nonactive of corresponding block
  this->memoryStruct_.blockOrder = (int*) malloc(blockNum*sizeof(int));

  this->CheckCUDAMemory();

  //
  // create host/device memory using CUDA mem functions
  //
  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_spd), volSize*sizeof(double)) );
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_sol), volSize*sizeof(DOUBLE)) );
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.t_sol), volSize*sizeof(DOUBLE)) );  // temp solution for ping-pong
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_con), volSize*sizeof(bool))  );  // convergence volume
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_list), blockNum*sizeof(uint)) );
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_listVol), blockNum*sizeof(bool)) );
  this->CheckCUDAMemory();

  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_mask), volSize*sizeof(bool)) );
  this->CheckCUDAMemory();
}


void StructuredEikonal::freeMemory() {
    free((DOUBLE*)this->memoryStruct_.h_sol);
    free((uint*)this->memoryStruct_.h_list);
    free((bool*)this->memoryStruct_.h_listed);
    free((bool*)this->memoryStruct_.h_listVol);
    free((int*)this->memoryStruct_.blockOrder);
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) );  // temp solution for ping-pong
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) );  // convergence volume
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );
    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );
}

void StructuredEikonal::set_attribute_mask() {
  uint volSize = this->memoryStruct_.volsize;

  int nx, ny, nz, blklength;

  nx = memoryStruct_.xdim;
  ny = memoryStruct_.ydim;
  nz = memoryStruct_.zdim;
  blklength = memoryStruct_.blklength;

  // create host memory
  double *h_spd  = new double[volSize]; // byte speed, host
  bool  *h_mask = new bool[volSize];

  // copy input volume to host memory
  // make each block to be stored contiguously in 1D memory space
  uint idx = 0;
  for(int zStr = 0; zStr < nz; zStr += blklength) {
    for(int yStr = 0; yStr < ny; yStr += blklength) {
      for(int xStr = 0; xStr < nx; xStr += blklength) {
        // for each block
        for(int z=zStr; z<zStr+blklength; z++) {
          for(int y=yStr; y<yStr+blklength; y++) {
            for(int x=xStr; x<xStr+blklength; x++) {
              if(x<width_ && y<height_ && z<depth_) {
                h_spd[idx] = this->speeds_[x][y][z];
                h_mask[idx] = true;
              } else {
                h_spd[idx] = speeds_[0][0][0];
                h_mask[idx] = false;
              }
              idx++;
            }
          }
        }
      }
    }
  }

  // initialize GPU memory with host memory
  CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_spd, h_spd, volSize*sizeof(double), cudaMemcpyHostToDevice) );
  CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_mask, h_mask, volSize*sizeof(bool), cudaMemcpyHostToDevice) );

  delete[] h_spd;
  delete[] h_mask;
}

void StructuredEikonal::initialization() {
  // get / set CUDA device
  int deviceCount;
  cudaGetDeviceCount(&deviceCount);
  int device;
  for(device = 0; device < deviceCount; ++device) {
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);
  }
  
  device = this->gpuDevice_;
  cudaSetDevice(device);

  this->CheckCUDAMemory();
  this->init_cuda_mem();
  this->set_attribute_mask();
  this->CheckCUDAMemory();
}

void StructuredEikonal::map_generator() {
  double pi = 3.141592653589793238462643383;
  this->speeds_ = std::vector<std::vector<std::vector<double> > >(
    this->width_, std::vector<std::vector<double> >(
    this->height_, std::vector<double>(this->depth_,1.)));
  switch(this->solverType_){
  case 0 :
    //Constant Speed Map
    break;
  case 1 :
    //Sinusoid Speed Map
    for (int k = 0 ; k < this->depth_ ; ++k) {
      for (int j = 0 ; j < this->height_; ++j) {
        for ( int i = 0 ; i < this->width_ ; ++i) {
          this->speeds_[i][j][k] =
            (6 + 5*(sin((i*pi)/this->width_ *2))*
            sin((j*pi)/this->height_*2)*
            sin((k*pi)/this->depth_*2));
        }
      }
    }
    break;
  }
}

void StructuredEikonal::setGpuDevice(int device){
    this->gpuDevice_=device;
}


void StructuredEikonal::setSeeds(std::vector<std::array<size_t, 3> > seeds) {
  this->seeds_ = seeds;
}

void StructuredEikonal::setSpeeds(std::vector<std::vector<std::vector<double> > > speeds) {
  this->speeds_ = speeds;
}


void StructuredEikonal::useSeeds() {
  if (this->verbose_) {
    std::cout << "Loading seed volume..." << std::endl;
  }
  uint volSize, blockNum;
  int nx, ny, nz, blklength;

  nx = this->memoryStruct_.xdim;
  ny = this->memoryStruct_.ydim;
  nz = this->memoryStruct_.zdim;
  volSize = this->memoryStruct_.volsize;
  blklength = this->memoryStruct_.blklength;
  blockNum = this->memoryStruct_.blknum;

  // copy input volume to host memory
  // make each block to be stored contiguously in 1D memory space
  uint idx = 0;
  uint blk_idx = 0;
  uint list_idx = 0;
  uint nActiveBlock = 0;
 //#pragma omp parallel num_threads(8) 
 //{
 //#pragma omp for
         
         
  //printf("vol size = %i\n",volSize);       
         
  for(int zStr = 0; zStr < nz; zStr += blklength) {
    for(int yStr = 0; yStr < ny; yStr += blklength) {
      for(int xStr = 0; xStr < nx; xStr += blklength) {
        // for each block
        bool isSeedBlock = false;

        for(int z=zStr; z<zStr+blklength; z++) {
          for(int y=yStr; y<yStr+blklength; y++) {
            for(int x=xStr; x<xStr+blklength; x++) {
                //printf("idx = %i\n",idx); 

              this->memoryStruct_.h_sol[idx] = INF;

              if (this->seeds_.empty()) {
                if (x == nx/2 && y == ny/2 && z == nz/2) {
                  this->memoryStruct_.h_sol[idx] = 0;
                  isSeedBlock = true;
                  if (this->verbose_) {
                    printf("%d is Selected bt source (if) \n",idx);
                  }
                }
              } else {
                for(size_t i = 0; i < this->seeds_.size(); i++) {
                  if (this->seeds_[i][0] == x && 
                    this->seeds_[i][1] == y && 
                    this->seeds_[i][2] == z) {
                    this->memoryStruct_.h_sol[idx] = 0;
                    isSeedBlock = true;
                    if (this->verbose_) {
                      printf("%d is Selected bt source (else) \n",idx);
                    }
                  }
                }
              }
              idx++;
            }
          }
        }
        ///////////////////////////////////////////////
        if(isSeedBlock) {
          if (this->verbose_) {
            printf("%d,%d,%d is Seed Block \n",zStr,yStr,xStr);
          }
          this->memoryStruct_.h_listVol[blk_idx] = true;
          this->memoryStruct_.h_listed[blk_idx] = true;
          this->memoryStruct_.h_list[list_idx] = blk_idx;
          list_idx++;
          nActiveBlock++;
        } else {
          this->memoryStruct_.h_listVol[blk_idx] = false;
          this->memoryStruct_.h_listed[blk_idx] = false;
        }
        blk_idx++;
      }
    }
  }
 //}
  this->memoryStruct_.nActiveBlock = nActiveBlock;
  // initialize GPU memory with host memory
  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.t_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_list, this->memoryStruct_.h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );
  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_listVol, this->memoryStruct_.h_listVol, blockNum*sizeof(bool), cudaMemcpyHostToDevice) );
  // initialize GPU memory with constant value
  CUDA_SAFE_CALL( cudaMemset(this->memoryStruct_.d_con, 1, volSize*sizeof(bool)) );
}

void StructuredEikonal::setMapType(size_t t) {
    this->solverType_ = t;
  }

void StructuredEikonal::solveEikonal() {
  if (this->speeds_.empty()) {
    this->map_generator();
  }
  this->isCudaMemCreated_ = false;
  this->initialization();
  this->useSeeds();
  runEikonalSolverSimple(this->memoryStruct_, this->verbose_);
  this->get_solution();
  this->freeMemory();
}

std::vector< std::vector< std::vector<double> > > 
  StructuredEikonal::getFinalResult() {
    return this->answer_;
  }

void StructuredEikonal::get_solution() {
  // copy solution from GPU
  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.h_sol,
    this->memoryStruct_.d_sol, this->memoryStruct_.volsize*sizeof(DOUBLE), 
    cudaMemcpyDeviceToHost) );
  //put the data where it belongs in the grand scheme of data!
  this->answer_ = std::vector<std::vector<std::vector<double> > >(
    this->width_, std::vector<std::vector<double> >( 
    this->height_, std::vector<double>(this->depth_,0)));
 
 //#pragma omp parallel num_threads(8) 
 //{
 //#pragma omp for
 for(size_t blockID = 0; blockID < this->memoryStruct_.blknum; blockID++) {
    size_t baseAddr = blockID * this->memoryStruct_.blksize;
		size_t xgridlength = this->memoryStruct_.xdim/BLOCK_LENGTH;
		size_t ygridlength = this->memoryStruct_.ydim/BLOCK_LENGTH;
		// compute block index
		size_t bx = blockID%xgridlength;
		size_t tmpIdx = (blockID - bx)/xgridlength;
		size_t by = tmpIdx%ygridlength;
		size_t bz = (tmpIdx-by)/ygridlength;
    //translate back to real space
    for(int k = 0; k < BLOCK_LENGTH; k++) {
      for(int j = 0; j < BLOCK_LENGTH; j++) {
        for(int i = 0; i < BLOCK_LENGTH; i++) {
          double d = this->memoryStruct_.h_sol[baseAddr + 
            k * BLOCK_LENGTH * BLOCK_LENGTH + 
            j * BLOCK_LENGTH + i];
          if ((i + bx * BLOCK_LENGTH) < this->width_ && 
            (j + by * BLOCK_LENGTH) < this->height_ &&
            (k + bz * BLOCK_LENGTH) < this->depth_) {
            this->answer_[(i + bx * BLOCK_LENGTH)][(j +
              by * BLOCK_LENGTH)][k + bz * BLOCK_LENGTH] = d;
          }
        }
      }
    }
  }
 //}
}

void StructuredEikonal::setItersPerBlock(size_t t) {
  this->itersPerBlock_ = t;
}
Commit source 2023-05-18 16:04:27 +08:00			`#include "StructuredEikonal.h"`
			`#include "omp.h"`

			`StructuredEikonal::StructuredEikonal(bool verbose)`
			`:verbose_(verbose), isCudaMemCreated_(false),`
			`width_(256), height_(256), depth_(256),`
			`itersPerBlock_(10), solverType_(0) {`
			`cudaGetDevice(&gpuDevice_);`
			`}`

			`StructuredEikonal::~StructuredEikonal() {}`

			`void StructuredEikonal::writeNRRD(std::string filename) {`
			`std::fstream out(filename.c_str(), std::ios::out \| std::ios::binary);`
			`out << "NRRD0001\n";`
			`out << "# Complete NRRD file format specification at:\n";`
			`out << "# http://teem.sourceforge.net/nrrd/format.html\n";`
			`out << "type: double\n";`
			`out << "dimension: 3\n";`
			`out << "sizes: " << this->width_ << " " << this->height_ << " " << this->depth_ << "\n";`
			`out << "endian: little\n";`
			`out << "encoding: raw\n\n";`
			`for(size_t k = 0; k < this->depth_; k++) {`
			`for(size_t j = 0; j < this->height_; j++) {`
			`for(size_t i = 0; i < this->width_; i++) {`
			`double d = this->answer_[i][j][k];`
			`out.write(reinterpret_cast<const char*>(&d),sizeof(double));`
			`}`
			`}`
			`}`
			`out.close();`
			`}`

			`void StructuredEikonal::setDims(size_t x, size_t y, size_t z) {`
			`this->width_ = x;`
			`this->height_ = y;`
			`this->depth_ = z;`
			`}`

			`void StructuredEikonal::error(char* msg) {`
			`printf("%s\n",msg);`
			`assert(false);`
			`exit(0);`
			`}`

			`void StructuredEikonal::CheckCUDAMemory() {`
			`size_t freeMem, totalMem;`
			`cudaMemGetInfo(&freeMem, &totalMem);`
			`if (this->verbose_) {`
			`std::cout << "Total Memory : " << totalMem / (1024 * 1024) << "MB" << std::endl;`
			`std::cout << "Free Memory : " << freeMem / (1024 * 1024) << "MB" << std::endl;`
			`std::cout << "--" << std::endl;`
			`}`
			`}`

			`void StructuredEikonal::init_cuda_mem() {`
			`assert(this->width_ > 0 && this->height_ > 0 && this->depth_ > 0);`
			`if(this->width_ <= 0 \|\| this->height_ <= 0 \|\| this->depth_ <= 0){`
			`printf("Volume dimension cannot be zero");`
			`exit(1);`
			`}`

			`this->CheckCUDAMemory();`

			`// 1. Create /initialize GPU memory`
			`size_t nx, ny, nz;`

			`nx = this->width_ + (BLOCK_LENGTH-this->width_%BLOCK_LENGTH)%BLOCK_LENGTH;`
			`ny = this->height_ + (BLOCK_LENGTH-this->height_%BLOCK_LENGTH)%BLOCK_LENGTH;`
			`nz = this->depth_ + (BLOCK_LENGTH-this->depth_%BLOCK_LENGTH)%BLOCK_LENGTH;`
			`if (this->verbose_) {`
			`printf("%d %d %d \n",nx,ny,nz);`
			`printf("BLOCK_LENGTH = %d\n",BLOCK_LENGTH);`
			`printf("width/height/depth = %d %d %d \n",this->width_,this->height_,this->depth_);`
			`}`

			`auto volSize = nxnynz;`
			`auto blkSize = BLOCK_LENGTHBLOCK_LENGTHBLOCK_LENGTH;`

			`auto nBlkX = nx / BLOCK_LENGTH;`
			`auto nBlkY = ny / BLOCK_LENGTH;`
			`auto nBlkZ = nz / BLOCK_LENGTH;`
			`auto blockNum = nBlkXnBlkYnBlkZ;`

			`this->memoryStruct_.xdim = static_cast<int>(nx);`
			`this->memoryStruct_.ydim = static_cast<int>(ny);`
			`this->memoryStruct_.zdim = static_cast<int>(nz);`
			`this->memoryStruct_.volsize = static_cast<uint>(volSize);`
			`this->memoryStruct_.blksize = static_cast<uint>(blkSize);`
			`this->memoryStruct_.blklength = BLOCK_LENGTH;`
			`this->memoryStruct_.blknum = static_cast<uint>(blockNum);`
			`this->memoryStruct_.nIter = static_cast<int>(this->itersPerBlock_); // iter per block`

			`if(this->isCudaMemCreated_) // delete previous memory`
			`{`
			`free((DOUBLE*)this->memoryStruct_.h_sol);`
			`free((uint*)this->memoryStruct_.h_list);`
			`free((bool*)this->memoryStruct_.h_listed);`
			`free((bool*)this->memoryStruct_.h_listVol);`
			`free((int*)this->memoryStruct_.blockOrder);`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) ); // temp solution for ping-pong`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) ); // convergence volume`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );`
			`}`
			`this->isCudaMemCreated_ = true;`

			`this->memoryStruct_.h_sol = (DOUBLE) malloc(volSizesizeof(DOUBLE)); // initial solution`
			`this->memoryStruct_.h_list = (uint) malloc(blockNumsizeof(uint)); // linear list contains active block indices`
			`this->memoryStruct_.h_listed = (bool) malloc(blockNumsizeof(bool)); // whether block is added to the list`
			`this->memoryStruct_.h_listVol = (bool) malloc(blockNumsizeof(bool)); // volume list shows active/nonactive of corresponding block`
			`this->memoryStruct_.blockOrder = (int) malloc(blockNumsizeof(int));`

			`this->CheckCUDAMemory();`

			`//`
			`// create host/device memory using CUDA mem functions`
			`//`
			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_spd), volSizesizeof(double)) );`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_sol), volSizesizeof(DOUBLE)) );`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.t_sol), volSizesizeof(DOUBLE)) ); // temp solution for ping-pong`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_con), volSizesizeof(bool)) ); // convergence volume`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_list), blockNumsizeof(uint)) );`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_listVol), blockNumsizeof(bool)) );`
			`this->CheckCUDAMemory();`

			`CUDA_SAFE_CALL( cudaMalloc((void*)&(this->memoryStruct_.d_mask), volSizesizeof(bool)) );`
			`this->CheckCUDAMemory();`
			`}`


			`void StructuredEikonal::freeMemory() {`
			`free((DOUBLE*)this->memoryStruct_.h_sol);`
			`free((uint*)this->memoryStruct_.h_list);`
			`free((bool*)this->memoryStruct_.h_listed);`
			`free((bool*)this->memoryStruct_.h_listVol);`
			`free((int*)this->memoryStruct_.blockOrder);`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) ); // temp solution for ping-pong`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) ); // convergence volume`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );`
			`CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );`
			`}`

			`void StructuredEikonal::set_attribute_mask() {`
			`uint volSize = this->memoryStruct_.volsize;`

			`int nx, ny, nz, blklength;`

			`nx = memoryStruct_.xdim;`
			`ny = memoryStruct_.ydim;`
			`nz = memoryStruct_.zdim;`
			`blklength = memoryStruct_.blklength;`

			`// create host memory`
			`double *h_spd = new double[volSize]; // byte speed, host`
			`bool *h_mask = new bool[volSize];`

			`// copy input volume to host memory`
			`// make each block to be stored contiguously in 1D memory space`
			`uint idx = 0;`
			`for(int zStr = 0; zStr < nz; zStr += blklength) {`
			`for(int yStr = 0; yStr < ny; yStr += blklength) {`
			`for(int xStr = 0; xStr < nx; xStr += blklength) {`
			`// for each block`
			`for(int z=zStr; z<zStr+blklength; z++) {`
			`for(int y=yStr; y<yStr+blklength; y++) {`
			`for(int x=xStr; x<xStr+blklength; x++) {`
			`if(x<width_ && y<height_ && z<depth_) {`
			`h_spd[idx] = this->speeds_[x][y][z];`
			`h_mask[idx] = true;`
			`} else {`
			`h_spd[idx] = speeds_[0][0][0];`
			`h_mask[idx] = false;`
			`}`
			`idx++;`
			`}`
			`}`
			`}`
			`}`
			`}`
			`}`

			`// initialize GPU memory with host memory`
			`CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_spd, h_spd, volSize*sizeof(double), cudaMemcpyHostToDevice) );`
			`CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_mask, h_mask, volSize*sizeof(bool), cudaMemcpyHostToDevice) );`

			`delete[] h_spd;`
			`delete[] h_mask;`
			`}`

			`void StructuredEikonal::initialization() {`
			`// get / set CUDA device`
			`int deviceCount;`
			`cudaGetDeviceCount(&deviceCount);`
			`int device;`
			`for(device = 0; device < deviceCount; ++device) {`
			`cudaDeviceProp deviceProp;`
			`cudaGetDeviceProperties(&deviceProp, device);`
			`}`

			`device = this->gpuDevice_;`
			`cudaSetDevice(device);`

			`this->CheckCUDAMemory();`
			`this->init_cuda_mem();`
			`this->set_attribute_mask();`
			`this->CheckCUDAMemory();`
			`}`

			`void StructuredEikonal::map_generator() {`
			`double pi = 3.141592653589793238462643383;`
			`this->speeds_ = std::vector<std::vector<std::vector<double> > >(`
			`this->width_, std::vector<std::vector<double> >(`
			`this->height_, std::vector<double>(this->depth_,1.)));`
			`switch(this->solverType_){`
			`case 0 :`
			`//Constant Speed Map`
			`break;`
			`case 1 :`
			`//Sinusoid Speed Map`
			`for (int k = 0 ; k < this->depth_ ; ++k) {`
			`for (int j = 0 ; j < this->height_; ++j) {`
			`for ( int i = 0 ; i < this->width_ ; ++i) {`
			`this->speeds_[i][j][k] =`
			`(6 + 5(sin((ipi)/this->width_ 2))`
			`sin((jpi)/this->height_2)*`
			`sin((kpi)/this->depth_2));`
			`}`
			`}`
			`}`
			`break;`
			`}`
			`}`

			`void StructuredEikonal::setGpuDevice(int device){`
			`this->gpuDevice_=device;`
			`}`


			`void StructuredEikonal::setSeeds(std::vector<std::array<size_t, 3> > seeds) {`
			`this->seeds_ = seeds;`
			`}`

			`void StructuredEikonal::setSpeeds(std::vector<std::vector<std::vector<double> > > speeds) {`
			`this->speeds_ = speeds;`
			`}`


			`void StructuredEikonal::useSeeds() {`
			`if (this->verbose_) {`
			`std::cout << "Loading seed volume..." << std::endl;`
			`}`
			`uint volSize, blockNum;`
			`int nx, ny, nz, blklength;`

			`nx = this->memoryStruct_.xdim;`
			`ny = this->memoryStruct_.ydim;`
			`nz = this->memoryStruct_.zdim;`
			`volSize = this->memoryStruct_.volsize;`
			`blklength = this->memoryStruct_.blklength;`
			`blockNum = this->memoryStruct_.blknum;`

			`// copy input volume to host memory`
			`// make each block to be stored contiguously in 1D memory space`
			`uint idx = 0;`
			`uint blk_idx = 0;`
			`uint list_idx = 0;`
			`uint nActiveBlock = 0;`
			`//#pragma omp parallel num_threads(8)`
			`//{`
			`//#pragma omp for`


			`//printf("vol size = %i\n",volSize);`

			`for(int zStr = 0; zStr < nz; zStr += blklength) {`
			`for(int yStr = 0; yStr < ny; yStr += blklength) {`
			`for(int xStr = 0; xStr < nx; xStr += blklength) {`
			`// for each block`
			`bool isSeedBlock = false;`

			`for(int z=zStr; z<zStr+blklength; z++) {`
			`for(int y=yStr; y<yStr+blklength; y++) {`
			`for(int x=xStr; x<xStr+blklength; x++) {`
			`//printf("idx = %i\n",idx);`

			`this->memoryStruct_.h_sol[idx] = INF;`

			`if (this->seeds_.empty()) {`
			`if (x == nx/2 && y == ny/2 && z == nz/2) {`
			`this->memoryStruct_.h_sol[idx] = 0;`
			`isSeedBlock = true;`
			`if (this->verbose_) {`
			`printf("%d is Selected bt source (if) \n",idx);`
			`}`
			`}`
			`} else {`
			`for(size_t i = 0; i < this->seeds_.size(); i++) {`
			`if (this->seeds_[i][0] == x &&`
			`this->seeds_[i][1] == y &&`
			`this->seeds_[i][2] == z) {`
			`this->memoryStruct_.h_sol[idx] = 0;`
			`isSeedBlock = true;`
			`if (this->verbose_) {`
			`printf("%d is Selected bt source (else) \n",idx);`
			`}`
			`}`
			`}`
			`}`
			`idx++;`
			`}`
			`}`
			`}`
			`///////////////////////////////////////////////`
			`if(isSeedBlock) {`
			`if (this->verbose_) {`
			`printf("%d,%d,%d is Seed Block \n",zStr,yStr,xStr);`
			`}`
			`this->memoryStruct_.h_listVol[blk_idx] = true;`
			`this->memoryStruct_.h_listed[blk_idx] = true;`
			`this->memoryStruct_.h_list[list_idx] = blk_idx;`
			`list_idx++;`
			`nActiveBlock++;`
			`} else {`
			`this->memoryStruct_.h_listVol[blk_idx] = false;`
			`this->memoryStruct_.h_listed[blk_idx] = false;`
			`}`
			`blk_idx++;`
			`}`
			`}`
			`}`
			`//}`
			`this->memoryStruct_.nActiveBlock = nActiveBlock;`
			`// initialize GPU memory with host memory`
			`CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );`
			`CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.t_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );`
			`CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_list, this->memoryStruct_.h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );`
			`CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_listVol, this->memoryStruct_.h_listVol, blockNum*sizeof(bool), cudaMemcpyHostToDevice) );`
			`// initialize GPU memory with constant value`
			`CUDA_SAFE_CALL( cudaMemset(this->memoryStruct_.d_con, 1, volSize*sizeof(bool)) );`
			`}`

			`void StructuredEikonal::setMapType(size_t t) {`
			`this->solverType_ = t;`
			`}`

			`void StructuredEikonal::solveEikonal() {`
			`if (this->speeds_.empty()) {`
			`this->map_generator();`
			`}`
			`this->isCudaMemCreated_ = false;`
			`this->initialization();`
			`this->useSeeds();`
			`runEikonalSolverSimple(this->memoryStruct_, this->verbose_);`
			`this->get_solution();`
			`this->freeMemory();`
			`}`

			`std::vector< std::vector< std::vector<double> > >`
			`StructuredEikonal::getFinalResult() {`
			`return this->answer_;`
			`}`

			`void StructuredEikonal::get_solution() {`
			`// copy solution from GPU`
			`CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.h_sol,`
			`this->memoryStruct_.d_sol, this->memoryStruct_.volsize*sizeof(DOUBLE),`
			`cudaMemcpyDeviceToHost) );`
			`//put the data where it belongs in the grand scheme of data!`
			`this->answer_ = std::vector<std::vector<std::vector<double> > >(`
			`this->width_, std::vector<std::vector<double> >(`
			`this->height_, std::vector<double>(this->depth_,0)));`

			`//#pragma omp parallel num_threads(8)`
			`//{`
			`//#pragma omp for`
			`for(size_t blockID = 0; blockID < this->memoryStruct_.blknum; blockID++) {`
			`size_t baseAddr = blockID * this->memoryStruct_.blksize;`
			`size_t xgridlength = this->memoryStruct_.xdim/BLOCK_LENGTH;`
			`size_t ygridlength = this->memoryStruct_.ydim/BLOCK_LENGTH;`
			`// compute block index`
			`size_t bx = blockID%xgridlength;`
			`size_t tmpIdx = (blockID - bx)/xgridlength;`
			`size_t by = tmpIdx%ygridlength;`
			`size_t bz = (tmpIdx-by)/ygridlength;`
			`//translate back to real space`
			`for(int k = 0; k < BLOCK_LENGTH; k++) {`
			`for(int j = 0; j < BLOCK_LENGTH; j++) {`
			`for(int i = 0; i < BLOCK_LENGTH; i++) {`
			`double d = this->memoryStruct_.h_sol[baseAddr +`
			`k * BLOCK_LENGTH * BLOCK_LENGTH +`
			`j * BLOCK_LENGTH + i];`
			`if ((i + bx * BLOCK_LENGTH) < this->width_ &&`
			`(j + by * BLOCK_LENGTH) < this->height_ &&`
			`(k + bz * BLOCK_LENGTH) < this->depth_) {`
			`this->answer_[(i + bx * BLOCK_LENGTH)][(j +`
			`by * BLOCK_LENGTH)][k + bz * BLOCK_LENGTH] = d;`
			`}`
			`}`
			`}`
			`}`
			`}`
			`//}`
			`}`

			`void StructuredEikonal::setItersPerBlock(size_t t) {`
			`this->itersPerBlock_ = t;`
			`}`