Commit source

2023-05-18 16:04:27 +08:00
parent 88cf81e4ea
commit c6cd188732
83 changed files with 39921 additions and 0 deletions
--- a/eikonalGPU/src/StructuredEikonal.cu
+++ b/eikonalGPU/src/StructuredEikonal.cu
@@ -0,0 +1,424 @@
+#include "StructuredEikonal.h"
+#include "omp.h"
+
+StructuredEikonal::StructuredEikonal(bool verbose) 
+:verbose_(verbose), isCudaMemCreated_(false),
+width_(256), height_(256), depth_(256),
+itersPerBlock_(10), solverType_(0) {
+cudaGetDevice(&gpuDevice_);
+}
+
+StructuredEikonal::~StructuredEikonal() {}
+
+void StructuredEikonal::writeNRRD(std::string filename) {
+  std::fstream out(filename.c_str(), std::ios::out | std::ios::binary);
+  out << "NRRD0001\n";
+  out << "# Complete NRRD file format specification at:\n";
+  out << "# http://teem.sourceforge.net/nrrd/format.html\n";
+  out << "type: double\n";
+  out << "dimension: 3\n";
+  out << "sizes: " << this->width_ << " " << this->height_ << " " << this->depth_ << "\n";
+  out << "endian: little\n";
+  out << "encoding: raw\n\n";
+  for(size_t k = 0; k < this->depth_; k++) {
+    for(size_t j = 0; j < this->height_; j++) {
+      for(size_t i = 0; i < this->width_; i++) {
+        double d = this->answer_[i][j][k];
+        out.write(reinterpret_cast<const char*>(&d),sizeof(double));
+      }
+    }
+  }
+  out.close();
+}
+
+void StructuredEikonal::setDims(size_t x, size_t y, size_t z) {
+  this->width_ = x;
+  this->height_ = y;
+  this->depth_ = z;
+}
+
+void StructuredEikonal::error(char* msg) {
+  printf("%s\n",msg);
+  assert(false);
+  exit(0);
+}
+
+void StructuredEikonal::CheckCUDAMemory() {
+  size_t freeMem, totalMem;
+  cudaMemGetInfo(&freeMem, &totalMem);
+  if (this->verbose_) {
+    std::cout << "Total Memory : " << totalMem / (1024 * 1024) << "MB" << std::endl;
+    std::cout << "Free Memory  : " << freeMem / (1024 * 1024) << "MB" << std::endl;
+    std::cout << "--" << std::endl;
+  }
+}
+
+void StructuredEikonal::init_cuda_mem() {
+  assert(this->width_ > 0 && this->height_ > 0 && this->depth_ > 0);
+  if(this->width_ <= 0 || this->height_ <= 0 || this->depth_ <= 0){ 
+    printf("Volume dimension cannot be zero");
+    exit(1);
+  }
+
+  this->CheckCUDAMemory();
+
+  // 1. Create /initialize GPU memory
+  size_t nx, ny, nz;
+
+  nx = this->width_ + (BLOCK_LENGTH-this->width_%BLOCK_LENGTH)%BLOCK_LENGTH;
+  ny = this->height_ + (BLOCK_LENGTH-this->height_%BLOCK_LENGTH)%BLOCK_LENGTH;
+  nz = this->depth_ + (BLOCK_LENGTH-this->depth_%BLOCK_LENGTH)%BLOCK_LENGTH;
+  if (this->verbose_) {
+    printf("%d %d %d \n",nx,ny,nz);
+    printf("BLOCK_LENGTH = %d\n",BLOCK_LENGTH);
+    printf("width/height/depth = %d %d %d \n",this->width_,this->height_,this->depth_);
+  }
+
+  auto volSize = nx*ny*nz;
+  auto blkSize = BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH;
+
+  auto nBlkX = nx / BLOCK_LENGTH;
+  auto nBlkY = ny / BLOCK_LENGTH;
+  auto nBlkZ = nz / BLOCK_LENGTH;
+  auto blockNum = nBlkX*nBlkY*nBlkZ;
+
+  this->memoryStruct_.xdim = static_cast<int>(nx);
+  this->memoryStruct_.ydim = static_cast<int>(ny);
+  this->memoryStruct_.zdim = static_cast<int>(nz);
+  this->memoryStruct_.volsize = static_cast<uint>(volSize);
+  this->memoryStruct_.blksize = static_cast<uint>(blkSize);
+  this->memoryStruct_.blklength = BLOCK_LENGTH;
+  this->memoryStruct_.blknum = static_cast<uint>(blockNum);
+  this->memoryStruct_.nIter = static_cast<int>(this->itersPerBlock_); // iter per block
+
+  if(this->isCudaMemCreated_) // delete previous memory
+  {
+    free((DOUBLE*)this->memoryStruct_.h_sol);
+    free((uint*)this->memoryStruct_.h_list);
+    free((bool*)this->memoryStruct_.h_listed);
+    free((bool*)this->memoryStruct_.h_listVol);
+    free((int*)this->memoryStruct_.blockOrder);
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) );  // temp solution for ping-pong
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) );  // convergence volume
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );
+  }
+  this->isCudaMemCreated_ = true;
+
+  this->memoryStruct_.h_sol = (DOUBLE*) malloc(volSize*sizeof(DOUBLE)); // initial solution
+  this->memoryStruct_.h_list = (uint*) malloc(blockNum*sizeof(uint)); // linear list contains active block indices
+  this->memoryStruct_.h_listed = (bool*) malloc(blockNum*sizeof(bool));  // whether block is added to the list
+  this->memoryStruct_.h_listVol = (bool*) malloc(blockNum*sizeof(bool)); // volume list shows active/nonactive of corresponding block
+  this->memoryStruct_.blockOrder = (int*) malloc(blockNum*sizeof(int));
+
+  this->CheckCUDAMemory();
+
+  //
+  // create host/device memory using CUDA mem functions
+  //
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_spd), volSize*sizeof(double)) );
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_sol), volSize*sizeof(DOUBLE)) );
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.t_sol), volSize*sizeof(DOUBLE)) );  // temp solution for ping-pong
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_con), volSize*sizeof(bool))  );  // convergence volume
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_list), blockNum*sizeof(uint)) );
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_listVol), blockNum*sizeof(bool)) );
+  this->CheckCUDAMemory();
+
+  CUDA_SAFE_CALL( cudaMalloc((void**)&(this->memoryStruct_.d_mask), volSize*sizeof(bool)) );
+  this->CheckCUDAMemory();
+}
+
+
+void StructuredEikonal::freeMemory() {
+    free((DOUBLE*)this->memoryStruct_.h_sol);
+    free((uint*)this->memoryStruct_.h_list);
+    free((bool*)this->memoryStruct_.h_listed);
+    free((bool*)this->memoryStruct_.h_listVol);
+    free((int*)this->memoryStruct_.blockOrder);
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_spd) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_sol) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.t_sol) );  // temp solution for ping-pong
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_con) );  // convergence volume
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_list) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_listVol) );
+    CUDA_SAFE_CALL( cudaFree(this->memoryStruct_.d_mask) );
+}
+
+void StructuredEikonal::set_attribute_mask() {
+  uint volSize = this->memoryStruct_.volsize;
+
+  int nx, ny, nz, blklength;
+
+  nx = memoryStruct_.xdim;
+  ny = memoryStruct_.ydim;
+  nz = memoryStruct_.zdim;
+  blklength = memoryStruct_.blklength;
+
+  // create host memory
+  double *h_spd  = new double[volSize]; // byte speed, host
+  bool  *h_mask = new bool[volSize];
+
+  // copy input volume to host memory
+  // make each block to be stored contiguously in 1D memory space
+  uint idx = 0;
+  for(int zStr = 0; zStr < nz; zStr += blklength) {
+    for(int yStr = 0; yStr < ny; yStr += blklength) {
+      for(int xStr = 0; xStr < nx; xStr += blklength) {
+        // for each block
+        for(int z=zStr; z<zStr+blklength; z++) {
+          for(int y=yStr; y<yStr+blklength; y++) {
+            for(int x=xStr; x<xStr+blklength; x++) {
+              if(x<width_ && y<height_ && z<depth_) {
+                h_spd[idx] = this->speeds_[x][y][z];
+                h_mask[idx] = true;
+              } else {
+                h_spd[idx] = speeds_[0][0][0];
+                h_mask[idx] = false;
+              }
+              idx++;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // initialize GPU memory with host memory
+  CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_spd, h_spd, volSize*sizeof(double), cudaMemcpyHostToDevice) );
+  CUDA_SAFE_CALL( cudaMemcpy(memoryStruct_.d_mask, h_mask, volSize*sizeof(bool), cudaMemcpyHostToDevice) );
+
+  delete[] h_spd;
+  delete[] h_mask;
+}
+
+void StructuredEikonal::initialization() {
+  // get / set CUDA device
+  int deviceCount;
+  cudaGetDeviceCount(&deviceCount);
+  int device;
+  for(device = 0; device < deviceCount; ++device) {
+    cudaDeviceProp deviceProp;
+    cudaGetDeviceProperties(&deviceProp, device);
+  }
+  
+  device = this->gpuDevice_;
+  cudaSetDevice(device);
+
+  this->CheckCUDAMemory();
+  this->init_cuda_mem();
+  this->set_attribute_mask();
+  this->CheckCUDAMemory();
+}
+
+void StructuredEikonal::map_generator() {
+  double pi = 3.141592653589793238462643383;
+  this->speeds_ = std::vector<std::vector<std::vector<double> > >(
+    this->width_, std::vector<std::vector<double> >(
+    this->height_, std::vector<double>(this->depth_,1.)));
+  switch(this->solverType_){
+  case 0 :
+    //Constant Speed Map
+    break;
+  case 1 :
+    //Sinusoid Speed Map
+    for (int k = 0 ; k < this->depth_ ; ++k) {
+      for (int j = 0 ; j < this->height_; ++j) {
+        for ( int i = 0 ; i < this->width_ ; ++i) {
+          this->speeds_[i][j][k] =
+            (6 + 5*(sin((i*pi)/this->width_ *2))*
+            sin((j*pi)/this->height_*2)*
+            sin((k*pi)/this->depth_*2));
+        }
+      }
+    }
+    break;
+  }
+}
+
+void StructuredEikonal::setGpuDevice(int device){
+    this->gpuDevice_=device;
+}
+
+
+void StructuredEikonal::setSeeds(std::vector<std::array<size_t, 3> > seeds) {
+  this->seeds_ = seeds;
+}
+
+void StructuredEikonal::setSpeeds(std::vector<std::vector<std::vector<double> > > speeds) {
+  this->speeds_ = speeds;
+}
+
+
+void StructuredEikonal::useSeeds() {
+  if (this->verbose_) {
+    std::cout << "Loading seed volume..." << std::endl;
+  }
+  uint volSize, blockNum;
+  int nx, ny, nz, blklength;
+
+  nx = this->memoryStruct_.xdim;
+  ny = this->memoryStruct_.ydim;
+  nz = this->memoryStruct_.zdim;
+  volSize = this->memoryStruct_.volsize;
+  blklength = this->memoryStruct_.blklength;
+  blockNum = this->memoryStruct_.blknum;
+
+  // copy input volume to host memory
+  // make each block to be stored contiguously in 1D memory space
+  uint idx = 0;
+  uint blk_idx = 0;
+  uint list_idx = 0;
+  uint nActiveBlock = 0;
+ //#pragma omp parallel num_threads(8) 
+ //{
+ //#pragma omp for
+         
+         
+  //printf("vol size = %i\n",volSize);       
+         
+  for(int zStr = 0; zStr < nz; zStr += blklength) {
+    for(int yStr = 0; yStr < ny; yStr += blklength) {
+      for(int xStr = 0; xStr < nx; xStr += blklength) {
+        // for each block
+        bool isSeedBlock = false;
+
+        for(int z=zStr; z<zStr+blklength; z++) {
+          for(int y=yStr; y<yStr+blklength; y++) {
+            for(int x=xStr; x<xStr+blklength; x++) {
+                //printf("idx = %i\n",idx); 
+
+              this->memoryStruct_.h_sol[idx] = INF;
+
+              if (this->seeds_.empty()) {
+                if (x == nx/2 && y == ny/2 && z == nz/2) {
+                  this->memoryStruct_.h_sol[idx] = 0;
+                  isSeedBlock = true;
+                  if (this->verbose_) {
+                    printf("%d is Selected bt source (if) \n",idx);
+                  }
+                }
+              } else {
+                for(size_t i = 0; i < this->seeds_.size(); i++) {
+                  if (this->seeds_[i][0] == x && 
+                    this->seeds_[i][1] == y && 
+                    this->seeds_[i][2] == z) {
+                    this->memoryStruct_.h_sol[idx] = 0;
+                    isSeedBlock = true;
+                    if (this->verbose_) {
+                      printf("%d is Selected bt source (else) \n",idx);
+                    }
+                  }
+                }
+              }
+              idx++;
+            }
+          }
+        }
+        ///////////////////////////////////////////////
+        if(isSeedBlock) {
+          if (this->verbose_) {
+            printf("%d,%d,%d is Seed Block \n",zStr,yStr,xStr);
+          }
+          this->memoryStruct_.h_listVol[blk_idx] = true;
+          this->memoryStruct_.h_listed[blk_idx] = true;
+          this->memoryStruct_.h_list[list_idx] = blk_idx;
+          list_idx++;
+          nActiveBlock++;
+        } else {
+          this->memoryStruct_.h_listVol[blk_idx] = false;
+          this->memoryStruct_.h_listed[blk_idx] = false;
+        }
+        blk_idx++;
+      }
+    }
+  }
+ //}
+  this->memoryStruct_.nActiveBlock = nActiveBlock;
+  // initialize GPU memory with host memory
+  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
+  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.t_sol, this->memoryStruct_.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
+  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_list, this->memoryStruct_.h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );
+  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.d_listVol, this->memoryStruct_.h_listVol, blockNum*sizeof(bool), cudaMemcpyHostToDevice) );
+  // initialize GPU memory with constant value
+  CUDA_SAFE_CALL( cudaMemset(this->memoryStruct_.d_con, 1, volSize*sizeof(bool)) );
+}
+
+void StructuredEikonal::setMapType(size_t t) {
+    this->solverType_ = t;
+  }
+
+void StructuredEikonal::solveEikonal() {
+  if (this->speeds_.empty()) {
+    this->map_generator();
+  }
+  this->isCudaMemCreated_ = false;
+  this->initialization();
+  this->useSeeds();
+  runEikonalSolverSimple(this->memoryStruct_, this->verbose_);
+  this->get_solution();
+  this->freeMemory();
+}
+
+std::vector< std::vector< std::vector<double> > > 
+  StructuredEikonal::getFinalResult() {
+    return this->answer_;
+  }
+
+void StructuredEikonal::get_solution() {
+  // copy solution from GPU
+  CUDA_SAFE_CALL( cudaMemcpy(this->memoryStruct_.h_sol,
+    this->memoryStruct_.d_sol, this->memoryStruct_.volsize*sizeof(DOUBLE), 
+    cudaMemcpyDeviceToHost) );
+  //put the data where it belongs in the grand scheme of data!
+  this->answer_ = std::vector<std::vector<std::vector<double> > >(
+    this->width_, std::vector<std::vector<double> >( 
+    this->height_, std::vector<double>(this->depth_,0)));
+ 
+ //#pragma omp parallel num_threads(8) 
+ //{
+ //#pragma omp for
+ for(size_t blockID = 0; blockID < this->memoryStruct_.blknum; blockID++) {
+    size_t baseAddr = blockID * this->memoryStruct_.blksize;
+		size_t xgridlength = this->memoryStruct_.xdim/BLOCK_LENGTH;
+		size_t ygridlength = this->memoryStruct_.ydim/BLOCK_LENGTH;
+		// compute block index
+		size_t bx = blockID%xgridlength;
+		size_t tmpIdx = (blockID - bx)/xgridlength;
+		size_t by = tmpIdx%ygridlength;
+		size_t bz = (tmpIdx-by)/ygridlength;
+    //translate back to real space
+    for(int k = 0; k < BLOCK_LENGTH; k++) {
+      for(int j = 0; j < BLOCK_LENGTH; j++) {
+        for(int i = 0; i < BLOCK_LENGTH; i++) {
+          double d = this->memoryStruct_.h_sol[baseAddr + 
+            k * BLOCK_LENGTH * BLOCK_LENGTH + 
+            j * BLOCK_LENGTH + i];
+          if ((i + bx * BLOCK_LENGTH) < this->width_ && 
+            (j + by * BLOCK_LENGTH) < this->height_ &&
+            (k + bz * BLOCK_LENGTH) < this->depth_) {
+            this->answer_[(i + bx * BLOCK_LENGTH)][(j +
+              by * BLOCK_LENGTH)][k + bz * BLOCK_LENGTH] = d;
+          }
+        }
+      }
+    }
+  }
+ //}
+}
+
+void StructuredEikonal::setItersPerBlock(size_t t) {
+  this->itersPerBlock_ = t;
+}
--- a/eikonalGPU/src/StructuredEikonal.h
+++ b/eikonalGPU/src/StructuredEikonal.h
@@ -0,0 +1,55 @@
+#ifndef __STRUCTUREDEIKONAL_H__
+#define __STRUCTUREDEIKONAL_H__
+
+#include <vector>
+#include <array>
+#include <string>
+#include <fstream>
+#include "common_def.h"
+#include "cuda_fim.h"
+
+/** The class that represents all of the available options for StructuredEikonal */
+class StructuredEikonal {
+public:
+  StructuredEikonal(bool verbose = false);
+  virtual ~StructuredEikonal();
+  void setDims(size_t w, size_t h, size_t d);
+  void setMapType(size_t t);
+  void setItersPerBlock(size_t t);
+  void setSpeeds(std::vector<std::vector<std::vector<double> > > speed);
+  void setSeeds(std::vector<std::array<size_t, 3> > seeds);
+  void setGpuDevice(int device);
+  void writeNRRD(std::string filename);
+  void freeMemory();
+  std::vector< std::vector< std::vector<double> > > getFinalResult();
+  /**
+  * Runs the algorithm.
+  *
+  * @data The set of options for the Eikonal algorithm.
+  *       The defaults are used if nothing is provided.
+  */
+  void solveEikonal();
+  //public member for answer
+  std::vector<std::vector<std::vector<double> > > answer_;
+  
+private:
+  void error(char* msg);
+  void CheckCUDAMemory();
+  void init_cuda_mem();
+  void set_attribute_mask();
+  void initialization();
+  void map_generator();
+  void get_solution();
+  void useSeeds();
+  //data
+  bool verbose_;
+  bool isCudaMemCreated_;
+  size_t width_, height_, depth_;
+  size_t itersPerBlock_, solverType_;
+  int gpuDevice_;
+  std::vector<std::array<size_t, 3> > seeds_;
+  CUDAMEMSTRUCT memoryStruct_;
+  std::vector<std::vector<std::vector<double> > > speeds_;
+};
+
+#endif
--- a/eikonalGPU/src/common_def.h
+++ b/eikonalGPU/src/common_def.h
@@ -0,0 +1,78 @@
+//
+// CUDA implementation of FIM (Fast Iterative Method) for Eikonal equations
+//
+// Copyright (c) Won-Ki Jeong (wkjeong@unist.ac.kr)
+//
+// 2016. 2. 4
+//
+
+//
+// Common to entire project
+//
+
+#ifndef __COMMON_DEF_H__
+#define __COMMON_DEF_H__
+
+#include "float.h"
+#include "math.h"
+#include "helper_timer.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <assert.h>
+//
+// common definition for Eikonal solvers
+//
+#ifndef INF
+#define INF 1e20//FLT_MAX //
+#endif
+
+#define BLOCK_LENGTH 4
+
+#ifndef FLOAT
+
+	#define DOUBLE double
+	#define EPS (DOUBLE)1e-16
+
+#else
+
+	#define DOUBLE float
+	#define EPS (DOUBLE)1e-6
+
+#endif
+
+//
+// itk image volume definition for 3D anisotropic eikonal solvers
+//
+typedef unsigned int uint;
+typedef unsigned char uchar;
+
+struct CUDA_MEM_STRUCTURE {
+  // volsize/blksize : # of pixel in volume/block
+  // blknum : # of block
+  // blklength : # of pixel in one dimemsion of block
+  uint nActiveBlock, blknum, volsize, blksize;
+  int xdim, ydim, zdim, nIter, blklength; // new new x,y,z dim to aligh power of 4
+
+  // host memory
+  uint *h_list;
+  bool *h_listVol, *h_listed;
+
+  // device memory
+  uint *d_list;
+  double *d_spd;
+  bool *d_mask, *d_listVol, *d_con;	
+  
+  DOUBLE *h_sol;//h_speedtable[256];
+  DOUBLE *d_sol, *t_sol; 
+
+  // GroupOrder
+  int* blockOrder;
+  int K;
+};
+
+typedef struct CUDA_MEM_STRUCTURE CUDAMEMSTRUCT;
+
+void CUT_SAFE_CALL(cudaError_t error);
+void CUDA_SAFE_CALL(cudaError_t error);
+
+#endif
--- a/eikonalGPU/src/cuda_compute_capability.c
+++ b/eikonalGPU/src/cuda_compute_capability.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2011 Florian Rathgeber, florian.rathgeber@gmail.com
+ *
+ * This code is licensed under the MIT License.  See the FindCUDA.cmake script
+ * for the text of the license.
+ *
+ * Based on code by Christopher Bruns published on Stack Overflow (CC-BY):
+ * http://stackoverflow.com/questions/2285185
+ */
+
+#include <stdio.h>
+#include <cuda_runtime.h>
+
+int main() {
+  int deviceCount, device, major = 9999, minor = 9999;
+  int gpuDeviceCount = 0;
+  struct cudaDeviceProp properties;
+
+  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess)
+    {
+      printf("Couldn't get device count: %s\n", cudaGetErrorString(cudaGetLastError()));
+      return 1;
+    }
+  /* machines with no GPUs can still report one emulation device */
+  for (device = 0; device < deviceCount; ++device) {
+    cudaGetDeviceProperties(&properties, device);
+    if (properties.major != 9999) {/* 9999 means emulation only */
+      ++gpuDeviceCount;
+      /*  get minimum compute capability of all devices */
+      if (major > properties.major) {
+        major = properties.major;
+        minor = properties.minor;
+      } else if (minor > properties.minor) {
+        minor = properties.minor;
+      }
+    }
+  }
+
+  /* don't just return the number of gpus, because other runtime cuda
+     errors can also yield non-zero return values */
+  if (gpuDeviceCount > 0) {
+    if (major == 2 && minor == 1)
+      {
+        // There is no --arch compute_21 flag for nvcc, so force minor to 0
+        minor = 0;
+      }
+    /* this output will be parsed by FindCUDA.cmake */
+    printf("%d%d", major, minor);
+    return 0; /* success */
+  }
+  return 1; /* failure */
+}
--- a/eikonalGPU/src/cuda_fim.cu
+++ b/eikonalGPU/src/cuda_fim.cu
@@ -0,0 +1,326 @@
+//
+// CUDA implementation of FIM (Fast Iterative Method) for Eikonal equations
+//
+// Copyright (c) Won-Ki Jeong (wkjeong@unist.ac.kr)
+//
+// 2016. 2. 4
+//
+
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <float.h>
+#include <assert.h>
+#include <vector>
+#include <iostream>
+#include "cuda_fim_kernel.h"
+#include "cuda_fim.h"
+
+void CUT_SAFE_CALL(cudaError_t error) {
+	if(error != cudaSuccess)
+	{
+		printf("CUDA error! %d \n",error);
+		exit(EXIT_FAILURE);
+	}
+}
+
+void CUDA_SAFE_CALL(cudaError_t error) {
+	CUT_SAFE_CALL(error);
+}
+
+void runEikonalSolverSimple(CUDAMEMSTRUCT &cmem, bool verbose)
+{
+  int deviceID;
+  cudaGetDevice(&deviceID);
+  cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp, deviceID);
+  if (verbose) {
+    printf("Current device id : %d, name : %s\n", deviceID, deviceProp.name);
+  }
+
+  int xdim, ydim, zdim;
+  xdim = cmem.xdim;
+  ydim = cmem.ydim;
+  zdim = cmem.zdim;
+
+#ifdef __DEVICE_EMULATION__
+  assert(xdim%BLOCK_LENGTH == 0);
+  assert(ydim%BLOCK_LENGTH == 0);
+  assert(zdim%BLOCK_LENGTH == 0);
+#endif
+
+  // create volumes
+  uint volSize = cmem.volsize;
+  uint blockNum = cmem.blknum;
+
+  //int nBlkZ = zdim/BLOCK_LENGTH;
+
+  if (verbose) {
+    printf("# of total voxels : %d\n", volSize);
+    printf("# of total blocks : %d\n", blockNum);
+  }
+
+  // h_ : host memory, d_ : device memory
+
+  // copy speed table to constant variable
+  //cudaMemcpyToSymbol((const char*)_speed, cmem.h_speedtable, sizeof(float)*256);
+
+  int nIter = cmem.nIter;
+  uint nActiveBlock = cmem.nActiveBlock; // active list
+
+  double *d_spd = cmem.d_spd;
+  DOUBLE *d_sol = cmem.d_sol;
+  DOUBLE *t_sol = cmem.t_sol;
+
+  uint *d_list = cmem.d_list;
+  bool *d_listVol = cmem.d_listVol;
+
+  bool *d_con = cmem.d_con;
+  bool *d_mask = cmem.d_mask;
+
+  // copy so that original value should not be modified
+  uint *h_list = (uint*) malloc(blockNum*sizeof(uint));
+  bool *h_listed = (bool*) malloc(blockNum*sizeof(bool));
+  bool *h_listVol = (bool*) malloc(blockNum*sizeof(bool));
+
+  // initialization
+  memcpy(h_list, cmem.h_list, blockNum*sizeof(uint));
+  memcpy(h_listed, cmem.h_listed, blockNum*sizeof(bool));
+  memcpy(h_listVol, cmem.h_listVol, blockNum*sizeof(bool));
+
+  CUT_SAFE_CALL( cudaMemcpy(cmem.d_list, cmem.h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );
+  CUT_SAFE_CALL( cudaMemcpy(cmem.d_listVol, cmem.h_listVol, blockNum*sizeof(bool), cudaMemcpyHostToDevice) );
+  CUT_SAFE_CALL( cudaMemcpy(cmem.d_sol, cmem.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
+  CUT_SAFE_CALL( cudaMemcpy(cmem.t_sol, cmem.h_sol, volSize*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
+  CUT_SAFE_CALL( cudaMemset(cmem.d_con, 1, volSize*sizeof(bool)) );
+
+  // set dimension of block and entire grid size
+  dim3 dimBlock(BLOCK_LENGTH,BLOCK_LENGTH,BLOCK_LENGTH);
+  dim3 dimEntireGrid(blockNum);
+  dim3 dimGrid(nActiveBlock);
+
+  int nTotalIter = 0;
+  //uint sharedmemsize = sizeof(float)*BLOCK_LENGTH*BLOCK_LENGTH*(3*BLOCK_LENGTH + 2);
+
+  std::vector<int> sourceList;
+  sourceList.push_back((zdim/2)*ydim*xdim + (ydim/2)*xdim + (xdim/2));
+
+  // initialize & start timer
+  StopWatchInterface *timer_total, *timer_solver, *timer_reduction, *timer_list, *timer_list2, *timer_coarse;
+  timer_total = timer_solver = timer_reduction = timer_list = timer_list2 = timer_coarse = NULL;
+
+  sdkCreateTimer(&timer_total);
+  sdkCreateTimer(&timer_solver);
+  sdkCreateTimer(&timer_reduction);
+  sdkCreateTimer(&timer_list);
+  sdkCreateTimer(&timer_list2);
+  sdkCreateTimer(&timer_coarse);
+
+  sdkStartTimer(&timer_total);
+
+  uint nTotalBlockProcessed = 0;
+
+  int maxIter = 1000;
+  
+  // start solver
+  while(nActiveBlock > 0 && nTotalIter < maxIter)
+  {
+    //CUT_SAFE_CALL( cudaMemcpy(cmem.d_sol, cmem.h_sol, 64*nActiveBlock*sizeof(DOUBLE), cudaMemcpyHostToDevice) );
+    assert(nActiveBlock < 4294967295);
+
+    nTotalBlockProcessed += nActiveBlock;
+
+    nTotalIter++;
+
+    //
+    // solve current blocks in the active lists
+    //
+
+    //      printf("# of active tiles : %u\n", nActiveBlock);
+    if (verbose) {
+      printf("# of active tiles : %u\n", nActiveBlock);
+    }
+    //////////////////////////////////////////////////////////////////
+    // 1. run solver on current active tiles
+
+    sdkStartTimer(&timer_solver);
+
+    dimGrid.y = (unsigned int)floor(((double)nActiveBlock-1)/65535)+1;
+    dimGrid.x = (unsigned int)ceil ((double)nActiveBlock/(double)dimGrid.y);
+
+    if (verbose) {
+      printf("Grid size : %d x %d\n", dimGrid.x, dimGrid.y);
+    }
+
+    CUT_SAFE_CALL( cudaMemcpy(d_list, h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );
+
+    run_solver<<< dimGrid, dimBlock >>>(d_spd, d_mask, d_sol, t_sol, d_con, d_list, xdim, ydim, zdim, nIter, nActiveBlock);
+
+    CUT_SAFE_CALL(cudaGetLastError());
+
+    cudaDeviceSynchronize();
+    sdkStopTimer(&timer_solver);
+
+
+    //////////////////////////////////////////////////////////////////
+    // 2. reduction (only active tiles)
+
+    sdkStartTimer(&timer_reduction);
+
+    run_reduction<<< dimGrid, dim3(BLOCK_LENGTH,BLOCK_LENGTH,BLOCK_LENGTH/2) >>>(d_con, d_listVol, d_list, nActiveBlock);
+
+    CUT_SAFE_CALL(cudaGetLastError());
+    //CUT_CHECK_ERROR("Kernel execution failed");
+    cudaDeviceSynchronize();
+
+    sdkStopTimer(&timer_reduction);
+
+
+    //////////////////////////////////////////////////////////////////
+    // 3. check neighbor tiles of converged tile
+    // Add any active block of neighbor of converged block is inserted
+    // to the list
+
+    // read-back active list volume
+#ifdef TIMER
+    sdkStartTimer(&timer_list);
+#endif
+    CUT_SAFE_CALL( cudaMemcpy(h_listVol, d_listVol, blockNum*sizeof(bool), cudaMemcpyDeviceToHost) );
+
+    uint nOldActiveBlock = nActiveBlock;
+    uint nBlkX = xdim/BLOCK_LENGTH;
+    uint nBlkY = ydim/BLOCK_LENGTH;
+
+    for(uint i=0; i<nOldActiveBlock; i++)
+    {
+      // check 6-neighbor of current active tile
+      uint currBlkIdx = h_list[i];
+
+      if(!h_listVol[currBlkIdx]) // not active : converged
+      {
+        uint nb[6];
+        nb[0] = (currBlkIdx < nBlkX*nBlkY) ? currBlkIdx : (currBlkIdx - nBlkX*nBlkY);  //tp
+        nb[1] = ((currBlkIdx + nBlkX*nBlkY) >= blockNum) ? currBlkIdx : (currBlkIdx + nBlkX*nBlkY); //bt
+        nb[2] = (currBlkIdx < nBlkX) ? currBlkIdx : (currBlkIdx - nBlkX); //up
+        nb[3] = ((currBlkIdx + nBlkX) >= blockNum) ? currBlkIdx : (currBlkIdx + nBlkX); //dn
+        nb[4] = (currBlkIdx%nBlkX == 0) ? currBlkIdx : currBlkIdx-1; //lf
+        nb[5] = ((currBlkIdx+1)%nBlkX == 0) ? currBlkIdx : currBlkIdx+1; //rt
+
+        for(int nbIdx = 0; nbIdx < 6; nbIdx++)
+        {
+          uint currIdx = nb[nbIdx];
+
+          //  assert(currIdx < volSize);
+
+          if(!h_listed[currIdx])
+          {
+            h_listed[currIdx] = true;
+            h_list[nActiveBlock++] = currIdx;
+          }
+        }
+      }
+    }
+    cudaDeviceSynchronize();
+#ifdef TIMER
+    sdkStopTimer(&timer_list);
+#endif
+
+
+    //////////////////////////////////////////////////////////////////
+    // 4. run solver only once for neighbor blocks of converged block
+    // current active list contains active blocks and neighbor blocks of
+    // any converged blocks.
+    //
+
+#ifdef TIMER
+    sdkStartTimer(&timer_solver);
+#endif
+
+    // update grid dimension because nActiveBlock is changed
+    dimGrid.y = (unsigned int)floor(((double)nActiveBlock-1)/65535)+1;
+    dimGrid.x = (unsigned int)ceil((double)nActiveBlock/(double)dimGrid.y);
+
+    if (verbose) {
+      printf("Grid size : %d x %d\n", dimGrid.x, dimGrid.y);
+    }
+
+    CUT_SAFE_CALL(cudaMemcpy(d_list, h_list, nActiveBlock*sizeof(uint), cudaMemcpyHostToDevice) );
+    run_check_neighbor<<< dimGrid, dimBlock >>>(d_spd, d_mask, t_sol, d_sol, d_con, d_list, xdim, ydim, zdim, nOldActiveBlock, nActiveBlock);
+    CUT_SAFE_CALL(cudaGetLastError());
+    cudaDeviceSynchronize();
+
+#ifdef TIMER
+    sdkStopTimer(&timer_solver);
+#endif
+
+
+
+    //////////////////////////////////////////////////////////////////
+    // 5. reduction
+
+    sdkStartTimer(&timer_reduction);
+
+    run_reduction<<< dimGrid, dim3(BLOCK_LENGTH,BLOCK_LENGTH,BLOCK_LENGTH/2) >>>(d_con, d_listVol, d_list, nActiveBlock);
+    CUT_SAFE_CALL(cudaGetLastError());
+    cudaDeviceSynchronize();
+
+    sdkStopTimer(&timer_reduction);
+
+
+    //////////////////////////////////////////////////////////////////
+    // 6. update active list
+    // read back active volume from the device and add
+    // active block to active list on the host memory
+
+
+#ifdef TIMER
+    sdkStartTimer(&timer_list2);
+#endif
+    nActiveBlock = 0;
+    CUT_SAFE_CALL( cudaMemcpy(h_listVol, d_listVol, blockNum*sizeof(bool), cudaMemcpyDeviceToHost) );
+
+    for(uint i=0; i<blockNum; i++)
+    {
+      if(h_listVol[i]) // true : active block (not converged)
+      {
+        h_listed[i] = true;
+        h_list[nActiveBlock++] = i;
+        //printf("Block %d added\n", i);
+      }
+      else h_listed[i] = false;
+    }
+    cudaDeviceSynchronize();
+#ifdef TIMER
+    sdkStopTimer(&timer_list2);
+#endif
+
+
+    if (verbose) {
+      printf("Iteration : %d\n", nTotalIter);
+    }
+  }
+  sdkStopTimer(&timer_total);
+
+  if (verbose) {
+    printf("Eikonal solver converged after %d iterations\n", nTotalIter);
+    printf("Total Running Time: %f (sec)\n", sdkGetTimerValue(&timer_total) / 1000);
+    printf("Time for solver : %f (sec)\n", sdkGetTimerValue(&timer_solver) / 1000);
+    printf("Time for reduction : %f (sec)\n", sdkGetTimerValue(&timer_reduction) / 1000);
+#ifdef TIMER
+    printf("Time for list update-1 (CPU) : %f (sec)\n", sdkGetTimerValue(&timer_list) / 1000);
+    printf("Time for list update-2 (CPU) : %f (sec)\n", sdkGetTimerValue(&timer_list2) / 1000);
+
+#endif
+    printf("Total # of blocks processed : %d\n", nTotalBlockProcessed);
+  }
+
+  sdkDeleteTimer(&timer_total);
+  sdkDeleteTimer(&timer_solver);
+  sdkDeleteTimer(&timer_reduction);
+  sdkDeleteTimer(&timer_list);
+
+  // delete dynamically allocated host memory
+  free(h_list);
+  free(h_listed);
+  free(h_listVol);
+}
--- a/eikonalGPU/src/cuda_fim.h
+++ b/eikonalGPU/src/cuda_fim.h
@@ -0,0 +1,20 @@
+//
+// CUDA implementation of FIM (Fast Iterative Method) for Eikonal equations
+//
+// Copyright (c) Won-Ki Jeong (wkjeong@unist.ac.kr)
+//
+// 2016. 2. 4
+//
+#ifndef __CUDA_FIM_H__
+#define __CUDA_FIM_H__
+
+#include <cstdlib>
+#include "common_def.h"
+
+#define TIMER
+
+void CUT_SAFE_CALL(cudaError_t error);
+void CUDA_SAFE_CALL(cudaError_t error);
+void runEikonalSolverSimple(CUDAMEMSTRUCT &cmem, bool verbose);
+
+#endif
--- a/eikonalGPU/src/cuda_fim_kernel.cu
+++ b/eikonalGPU/src/cuda_fim_kernel.cu
@@ -0,0 +1,431 @@
+//
+// CUDA implementation of FIM (Fast Iterative Method) for Eikonal equations
+//
+// Copyright (c) Won-Ki Jeong (wkjeong@unist.ac.kr)
+//
+// 2016. 2. 4
+//
+
+#include "cuda_fim_kernel.h"
+
+__device__ DOUBLE get_time_eikonal(DOUBLE a, DOUBLE b, DOUBLE c, DOUBLE s)
+{
+	DOUBLE ret, tmp;
+
+	// a > b > c
+	if(a < b) { tmp = a; a = b; b = tmp; }
+	if(b < c) { tmp = b; b = c; c = tmp; }
+	if(a < b) { tmp = a; a = b; b = tmp; }
+
+	ret = INF;
+
+	if(c < INF)
+	{
+		ret = c + s;
+		
+		if(ret > b) 
+		{	
+			tmp = ((b+c) + sqrtf(2.0f*s*s-(b-c)*(b-c)))*0.5f;
+		
+			if(tmp > b) ret = tmp; 
+
+			if(ret > a)	{			
+				tmp = (a+b+c)/3.0f + sqrtf(2.0f*(a*(b-a)+b*(c-b)+c*(a-c))+3.0f*s*s)/3.0f; 
+
+				if(tmp > a) ret = tmp;
+			}
+		}
+	}
+
+	return ret;
+}
+
+__global__ void run_solver(double* spd, bool* mask, const DOUBLE *sol_in, DOUBLE *sol_out, bool *con, uint* list, int xdim, int ydim, int zdim, int nIter, uint nActiveBlock)
+{
+	uint list_idx = blockIdx.y*gridDim.x + blockIdx.x;
+
+	if(list_idx < nActiveBlock)
+	{
+		// retrieve actual block index from the active list
+		uint block_idx = list[list_idx];
+
+		double F;
+		bool isValid;
+		uint blocksize = BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH;
+		uint base_addr = block_idx*blocksize;
+
+		uint xgridlength = xdim/BLOCK_LENGTH;
+		uint ygridlength = ydim/BLOCK_LENGTH;
+		uint zgridlength = zdim/BLOCK_LENGTH;
+
+		// compute block index
+		uint bx = block_idx%xgridlength;
+		uint tmpIdx = (block_idx - bx)/xgridlength;
+		uint by = tmpIdx%ygridlength;
+		uint bz = (tmpIdx-by)/ygridlength;
+
+		uint tx = threadIdx.x;
+		uint ty = threadIdx.y;
+		uint tz = threadIdx.z;
+		uint tIdx = tz*BLOCK_LENGTH*BLOCK_LENGTH + ty*BLOCK_LENGTH + tx;
+
+		__shared__ DOUBLE _sol[BLOCK_LENGTH+2][BLOCK_LENGTH+2][BLOCK_LENGTH+2];
+
+		// copy global to shared memory
+		dim3 idx(tx+1,ty+1,tz+1);
+
+		SOL(idx.x,idx.y,idx.z) = sol_in[base_addr + tIdx];
+		//F = _speed[(int)spd[base_addr + tIdx]];
+		F = spd[base_addr + tIdx];
+		if(F > 0) F = 1.0/F; // F = 1/f
+		isValid = mask[base_addr + tIdx];
+
+		uint new_base_addr, new_tIdx;
+
+		// 1-neighborhood values
+		if(tx == 0) 
+		{
+			if(bx == 0) // end of the grid
+			{	
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx + BLOCK_LENGTH-1;
+				new_base_addr = (block_idx - 1)*blocksize;	
+			}
+
+			SOL(tx,idx.y,idx.z) = sol_in[new_base_addr + new_tIdx];	
+		}
+
+		if(tx == BLOCK_LENGTH-1)
+		{
+			if(bx == xgridlength-1) // end of the grid
+			{
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx - (BLOCK_LENGTH-1);
+				new_base_addr = (block_idx + 1)*blocksize;	
+			}
+			SOL(tx+2,idx.y,idx.z) = sol_in[new_base_addr + new_tIdx];	
+		}
+
+		if(ty == 0)
+		{
+			if(by == 0)
+			{
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx + (BLOCK_LENGTH-1)*BLOCK_LENGTH;
+				new_base_addr = (block_idx - xgridlength)*blocksize;
+			}
+
+			SOL(idx.x,ty,idx.z) = sol_in[new_base_addr + new_tIdx];
+		}
+
+		if(ty == BLOCK_LENGTH-1)
+		{
+			if(by == ygridlength-1) 
+			{
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx - (BLOCK_LENGTH-1)*BLOCK_LENGTH;
+				new_base_addr = (block_idx + xgridlength)*blocksize;
+			}
+
+			SOL(idx.x,ty+2,idx.z) = sol_in[new_base_addr + new_tIdx];
+		}
+
+		if(tz == 0)
+		{
+			if(bz == 0)
+			{
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx + (BLOCK_LENGTH-1)*BLOCK_LENGTH*BLOCK_LENGTH;
+				new_base_addr = (block_idx - xgridlength*ygridlength)*blocksize;
+			}
+
+			SOL(idx.x,idx.y,tz) = sol_in[new_base_addr + new_tIdx];
+		}
+
+		if(tz == BLOCK_LENGTH-1)
+		{
+			if(bz == zgridlength-1) 
+			{
+				new_tIdx = tIdx;
+				new_base_addr = base_addr;
+			}
+			else
+			{
+				new_tIdx = tIdx - (BLOCK_LENGTH-1)*BLOCK_LENGTH*BLOCK_LENGTH;
+				new_base_addr = (block_idx + xgridlength*ygridlength)*blocksize;
+			}
+
+			SOL(idx.x,idx.y,tz+2) = sol_in[new_base_addr + new_tIdx];
+		}
+
+		__syncthreads();
+
+		DOUBLE a,b,c,oldT,newT;
+
+		for(int iter=0; iter<nIter; iter++)	
+		{
+			//
+			// compute new value
+			//
+			oldT = newT = SOL(idx.x,idx.y,idx.z);
+
+			if(isValid)
+			{
+				a = min(SOL(tx,idx.y,idx.z),SOL(tx+2,idx.y,idx.z));
+				b = min(SOL(idx.x,ty,idx.z),SOL(idx.x,ty+2,idx.z));
+				c = min(SOL(idx.x,idx.y,tz),SOL(idx.x,idx.y,tz+2));
+
+				DOUBLE tmp = (DOUBLE) get_time_eikonal(a, b, c, F);
+
+				newT = min(tmp,oldT);
+			}
+			__syncthreads();	
+
+			if(isValid) SOL(idx.x,idx.y,idx.z) = newT;
+
+			__syncthreads(); // this may not required		
+		}
+
+
+		DOUBLE residue = oldT - newT;
+
+		// write back to global memory
+		con[base_addr + tIdx] = (residue < EPS) ? true : false;
+		sol_out[base_addr + tIdx] = newT;		
+	}
+}
+
+__global__ void run_reduction(bool *con, bool *listVol, uint *list, uint nActiveBlock)
+{
+	uint list_idx = blockIdx.y*gridDim.x + blockIdx.x;
+
+	if(list_idx < nActiveBlock)
+	{
+		uint block_idx = list[list_idx];
+
+		__shared__ bool conv[BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH];
+
+		uint blocksize = BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH/2;
+		uint base_addr = block_idx*blocksize*2;
+		uint tx = threadIdx.x;
+		uint ty = threadIdx.y;
+		uint tz = threadIdx.z;
+		uint tIdx = tz*BLOCK_LENGTH*BLOCK_LENGTH + ty*BLOCK_LENGTH + tx;
+
+		conv[tIdx] = con[base_addr + tIdx];
+		conv[tIdx + blocksize] = con[base_addr + tIdx + blocksize];
+
+		__syncthreads();
+
+		for(uint i=blocksize; i>0; i/=2)
+		{
+			if(tIdx < i)
+			{
+				bool b1, b2;
+				b1 = conv[tIdx];
+				b2 = conv[tIdx+i];
+				conv[tIdx] = (b1 && b2) ? true : false ;
+			}
+			__syncthreads();
+		}
+
+		if(tIdx == 0) 
+		{		
+			listVol[block_idx] = !conv[0]; // active list is negation of tile convergence (active = not converged)
+		}
+	}
+}
+
+__global__ void run_check_neighbor(double* spd, bool* mask, const DOUBLE *sol_in, DOUBLE *sol_out, 
+																	 bool *con, uint* list, int xdim, int ydim, int zdim,
+																	 uint nActiveBlock, uint nTotalBlock)
+{
+
+	uint list_idx = blockIdx.y*gridDim.x + blockIdx.x;
+
+	if(list_idx < nTotalBlock)
+	{
+		double F;
+		bool isValid;
+		__shared__ DOUBLE _sol[BLOCK_LENGTH+2][BLOCK_LENGTH+2][BLOCK_LENGTH+2];
+
+		uint block_idx = list[list_idx];
+		uint blocksize = BLOCK_LENGTH*BLOCK_LENGTH*BLOCK_LENGTH;
+		uint base_addr = block_idx*blocksize;
+
+		uint tx = threadIdx.x;
+		uint ty = threadIdx.y;
+		uint tz = threadIdx.z;
+		uint tIdx = tz*BLOCK_LENGTH*BLOCK_LENGTH + ty*BLOCK_LENGTH + tx;
+
+		if(list_idx < nActiveBlock) // copy value
+		{
+			sol_out[base_addr + tIdx] = sol_in[base_addr + tIdx];
+		} 
+		else
+		{
+			uint xgridlength = xdim/BLOCK_LENGTH;
+			uint ygridlength = ydim/BLOCK_LENGTH;
+			uint zgridlength = zdim/BLOCK_LENGTH;
+
+			// compute block index
+			uint bx = block_idx%xgridlength;
+			uint tmpIdx = (block_idx - bx)/xgridlength;
+			uint by = tmpIdx%ygridlength;
+			uint bz = (tmpIdx-by)/ygridlength;
+
+#ifdef __DEVICE_EMULATION__
+			assert(block_idx == bz*xgridlength*ygridlength + by*xgridlength + bx);
+			printf("Block %d's index  : %d, %d, %d\n", block_idx, bx, by, bz);
+			printf("Thread %d's index : %d, %d, %d\n", tIdx, tx, ty, tz);
+#endif
+
+			// copy global to shared memory
+			dim3 idx(tx+1,ty+1,tz+1);
+			_sol[idx.x][idx.y][idx.z] = sol_in[base_addr + tIdx];
+			F = spd[base_addr + tIdx];
+			if(F > 0) F = 1.0/F;
+			isValid = mask[base_addr + tIdx];
+
+			uint new_base_addr, new_tIdx;
+
+			// 1-neighborhood values
+			if(tx == 0) 
+			{
+				if(bx == 0) // end of the grid
+				{	
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx + BLOCK_LENGTH-1;
+					new_base_addr = (block_idx - 1)*blocksize;	
+				}
+				_sol[tx][idx.y][idx.z] = sol_in[new_base_addr + new_tIdx];	
+			}
+
+			if(tx == BLOCK_LENGTH-1)
+			{
+				if(bx == xgridlength-1) // end of the grid
+				{
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx - (BLOCK_LENGTH-1);
+					new_base_addr = (block_idx + 1)*blocksize;	
+				}
+				_sol[tx+2][idx.y][idx.z] = sol_in[new_base_addr + new_tIdx];	
+			}
+
+			if(ty == 0)
+			{
+				if(by == 0)
+				{
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx + (BLOCK_LENGTH-1)*BLOCK_LENGTH;
+					new_base_addr = (block_idx - xgridlength)*blocksize;
+				}
+				_sol[idx.x][ty][idx.z] = sol_in[new_base_addr + new_tIdx];
+			}
+
+			if(ty == BLOCK_LENGTH-1)
+			{
+				if(by == ygridlength-1) 
+				{
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx - (BLOCK_LENGTH-1)*BLOCK_LENGTH;
+					new_base_addr = (block_idx + xgridlength)*blocksize;
+				}
+				_sol[idx.x][ty+2][idx.z] = sol_in[new_base_addr + new_tIdx];
+			}
+
+			if(tz == 0)
+			{
+				if(bz == 0)
+				{
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx + (BLOCK_LENGTH-1)*BLOCK_LENGTH*BLOCK_LENGTH;
+					new_base_addr = (block_idx - xgridlength*ygridlength)*blocksize;
+				}
+				_sol[idx.x][idx.y][tz] = sol_in[new_base_addr + new_tIdx];
+			}
+
+			if(tz == BLOCK_LENGTH-1)
+			{
+				if(bz == zgridlength-1) 
+				{
+					new_tIdx = tIdx;
+					new_base_addr = base_addr;
+				}
+				else
+				{
+					new_tIdx = tIdx - (BLOCK_LENGTH-1)*BLOCK_LENGTH*BLOCK_LENGTH;
+					new_base_addr = (block_idx + xgridlength*ygridlength)*blocksize;
+				}
+				_sol[idx.x][idx.y][tz+2] = sol_in[new_base_addr + new_tIdx];
+			}
+
+			__syncthreads();
+
+
+			DOUBLE a,b,c,oldT,newT;
+
+			//
+			// compute new value
+			//
+			oldT = newT = _sol[idx.x][idx.y][idx.z];
+
+			if(isValid)
+			{
+				a = min(_sol[tx][idx.y][idx.z],_sol[tx+2][idx.y][idx.z]);
+				b = min(_sol[idx.x][ty][idx.z],_sol[idx.x][ty+2][idx.z]);
+				c = min(_sol[idx.x][idx.y][tz],_sol[idx.x][idx.y][tz+2]);
+
+				DOUBLE tmp = (DOUBLE) get_time_eikonal(a, b, c, F);
+				newT = min(tmp,oldT);
+
+				sol_out[base_addr + tIdx] = newT;
+			}
+			// write back to global memory
+			DOUBLE residue = oldT - newT;
+			con[base_addr + tIdx] = (residue < EPS) ? true : false;	
+		}
+	}
+}
+
+
--- a/eikonalGPU/src/cuda_fim_kernel.h
+++ b/eikonalGPU/src/cuda_fim_kernel.h
@@ -0,0 +1,53 @@
+//
+// CUDA implementation of FIM (Fast Iterative Method) for Eikonal equations
+//
+// Copyright (c) Won-Ki Jeong (wkjeong@unist.ac.kr)
+//
+// 2016. 2. 4
+//
+
+#ifndef _cuda_fim_KERNEL_H_
+#define _cuda_fim_KERNEL_H_
+
+#include <cstdio>
+#include "common_def.h"
+
+// check bank confilct only when device emulation mode
+#ifdef __DEVICE_EMULATION__
+#define CHECK_BANK_CONFLICTS
+#endif
+
+#ifdef CHECK_BANK_CONFLICTS
+#define MEM(index) CUT_BANK_CHECKER(_mem, index)
+#define SOL(i,j,k) CUT_BANK_CHECKER(((float*)&_sol[0][0][0]), (k*(BLOCK_LENGTH+2)*(BLOCK_LENGTH+2) + j*(BLOCK_LENGTH+2) + i))
+#define SPD(i,j,k) CUT_BANK_CHECKER(((float*)&_spd[0][0][0]), (k*(BLOCK_LENGTH)*(BLOCK_LENGTH) + j*(BLOCK_LENGTH) + i))
+#else
+#define MEM(index) _mem[index]
+#define SOL(i,j,k) _sol[i][j][k]
+#define SPD(i,j,k) _spd[i][j][k]
+#endif
+
+__device__ DOUBLE get_time_eikonal(DOUBLE a, DOUBLE b, DOUBLE c, DOUBLE s);
+//
+// F : Input speed (positive)
+// if F =< 0, skip that pixel (masking out)
+//
+__global__ void run_solver(double* spd, bool* mask, const DOUBLE *sol_in, 
+  DOUBLE *sol_out, bool *con, uint* list, int xdim, int ydim, int zdim,
+  int nIter, uint nActiveBlock);
+//
+// run_reduction
+//
+// con is pixelwise convergence. Do reduction on active tiles and write tile-wise
+// convergence to listVol. The implementation assumes that the block size is 4x4x4.
+//
+__global__ void run_reduction(bool *con, bool *listVol, uint *list, uint nActiveBlock);
+//
+// if block is active block, copy values
+// if block is neighbor, run solver once
+//
+__global__ void run_check_neighbor(double* spd, bool* mask, const DOUBLE *sol_in, DOUBLE *sol_out,
+  bool *con, uint* list, int xdim, int ydim, int zdim,
+  uint nActiveBlock, uint nTotalBlock);
+#endif // #ifndef _cuda_fim_KERNEL_H_
+
--- a/eikonalGPU/src/eikonal.cpp
+++ b/eikonalGPU/src/eikonal.cpp
@@ -0,0 +1,183 @@
+#include "eikonal.h"
+
+#include <vector>
+#include <time.h>
+
+#include "StructuredEikonal.h"
+
+#define mwSize int
+
+// #define timeMeasure 1
+// #define verbose 1
+
+double* eikonal(double* volume,int* dims,double* startpoint) {
+
+    // for time measurement
+    #ifdef timeMeasure
+        clock_t begin, end;
+        float z;
+    #endif
+
+    // default parameters for number of threads and iters per block
+    size_t itersPerBlock = 10;
+    int nthreads=8;
+    int gpuSelection;
+    bool structuredEikonalVerbose = false;
+    #ifdef verbose
+        structuredEikonalVerbose = true;
+    #endif
+
+    #ifdef timeMeasure
+        begin = clock();
+    #endif
+
+
+    /* check if 3D and adapted dimension to size = 1 if 2D
+       (mxGetDimensions reduces dim[2] to 0 if 2D map is given) */
+    mwSize dimsAdapted[3] = {0,0,0};
+
+
+    if(dims[0] == 0) {
+        dimsAdapted[0] = 1;
+    } else {
+        dimsAdapted[0] = dims[0];
+    }
+    if(dims[1] == 0) {
+        dimsAdapted[1] = 1;
+    }else {
+        dimsAdapted[1] = dims[1];
+    }
+    if(dims[2] == 0) {
+        dimsAdapted[2] = 1;
+    }else {
+        dimsAdapted[2] = dims[2];
+    }
+
+
+    /* Create output data structures */
+    double* out = new double[dimsAdapted[0]*dimsAdapted[1]*dimsAdapted[2]]{0};
+
+
+
+    #ifdef timeMeasure
+        end = clock();
+        z=end - begin;
+        z/=CLOCKS_PER_SEC;
+        printf("Time init:                                 %f sec.\n", z);
+
+        begin = clock();
+    #endif
+
+
+    /* vector data structure for input volume */
+    std::vector<std::vector<std::vector<double> > > input;
+    input = std::vector<std::vector<std::vector<double> > >(
+        dimsAdapted[0], std::vector<std::vector<double> >(
+        dimsAdapted[1], std::vector<double>(dimsAdapted[2],1.)));
+
+
+    int linInd;
+    int linInd_k;
+    int linInd_j;
+
+    /* sort input volume input vector data structure */
+    #pragma omp parallel num_threads(nthreads)
+    {
+    #pragma omp for
+    for(size_t k = 0; k < dimsAdapted[2]; k++) {
+        linInd_k = k*dimsAdapted[0]*dimsAdapted[1];
+        for(size_t j = 0; j < dimsAdapted[1]; j++) {
+            linInd_j = linInd_k+j*dimsAdapted[0];
+            for(size_t i = 0; i < dimsAdapted[0]; i++) {
+                linInd = linInd_j + i;
+                input[i][j][k] = volume[linInd];
+            }
+        }
+    }
+    }
+
+    #ifdef timeMeasure
+        end = clock();
+        z=end - begin;
+        z/=CLOCKS_PER_SEC;
+        printf("Time vector init and data copy:            %f sec.\n", z);
+
+        begin = clock();
+    #endif
+
+
+
+
+
+
+    /* init the Eikonal solver */
+    // create object
+    StructuredEikonal data(structuredEikonalVerbose);
+
+    // set the dimensions of the input volume
+    data.setDims(dimsAdapted[0],dimsAdapted[1],dimsAdapted[2]);
+
+    // setting the number of iterations per block (default = 10)
+    data.setItersPerBlock(itersPerBlock);
+
+    // setting the start point for the Eikonal propagation 
+    data.setSeeds({ { { { (size_t) startpoint[0], (size_t) startpoint[1], (size_t) startpoint[2] } } } });
+
+    // setting the input data (aka speeds) to the Eikonal solver
+    data.setSpeeds(input);
+
+
+    #ifdef timeMeasure
+        end = clock();
+        z=end - begin;
+        z/=CLOCKS_PER_SEC;
+        printf("Time Eikonal init:                         %f sec.\n", z);
+
+        begin = clock();
+    #endif
+
+
+    /* Solve Eikonal and get the results */
+    data.solveEikonal();
+
+
+    #ifdef timeMeasure
+        end = clock();
+        z=end - begin;
+        z/=CLOCKS_PER_SEC;
+        printf("Time Eikonal solve:                        %f sec.\n", z);
+
+        begin = clock();
+    #endif
+
+
+    /* sort vector into output array. */
+    #pragma omp parallel num_threads(nthreads)
+    {
+    #pragma omp for
+    for(size_t k = 0; k < dimsAdapted[2]; k++) {
+        linInd_k = k*dimsAdapted[0]*dimsAdapted[1];
+        for(size_t j = 0; j < dimsAdapted[1]; j++) {
+            linInd_j = linInd_k+j*dimsAdapted[0];
+            for(size_t i = 0; i < dimsAdapted[0]; i++) {
+                linInd = linInd_j + i;
+                out[linInd] = data.answer_[i][j][k];
+            }
+        }
+    }
+    }
+
+    #ifdef timeMeasure
+        end = clock();
+        z=end - begin;
+        z/=CLOCKS_PER_SEC;
+        printf("Time output conversion:                    %f sec.\n", z);
+    #endif
+
+    /* and return to MATLAB */
+    return out;
+
+}
+
+
+
--- a/eikonalGPU/src/eikonal.h
+++ b/eikonalGPU/src/eikonal.h
@@ -0,0 +1 @@
+extern double* eikonal(double* volume,int* dims,double* startpoint);
--- a/eikonalGPU/src/helper_timer.cpp
+++ b/eikonalGPU/src/helper_timer.cpp
@@ -0,0 +1,303 @@
+/////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+// includes, project
+#include "helper_timer.h"
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#ifdef WIN32
+StopWatchWin::StopWatchWin() :
+start_time(), end_time(),
+diff_time(0.0f), total_time(0.0f),
+running(false), clock_sessions(0), freq(0), freq_set(false)
+{
+  if (!freq_set) {
+    // helper variable
+    LARGE_INTEGER temp;
+
+    // get the tick frequency from the OS
+    QueryPerformanceFrequency((LARGE_INTEGER*)&temp);
+
+    // convert to type in which it is needed
+    freq = ((double)temp.QuadPart) / 1000.0;
+
+    // rememeber query
+    freq_set = true;
+  }
+}
+
+StopWatchWin::~StopWatchWin() { }
+// functions, d
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchWin::start()
+{
+  QueryPerformanceCounter((LARGE_INTEGER*)&start_time);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchWin::stop()
+{
+  QueryPerformanceCounter((LARGE_INTEGER*)&end_time);
+  diff_time = (float)
+    (((double)end_time.QuadPart - (double)start_time.QuadPart) / freq);
+
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does 
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchWin::reset()
+{
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+  if (running)
+    QueryPerformanceCounter((LARGE_INTEGER*)&start_time);
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the 
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+float
+StopWatchWin::getTime()
+{
+  // Return the TOTAL time to date
+  float retval = total_time;
+  if (running)
+  {
+    LARGE_INTEGER temp;
+    QueryPerformanceCounter((LARGE_INTEGER*)&temp);
+    retval += (float)
+      (((double)(temp.QuadPart - start_time.QuadPart)) / freq);
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+float
+StopWatchWin::getAverageTime()
+{
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+#else
+StopWatchLinux::StopWatchLinux() :
+start_time(), diff_time(0.0), total_time(0.0),
+running(false), clock_sessions(0)
+{ }
+
+// Destructor
+StopWatchLinux::~StopWatchLinux() { }
+
+// functions, d
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchLinux::start() {
+  gettimeofday(&start_time, 0);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and increment add to the current diff_time summation
+//! variable. Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchLinux::stop() {
+  diff_time = getDiffTime();
+  total_time += diff_time;
+  running = false;
+  clock_sessions++;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does 
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+void
+StopWatchLinux::reset()
+{
+  diff_time = 0;
+  total_time = 0;
+  clock_sessions = 0;
+  if (running)
+    gettimeofday(&start_time, 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the 
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+float
+StopWatchLinux::getTime()
+{
+  // Return the TOTAL time to date
+  float retval = total_time;
+  if (running) {
+    retval += getDiffTime();
+  }
+
+  return retval;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+float
+StopWatchLinux::getAverageTime()
+{
+  return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+float
+StopWatchLinux::getDiffTime()
+{
+  struct timeval t_time;
+  gettimeofday(&t_time, 0);
+
+  // time difference in milli-seconds
+  return  (float)(1000.0 * (t_time.tv_sec - start_time.tv_sec)
+    + (0.001 * (t_time.tv_usec - start_time.tv_usec)));
+}
+#endif // _WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+bool
+sdkCreateTimer(StopWatchInterface **timer_interface)
+{
+  //printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface);
+#ifdef _WIN32
+  *timer_interface = (StopWatchInterface *)new StopWatchWin();
+#else
+  *timer_interface = (StopWatchInterface *)new StopWatchLinux();
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+bool
+sdkDeleteTimer(StopWatchInterface **timer_interface)
+{
+  //printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) delete *timer_interface;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+bool
+sdkStartTimer(StopWatchInterface **timer_interface)
+{
+  //printf("sdkStartTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) (*timer_interface)->start();
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+bool
+sdkStopTimer(StopWatchInterface **timer_interface)
+{
+  // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) (*timer_interface)->stop();
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+bool
+sdkResetTimer(StopWatchInterface **timer_interface)
+{
+  // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface) (*timer_interface)->reset();
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer 
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+float
+sdkGetAverageTimerValue(StopWatchInterface **timer_interface)
+{
+  //  printf("sdkGetAverageTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    return (*timer_interface)->getAverageTime();
+  else
+    return 0.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+float
+sdkGetTimerValue(StopWatchInterface **timer_interface)
+{
+  // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface);
+  if (*timer_interface)
+    return (*timer_interface)->getTime();
+  else
+    return 0.0f;
+}
--- a/eikonalGPU/src/helper_timer.h
+++ b/eikonalGPU/src/helper_timer.h
@@ -0,0 +1,228 @@
+/////////////////////////////////////////////////////////////////////////////
+//
+// Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+//
+// Please refer to the NVIDIA end user license agreement (EULA) associated
+// with this source code for terms and conditions that govern your use of
+// this software. Any use, reproduction, disclosure, or distribution of
+// this software and related documentation outside the terms of the EULA
+// is strictly prohibited.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+// Helper Timer Functions (this is the inlined version)
+
+#ifndef HELPER_TIMER_H
+#define HELPER_TIMER_H
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include "my_exception.h"
+
+// Definition of the StopWatch Interface, this is used if we don't want to use the CUT functions
+// But rather in a self contained class interface
+class StopWatchInterface
+{
+public:
+  StopWatchInterface() {};
+  virtual ~StopWatchInterface() {};
+
+public:
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been 
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#ifdef WIN32
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface
+{
+public:
+  //! Constructor, default
+  StopWatchWin();
+
+  // Destructor
+  ~StopWatchWin();
+
+public:
+  //! Start time measurement
+  void start();
+
+  //! Stop time measurement
+  void stop();
+
+  //! Reset time counters to zero
+  void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been 
+  //! _stopped_ (ie finished sessions) and the current total time
+  float getAverageTime();
+
+private:
+  // member variables
+
+  //! Start of measurement
+  LARGE_INTEGER  start_time;
+  //! End of measurement
+  LARGE_INTEGER  end_time;
+
+  //! Time difference between the last start and stop
+  float  diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float  total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency
+  double  freq;
+
+  //! flag if the frequency has been set
+  bool  freq_set;
+};
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <ctime>
+#include <sys/time.h>
+
+//! Windows specific implementation of StopWatch
+class StopWatchLinux : public StopWatchInterface
+{
+public:
+  //! Constructor, default
+  StopWatchLinux();
+
+  // Destructor
+  virtual ~StopWatchLinux();
+public:
+  //! Start time measurement
+  void start();
+
+  //! Stop time measurement
+  void stop();
+
+  //! Reset time counters to zero
+  void reset();
+
+  //! Time in msec. after start. If the stop watch is still running (i.e. there
+  //! was no call to stop()) then the elapsed time is returned, otherwise the
+  //! time between the last start() and stop call is returned
+  float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been 
+  //! _stopped_ (ie finished sessions) and the current total time
+  float getAverageTime();
+
+private:
+
+  // helper functions
+
+  //! Get difference between start time and current time
+  float getDiffTime();
+
+private:
+
+  // member variables
+
+  //! Start of measurement
+  struct timeval  start_time;
+
+  //! Time difference between the last start and stop
+  float  diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float  total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+#endif // _WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a time has been created, otherwise false
+//! @param  name of the new timer, 0 if the creation failed
+////////////////////////////////////////////////////////////////////////////////
+bool sdkCreateTimer(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return true if a time has been deleted, otherwise false
+//! @param  name of the timer to delete
+////////////////////////////////////////////////////////////////////////////////
+bool sdkDeleteTimer(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Start the time with name \a name
+//! @param name  name of the timer to start
+////////////////////////////////////////////////////////////////////////////////
+bool sdkStartTimer(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the time with name \a name. Does not reset.
+//! @param name  name of the timer to stop
+////////////////////////////////////////////////////////////////////////////////
+bool sdkStopTimer(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the timer's counter.
+//! @param name  name of the timer to reset.
+////////////////////////////////////////////////////////////////////////////////
+bool sdkResetTimer(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer dividied by the number of completed (stopped) runs the timer 
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param name  name of the timer to return the time of
+////////////////////////////////////////////////////////////////////////////////
+float sdkGetAverageTimerValue(StopWatchInterface **timer_interface);
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param name  name of the timer to obtain the value of.
+////////////////////////////////////////////////////////////////////////////////
+float sdkGetTimerValue(StopWatchInterface **timer_interface);
+
+#endif // HELPER_TIMER_H
--- a/eikonalGPU/src/my_exception.h
+++ b/eikonalGPU/src/my_exception.h
@@ -0,0 +1,151 @@
+/*
+* Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/* CUda UTility Library */
+#ifndef _MY_EXCEPTION_H_
+#define _MY_EXCEPTION_H_
+
+// includes, system
+#include <exception>
+#include <stdexcept>
+#include <iostream>
+#include <stdlib.h>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template<class Std_Exception>
+class Exception : public Std_Exception 
+{
+public:
+
+    //! @brief Static construction interface
+    //! @return Alwayss throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it( const char* file, 
+                          const int line,
+                          const char* detailed = "-" );  
+
+    //! Static construction interface
+    //! @return Alwayss throws ( Located_Exception<Exception>)
+    //! @param file file in which the Exception occurs
+    //! @param line line in which the Exception occurs
+    //! @param detailed details on the code fragment causing the Exception
+    static void throw_it( const char* file, 
+                          const int line,      
+                          const std::string& detailed);  
+
+    //! Destructor
+    virtual ~Exception() throw(); 
+
+private:
+
+    //! Constructor, default (private)
+    Exception(); 
+
+    //! Constructor, standard
+    //! @param str string returned by what()
+    Exception( const std::string& str); 
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template<class Exception_Typ>
+inline void
+handleException( const Exception_Typ& ex) 
+{
+    std::cerr << ex.what() << std::endl;
+
+    exit( EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION( msg) \
+    Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION( msg) \
+    Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION( msg) \
+    Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it( const char* file, const int line, const char* detailed) 
+{
+    std::stringstream s;
+
+    // Quiet heavy-weight but exceptions are not for 
+    // performance / release versions
+    s << "Exception in file '" << file << "' in line " << line << "\n"
+      << "Detailed description: " << detailed << "\n";
+
+    throw Exception( s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param  Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template<class Std_Exception>
+void
+Exception<Std_Exception>::
+throw_it( const char* file, const int line, const std::string& msg) 
+{
+    throw_it( file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception() :
+ Exception("Unknown Exception.\n")
+{ }
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::Exception( const std::string& s) :
+ Std_Exception( s)
+{ }   
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template<class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() { }
+
+// functions, exported
+
+#endif // #ifndef _EXCEPTION_H_
+
				`@@ -0,0 +1 @@`
				`extern double* eikonal(double* volume,int* dims,double* startpoint);`