Commit source

2023-05-18 16:04:27 +08:00
parent 88cf81e4ea
commit c6cd188732
83 changed files with 39921 additions and 0 deletions
--- a/TVALCPU/src/blas_wrap.h
+++ b/TVALCPU/src/blas_wrap.h
@@ -0,0 +1,237 @@
+/*
+ * blas_wrap.h
+ *
+ *  Created on: Oct 7, 2011
+ *      Author: ditlevsen
+ */
+
+#ifndef BLAS_WRAP_H_
+#define BLAS_WRAP_H_
+
+#include <complex>
+#include <mkl_cblas.h>
+#include <mkl_spblas.h>
+
+// overloaded wrapper functions for blas / mkl sparse blas calls...
+inline void cblas_scal(int N, float alpha, float *X, int incX) {
+	cblas_sscal(N, alpha, X, incX);
+}
+
+inline void cblas_scal(int N, double alpha, double *X, int incX) {
+	cblas_dscal(N, alpha, X, incX);
+}
+
+inline void cblas_scal(int N, std::complex<float> alpha, std::complex<float> *X, int incX) {
+	cblas_cscal(N, &alpha, X, incX);
+}
+
+inline void cblas_scal(int N, std::complex<double> alpha, std::complex<double> *X, int incX) {
+	cblas_zscal(N, &alpha, X, incX);
+}
+
+inline void cblas_scal(int N, float alpha, std::complex<float> *X, int incX) {
+	cblas_csscal(N, alpha, X, incX);
+}
+
+inline void cblas_scal(int N, double alpha, std::complex<double> *X, int incX) {
+	cblas_zdscal(N, alpha, X, incX);
+}
+
+
+
+inline void cblas_axpy(int N, float alpha, const float *X, int incX, float *Y, int incY) {
+	cblas_saxpy(N, alpha, X, incX, Y, incY);
+}
+
+inline void cblas_axpy(int N, double alpha, const double *X, int incX, double *Y, int incY) {
+	cblas_daxpy(N, alpha, X, incX, Y, incY);
+}
+
+inline void cblas_axpy(int N, std::complex<float> alpha, const std::complex<float> *X, int incX, std::complex<float> *Y, int incY) {
+	cblas_caxpy(N, &alpha, X, incX, Y, incY);
+}
+
+inline void cblas_axpy(int N, std::complex<double> alpha, const std::complex<double> *X, int incX, std::complex<double> *Y, int incY) {
+	cblas_zaxpy(N, &alpha, X, incX, Y, incY);
+}
+
+
+inline void cblas_copy(int N, const float *X, int incX, float *Y, int incY) {
+	cblas_scopy(N, X, incX, Y, incY);
+}
+
+inline void cblas_copy(int N, const double *X, int incX, double *Y, int incY) {
+	cblas_dcopy(N, X, incX, Y, incY);
+}
+
+inline void cblas_copy(int N, const std::complex<float> *X, int incX, std::complex<float> *Y, int incY) {
+	cblas_ccopy(N,X, incX, Y, incY);
+}
+
+inline void cblas_copy(int N, const std::complex<double> *X, int incX, std::complex<double> *Y, int incY) {
+	cblas_zcopy(N, X, incX, Y, incY);
+}
+
+
+inline float cblas_dotc(int N, const float *X, int incX, const float *Y, int incY) {
+	return cblas_sdot(N, X, incX, Y, incY);
+}
+
+inline double cblas_dotc(int N, const double *X, int incX, const double *Y, int incY) {
+	return cblas_ddot(N, X, incX, Y, incY);
+}
+
+inline std::complex<float> cblas_dotc(int N, const std::complex<float> *X, int incX, const std::complex<float> *Y, int incY) {
+	std::complex<float> result;
+	cblas_cdotc_sub(N, X, incX, Y, incY, &result);
+	return result;
+}
+
+inline std::complex<double> cblas_dotc(int N, const std::complex<double> *X, int incX, const std::complex<double> *Y, int incY) {
+	std::complex<double> result;
+	cblas_zdotc_sub(N, X, incX, Y, incY, &result);
+	return result;
+}
+
+
+
+inline float cblas_nrm2(int N, const float *X, int incX) {
+	return cblas_snrm2(N, X, incX);
+}
+
+inline double cblas_nrm2(int N, const double *X, int incX) {
+	return cblas_dnrm2(N, X, incX);
+}
+
+inline float cblas_nrm2(int N, const std::complex<float> *X, int incX) {
+	return cblas_scnrm2(N, X, incX);
+}
+
+inline double cblas_nrm2(int N, const std::complex<double> *X, int incX) {
+	return cblas_dznrm2(N, X, incX);
+}
+
+
+
+inline void cblas_gemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N,
+		float alpha, const float *A, int lda, const float *X, int incX, float beta, float *Y, int incY) {
+		cblas_sgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+
+inline void cblas_gemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N,
+		double alpha, const double *A, int lda, const double *X, int incX, double beta, double *Y, int incY) {
+	cblas_dgemv(Order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
+}
+
+inline void cblas_gemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N,
+		std::complex<float> alpha, const std::complex<float> *A, int lda, const std::complex<float> *X, int incX,
+		std::complex<float> beta, std::complex<float> *Y, int incY)
+{
+		cblas_cgemv(Order, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY);
+}
+
+inline void cblas_gemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N,
+				std::complex<double> alpha, const std::complex<double> *A, int lda, const std::complex<double> *X, int incX,
+		std::complex<double> beta, std::complex<double> *Y, int incY)
+{
+		cblas_zgemv(Order, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY);
+}
+
+inline void cblas_gemm(const  CBLAS_ORDER Order, const  CBLAS_TRANSPOSE TransA,
+                 const  CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const float alpha, const float *A,
+                 const int lda, const float *B, const int ldb,
+                 const float beta, float *C, const int ldc)
+{
+	cblas_sgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+inline void cblas_gemm(const  CBLAS_ORDER Order, const  CBLAS_TRANSPOSE TransA,
+                 const  CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const double alpha, const double *A,
+                 const int lda, const double *B, const int ldb,
+                 const double beta, double *C, const int ldc)
+{
+	cblas_dgemm(Order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+}
+
+inline void cblas_gemm(const  CBLAS_ORDER Order, const  CBLAS_TRANSPOSE TransA,
+                 const  CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const std::complex<float> alpha, const std::complex<float> *A,
+                 const int lda, const std::complex<float> *B, const int ldb,
+                 const std::complex<float> beta, std::complex<float> *C, const int ldc)
+{
+	cblas_cgemm(Order, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+}
+
+inline void cblas_gemm(const  CBLAS_ORDER Order, const  CBLAS_TRANSPOSE TransA,
+                 const  CBLAS_TRANSPOSE TransB, const int M, const int N,
+                 const int K, const std::complex<double> alpha, const std::complex<double> *A,
+                 const int lda, const std::complex<double> *B, const int ldb,
+                 const std::complex<double> beta, std::complex<double> *C, const int ldc)
+{
+	cblas_zgemm(Order, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc);
+}
+
+
+
+inline void mkl_cscmv(char *transa, MKL_INT *m, MKL_INT *k, float *alpha, char *matdescra, float  *val, MKL_INT *indx,
+		MKL_INT *pntrb, MKL_INT *pntre, float *x, float *beta, float *y)
+{
+	mkl_scscmv(transa, m, k, alpha, matdescra, val, indx,  pntrb, pntre, x, beta, y);
+}
+
+inline void mkl_cscmv(char *transa, MKL_INT *m, MKL_INT *k, double *alpha, char *matdescra, double  *val, MKL_INT *indx,
+		MKL_INT *pntrb, MKL_INT *pntre, double *x, double *beta, double *y)
+{
+	mkl_dcscmv(transa, m, k, alpha, matdescra, val, indx,  pntrb, pntre, x, beta, y);
+}
+
+inline void mkl_cscmv(char *transa, MKL_INT *m, MKL_INT *k, std::complex<float> *alpha, char *matdescra,
+		std::complex<float>  *val, MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, std::complex<float> *x,
+		std::complex<float> *beta, std::complex<float> *y)
+{
+	mkl_ccscmv(transa, m, k, (MKL_Complex8 *)alpha, matdescra, (MKL_Complex8 *)val, indx,  pntrb, pntre,
+			(MKL_Complex8 *)x, (MKL_Complex8 *)beta, (MKL_Complex8 *)y);
+}
+
+inline void mkl_cscmv(char *transa, MKL_INT *m, MKL_INT *k, std::complex<double> *alpha, char *matdescra,
+		std::complex<double>  *val, MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, std::complex<double> *x,
+		std::complex<double> *beta, std::complex<double> *y)
+{
+	mkl_zcscmv(transa, m, k, (MKL_Complex16 *)alpha, matdescra, (MKL_Complex16 *)val, indx,  pntrb, pntre,
+			(MKL_Complex16 *)x, (MKL_Complex16 *)beta, (MKL_Complex16 *)y);
+}
+
+
+
+void mkl_cscmm(char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, float *alpha, char *matdescra, float  *val,
+		MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, float *b, MKL_INT *ldb, float *beta, float *c, MKL_INT *ldc)
+{
+	mkl_scscmm(transa, m, n, k, alpha, matdescra, val, indx,  pntrb, pntre, b, ldb, beta, c, ldc);
+}
+
+void mkl_cscmm(char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, double *alpha, char *matdescra,
+		double *val, MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, double *b, MKL_INT *ldb, double *beta, double *c,
+		MKL_INT *ldc)
+{
+	mkl_dcscmm(transa, m, n, k, alpha, matdescra, val, indx,  pntrb, pntre, b, ldb, beta, c, ldc);
+}
+
+void mkl_cscmm(char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, std::complex<float> *alpha, char *matdescra,
+		std::complex<float>  *val, MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, std::complex<float> *b, MKL_INT *ldb,
+		std::complex<float> *beta, std::complex<float> *c, MKL_INT *ldc)
+{
+	mkl_ccscmm(transa, m, n, k, (MKL_Complex8 *)alpha, matdescra, (MKL_Complex8 *)val, indx,  pntrb, pntre,
+			(MKL_Complex8 *)b, ldb, (MKL_Complex8 *)beta, (MKL_Complex8 *)c, ldc);
+}
+
+void mkl_cscmm(char *transa, MKL_INT *m, MKL_INT *n, MKL_INT *k, std::complex<double> *alpha, char *matdescra,
+		std::complex<double>  *val, MKL_INT *indx,  MKL_INT *pntrb, MKL_INT *pntre, std::complex<double> *b, MKL_INT *ldb,
+		std::complex<double> *beta, std::complex<double> *c, MKL_INT *ldc)
+{
+	mkl_zcscmm(transa, m, n, k, (MKL_Complex16 *)alpha, matdescra, (MKL_Complex16 *)val, indx,  pntrb, pntre,
+			(MKL_Complex16 *)b, ldb, (MKL_Complex16 *)beta, (MKL_Complex16 *)c, ldc);
+}
+
+#endif /* BLAS_WRAP_H_ */
--- a/TVALCPU/src/container.h
+++ b/TVALCPU/src/container.h
@@ -0,0 +1,291 @@
+/*
+ * Classes for handling matrices and vectors:
+ *
+ * Class mat<Type>:         dense matrices/vectors
+ * Class sparse_mat<Type>:  sparse matrices
+ * Class geometry:         data for dynamic calculation of the measurement matrix
+ *
+ */
+
+#ifndef CONTAINER_H_
+#define CONTAINER_H_
+
+
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <exception>
+
+// enumerations...
+enum T_alloc {allocated_in_constructor, preallocated_buffer};
+enum T_mat_format {mat_col_major, mat_row_major};
+enum T_sparse_mat_format {sparse_mat_csc, sparse_mat_csr};
+
+// exceptions...
+class illegal_mat_assignment: public std::exception
+{
+  virtual const char* what() const throw()
+  {
+    return "Illegale Zuweisung von mat-Objekten.";
+  }
+};
+
+class illegal_sparse_mat_assignment: public std::exception
+{
+  virtual const char* what() const throw()
+  {
+    return "Illegale Zuweisung von sparse_mat-Objekten.";
+  }
+};
+
+//---------------------------------------
+// class mat...
+//---------------------------------------
+
+template <class Type>
+class mat {
+private:
+	Type *pbuf;
+	T_alloc alloc_info;
+public:
+	const int dim_y;
+	const int dim_x;
+	const int dim_z;
+	const int len;
+	const T_mat_format format;
+
+	mat(int l_y, int l_x, int l_z, bool init=true, T_mat_format storage=mat_row_major);
+	mat(int num_elements, bool init=true);
+	mat(int l_y, int l_x, int l_z, Type *buffer, T_mat_format storage=mat_row_major);
+	mat(int num_elements, Type *buffer);
+	mat(const mat &m);
+	~mat() { if(pbuf && (alloc_info == allocated_in_constructor)) delete[] pbuf; };
+
+	Type *data() { return pbuf; }
+	const Type *data() const { return pbuf; }
+
+	Type &operator[](int i) { return pbuf[i]; }
+	const Type &operator[](int i) const { return pbuf[i]; }
+
+	mat &operator=(const mat<Type> &m) throw(illegal_mat_assignment);
+
+	void randomFill(Type scl);
+};
+
+
+//---------------------------------------
+// class sparse_mat...
+//---------------------------------------
+
+template<class Type>
+class sparse_mat {
+private:
+	int *p_ptr;
+	int *p_ind;
+	Type *p_val;
+	T_alloc alloc_info;
+public:
+	const int dim_y;
+	const int dim_x;
+	const int nnz;
+	const T_sparse_mat_format format;
+
+	sparse_mat(int l_y, int l_x, int n_nonzero, T_sparse_mat_format storage_format=sparse_mat_csc, bool init=true);
+	sparse_mat(int l_y, int l_x, int n_nonzero, Type *buff_val, int *buff_ptr, int *buff_ind,
+			T_sparse_mat_format storage_format=sparse_mat_csc);
+	sparse_mat(const sparse_mat &m);
+	~sparse_mat();
+
+	Type *val() { return p_val; }
+	const Type *val() const { return p_val; }
+
+	int *ptr() { return p_ptr; }
+	const int *ptr() const { return p_ptr; }
+
+	int *ind() { return p_ind; }
+	const int *ind() const { return p_ind; }
+
+	sparse_mat<Type> &operator=(const mat<Type> &m) throw(illegal_sparse_mat_assignment);
+};
+
+//---------------------------------------
+// Struct geomety
+//---------------------------------------
+
+struct geometry {
+	int *x_emitters;
+	int *y_emitters;
+	int *z_emitters;
+	int *x_receivers;
+	int *y_receivers;
+	int *z_receivers;
+	int num_emitters;
+	int num_receivers;
+	int rv_x;
+	int rv_y;
+	int rv_z;
+	float scale_factor;
+};
+
+// ------------------------------------------------------------------------
+// member functions, etc. class mat
+// ------------------------------------------------------------------------
+
+template<class Type>
+mat<Type>::mat(int l_y, int l_x, int l_z, bool init, T_mat_format storage): pbuf(NULL), alloc_info(allocated_in_constructor),
+	dim_y(l_y), dim_x(l_x), dim_z(l_z), len(l_y*l_x*l_z), format(storage) {
+	if(len > 0) {
+		pbuf = new Type[len];
+		if(init)
+			memset(pbuf, 0, len*sizeof(Type));
+	}
+};
+
+template<class Type>
+mat<Type>::mat(int num_elements, bool init): pbuf(NULL), alloc_info(allocated_in_constructor), dim_y(num_elements), dim_x(1),
+	dim_z(1), len(num_elements), format(mat_row_major) {
+	if(len > 0) {
+		pbuf = new Type[len];
+		if(init)
+			memset(pbuf, 0, len*sizeof(Type));
+	}
+};
+
+template<class Type>
+mat<Type>::mat(int l_y, int l_x, int l_z, Type *buffer, T_mat_format storage): pbuf(buffer), dim_y(l_y), dim_x(l_x),
+	dim_z(l_z), len(l_y*l_x*l_z), alloc_info(preallocated_buffer), format(storage) {};
+
+template<class Type>
+mat<Type>::mat(int num_elements, Type *buffer): pbuf(buffer), dim_y(num_elements), dim_x(1),
+	dim_z(1), len(num_elements), alloc_info(preallocated_buffer), format(mat_row_major) {};
+
+template<class Type>
+mat<Type>::mat(const mat &m): pbuf(NULL), alloc_info(allocated_in_constructor), dim_y(m.dim_y), dim_x(m.dim_x),
+	dim_z(m.dim_z), len(m.len), format(m.format) {
+	if(len > 0) {
+		pbuf = new Type[len];
+		memcpy(pbuf, m.pbuf, len*sizeof(Type));
+	}
+};
+
+template<class Type>
+mat<Type> &mat<Type>::operator=(const mat<Type> &m) throw(illegal_mat_assignment) {
+	if(m.len == len)
+		memcpy(pbuf, m.pbuf, len*sizeof(Type));
+	else
+		throw illegal_mat_assignment();
+	return *this;
+}
+
+template<class Type>
+void mat<Type>::randomFill(Type scl) {
+	for(int i=0; i<len; i++)
+		pbuf[i] = ((Type)rand() / RAND_MAX - 0.5)/scl;
+}
+
+template<class Type>
+std::ostream &operator<<(std::ostream &stream, const mat<Type> &m) {
+	for(int z=0; z < m.dim_z; z++) {
+		stream << "\nz=" << z+1 << ":\n\n";
+		for(int i=0; i<m.dim_y; i++) {
+			for(int j=0; j<m.dim_x; j++) {
+				if(m.format == mat_row_major)
+					stream << m[z*m.dim_x*m.dim_y + i*m.dim_x + j] << " ";
+				else
+					stream << m[z*m.dim_x*m.dim_y + i + j*m.dim_y] << " ";
+			}
+			stream << "\n";
+		}
+		return stream;
+	}
+}
+
+
+// ------------------------------------------------------------------------
+// member functions, etc. class sparse_mat
+// ------------------------------------------------------------------------
+
+template<class Type>
+sparse_mat<Type>::sparse_mat(int l_y, int l_x, int n_nonzero, T_sparse_mat_format storage_format, bool init):
+	p_val(NULL), p_ptr(NULL), p_ind(NULL), dim_y(l_y), dim_x(l_x), nnz(n_nonzero), format(storage_format),
+	alloc_info(allocated_in_constructor)
+{
+	if(nnz > 0) {
+		int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
+
+		p_val = new Type[nnz];
+		p_ptr = new int[len_ptr];
+		p_ind = new int[nnz];
+		if(init) {
+			memset(p_val, 0, nnz*sizeof(Type));
+			memset(p_ptr, 0, (len_ptr)*sizeof(int));
+			memset(p_ind, 0, nnz*sizeof(int));
+		}
+	}
+};
+
+template<class Type>
+sparse_mat<Type>::sparse_mat(int l_y, int l_x, int n_nonzero, Type *buff_val, int *buff_ptr, int *buff_ind,
+		T_sparse_mat_format storage_format): p_val(buff_val), p_ptr(buff_ptr), p_ind(buff_ind), dim_y(l_y), dim_x(l_x),
+		nnz(n_nonzero), format(storage_format), alloc_info(preallocated_buffer) {}
+
+
+template<class Type>
+sparse_mat<Type>::sparse_mat(const sparse_mat &m): p_val(NULL), p_ptr(NULL), p_ind(NULL), dim_y(m.dim_y),
+	dim_x(m.dim_x), nnz(m.nnz), format(m.format), alloc_info(allocated_in_constructor)
+{
+	if(nnz > 0) {
+		int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
+
+		p_val = new Type[nnz];
+		memcpy(p_val, m.p_val, nnz*sizeof(Type));
+		p_ptr = new int[len_ptr];
+		memcpy(p_ptr, m.p_ptr, (len_ptr)*sizeof(int));
+		p_ind = new int[nnz];
+		memcpy(p_ind, m.p_ind, nnz*sizeof(int));
+	}
+};
+
+template<class Type>
+sparse_mat<Type>::~sparse_mat() {
+	if(alloc_info == allocated_in_constructor) {
+		if(p_val) delete[] p_val;
+		if(p_ptr) delete[] p_ptr;
+		if(p_ind) delete[] p_ind;
+	}
+}
+
+template<class Type>
+sparse_mat<Type> &sparse_mat<Type>::operator=(const mat<Type> &m) throw(illegal_sparse_mat_assignment){
+
+	if(m.nnz == nnz && m.dim_y == dim_y && m.dim_x == dim_x && m.format == format) {
+		int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
+
+		memcpy(p_val, m.p_val, nnz*sizeof(Type));
+		memcpy(p_ptr, m.p_ptr, (len_ptr)*sizeof(int));
+		memcpy(p_ind, m.p_ind, nnz*sizeof(int));
+	} else {
+		throw illegal_sparse_mat_assignment();
+	}
+
+	return *this;
+}
+
+// output operator ...
+template<class Type>
+std::ostream &operator<<(std::ostream &stream, const sparse_mat<Type> &m) {
+	mat<Type> tmp(m.dim_y, m.dim_x);
+	if(m.format == sparse_mat_csc) {
+		for(int c=0; c < m.dim_x; c++)
+			for(int i = m.ptr()[c]; i < m.ptr()[c+1]; i++)
+				tmp[c*m.dim_y + m.ind()[i]] = m.val()[i];
+	} else {
+		for(int r=0; r < m.dim_y; r++)
+			for(int i = m.ptr()[r]; i < m.ptr()[r+1]; i++)
+				tmp[m.ind()[i]*m.dim_y + r] = m.val()[i];
+	}
+	stream << tmp;
+	return stream;
+}
+
+#endif /* CONTAINER_H_ */
--- a/TVALCPU/src/mat_mult.h
+++ b/TVALCPU/src/mat_mult.h
@@ -0,0 +1,260 @@
+/*
+ * mat_mult.h
+ *
+ *  Created on: Oct 7, 2011
+ *      Author: ditlevsen
+ */
+
+#ifndef MAT_MULT_H_
+#define MAT_MULT_H_
+
+//#include <omp.h>
+#include <cmath>
+#include <complex>
+#include "container.h"
+#include "blas_wrap.h"
+
+// overloaded function for matrix-vector multiplication (dense / sparse / dynamically calculated)
+
+template <class Type>
+// y = Op(A)*x
+// A Row Major, x, y column Major!
+inline void gemv(CBLAS_TRANSPOSE TransA, const mat<Type> &A, const mat<Type> &x, mat<Type> &y) {
+		cblas_gemv(CblasRowMajor, TransA, A.dim_y, A.dim_x, (Type)1, A.data(), A.dim_x, x.data(), 1, (Type)0, y.data(), 1);
+}
+
+template <class Type>
+inline void gemv(CBLAS_TRANSPOSE TransA, const sparse_mat<Type> &A, const mat<Type> &x, mat<Type> &y) {
+	char transpose;
+	if(TransA == CblasTrans)
+		transpose = 't';
+	else if(TransA == CblasConjTrans)
+		transpose = 'c';
+	else
+		transpose = 'n';
+	char *matdescra = (char *)"GLLC";
+	Type alpha = 1;
+	Type beta = 0;
+
+	mkl_cscmv(&transpose, (int *)&A.dim_y, (int *)&A.dim_x, &alpha, matdescra, (Type *)A.val(), (int *)A.ind(),
+			(int *)A.ptr(), (int *)(A.ptr() + 1), (Type *)x.data(), &beta, y.data());
+}
+
+// functions for the dynamic calculation of the measurement matrix...
+
+template <class Type>
+inline void mult_element(CBLAS_TRANSPOSE TransA, int ray, int x_pixel, int y_pixel, int z_pixel, int rv_x, int rv_y, float scale_factor, const mat<Type> &x, mat<Type> &y) {
+	int lin_index = x_pixel + y_pixel*rv_x + z_pixel*rv_x*rv_y;
+	if(TransA == CblasNoTrans) {
+		y[ray] += x[lin_index] * (Type) scale_factor;
+	} else {
+		y[lin_index] += x[ray] * (Type) scale_factor;
+	}
+}
+
+template <class Type>
+void gemv(CBLAS_TRANSPOSE TransA, const geometry &A, const mat<Type> &x, mat<Type> &y) {
+	memset(y.data(), 0, y.len*sizeof(Type));
+
+	mat<Type> **vecs;
+	int numthreads;
+	#pragma omp parallel
+	{
+		int dx, dy, dz, x_inc, mx, y_inc, my, z_inc, mz, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
+
+		int threadnum = omp_get_thread_num();
+
+		#pragma omp single
+		{
+			numthreads = omp_get_num_threads();
+			vecs = new mat<Type>* [numthreads];
+		}
+
+		if(threadnum==0)
+			vecs[0] = &y;
+		else
+			vecs[threadnum] = new mat<Type>(y.len);
+
+		#pragma omp for
+		for(int emitter=0; emitter < A.num_emitters; emitter++) {
+			for(int receiver=0; receiver < A.num_receivers; receiver++) {
+
+				ray = emitter*A.num_receivers + receiver;
+
+				// trace path using Bresenheimer algorithm...
+
+				dx = A.x_receivers[receiver] - A.x_emitters[emitter];
+				dy = A.y_receivers[receiver] - A.y_emitters[emitter];
+				dz = A.z_receivers[receiver] - A.z_emitters[emitter];
+
+				x_inc = (dx < 0) ? -1 : 1;
+				mx = abs(dx);
+
+				y_inc = (dy < 0) ? -1 : 1;
+				my = abs(dy);
+
+				z_inc = (dz < 0) ? -1 : 1;
+				mz = abs(dz);
+
+				x_pixel = A.x_emitters[emitter];
+				y_pixel = A.y_emitters[emitter];
+				z_pixel = A.z_emitters[emitter];
+
+				mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+
+				if (mx >= my && mx >= mz) {
+					err_1 = 2*my - mx;
+					err_2 = 2*mz - mx;
+					for(int j = 1; j < mx+1; j++) {
+						if (err_1 > 0) {
+							y_pixel += y_inc;
+							err_1 -= 2*mx;
+						}
+						if (err_2 > 0) {
+							z_pixel += z_inc;
+							err_2 -= 2*mx;
+						}
+						err_1 += 2*my;
+						err_2 += 2*mz;
+						x_pixel += x_inc;
+						mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+					}
+				} else if(my >= mx && my >= mz) {
+					err_1 = 2*mx - my;
+					err_2 = 2*mz - my;
+					for (int j = 1; j < my+1; j++) {
+						if (err_1 > 0) {
+							x_pixel += x_inc;
+							err_1 -= 2*my;
+						}
+						if (err_2 > 0) {
+							z_pixel += z_inc;
+							err_2 -= 2*my;
+						}
+						err_1 += 2*mx;
+						err_2 += 2*mz;
+						y_pixel += y_inc;
+						mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+					}
+				} else {
+					err_1 = 2*mx - mz;
+					err_2 = 2*my - mz;
+					for (int j = 1; j < mz+1; j++) {
+						if (err_1 > 0) {
+							x_pixel += x_inc;
+							err_1 -= 2*mz;
+						}
+						if (err_2 > 0) {
+							y_pixel += y_inc;
+							err_2 -= 2*mz;
+						}
+						err_1 += 2*mx;
+						err_2 += 2*my;
+						z_pixel += z_inc;
+						mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+					}
+				}
+			}
+		}
+	}
+	for(int i=1; i<numthreads; i++) {
+		cblas_axpy(y.len, 1, vecs[i]->data(), 1, y.data(), 1);
+		delete vecs[i];
+	}
+	delete vecs;
+}
+
+/*
+// Version mit nur einem Loop für alle Richtungen (-> "Prototyp" für GraKa-Version), auf CPU etwas langsamer
+template <class Type>
+void gemv(CBLAS_TRANSPOSE TransA, const geometry &A, const mat<Type> &x, mat<Type> &y) {
+	memset(y.data(), 0, y.len*sizeof(Type));
+
+	mat<Type> **vecs;
+	int numthreads;
+	#pragma omp parallel
+	{
+		int dx, dy, dz, x_inc, mx, y_inc, my, z_inc, mz, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
+		int *m1, *m2, *m3, *inc1, *inc2, *inc3, *pixel1, *pixel2, *pixel3;
+
+		int threadnum = omp_get_thread_num();
+
+		#pragma omp single
+		{
+			numthreads = omp_get_num_threads();
+			vecs = new mat<Type>* [numthreads];
+		}
+
+		if(threadnum==0)
+			vecs[0] = &y;
+		else
+			vecs[threadnum] = new mat<Type>(y.len);
+
+		#pragma omp for
+		for(int emitter=0; emitter < A.num_emitters; emitter++) {
+			for(int receiver=0; receiver < A.num_receivers; receiver++) {
+
+				ray = emitter*A.num_receivers + receiver;
+
+				// trace path using Bresenheimer algorithm...
+
+				dx = A.x_receivers[receiver] - A.x_emitters[emitter];
+				dy = A.y_receivers[receiver] - A.y_emitters[emitter];
+				dz = A.z_receivers[receiver] - A.z_emitters[emitter];
+
+				x_inc = (dx < 0) ? -1 : 1;
+				mx = abs(dx);
+
+				y_inc = (dy < 0) ? -1 : 1;
+				my = abs(dy);
+
+				z_inc = (dz < 0) ? -1 : 1;
+				mz = abs(dz);
+
+				x_pixel = A.x_emitters[emitter];
+				y_pixel = A.y_emitters[emitter];
+				z_pixel = A.z_emitters[emitter];
+
+				mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+
+				if (mx >= my && mx >= mz) {
+					m1 = &mx; pixel1 = &x_pixel; inc1 = &x_inc;
+					m2 = &my; pixel2 = &y_pixel; inc2 = &y_inc;
+					m3 = &mz; pixel3 = &z_pixel; inc3 = &z_inc;
+				} else if(my >= mx && my >= mz) {
+					m1 = &my; pixel1 = &y_pixel; inc1 = &y_inc;
+					m2 = &mx; pixel2 = &x_pixel; inc2 = &x_inc;
+					m3 = &mz; pixel3 = &z_pixel; inc3 = &z_inc;
+				} else {
+					m1 = &mz; pixel1 = &z_pixel; inc1 = &z_inc;
+					m2 = &mx; pixel2 = &x_pixel; inc2 = &x_inc;
+					m3 = &my; pixel3 = &y_pixel; inc3 = &y_inc;
+				}
+
+				err_1 = 2 * *m2 - *m1;
+				err_2 = 2 * *m3 - *m1;
+				for(int j = 1; j < *m1+1; j++) {
+					if (err_1 > 0) {
+						*pixel2 += *inc2;
+						err_1 -= 2 * *m1;
+					}
+					if (err_2 > 0) {
+						*pixel3 += *inc3;
+						err_2 -= 2 * *m1;
+					}
+					err_1 += 2 * *m2;
+					err_2 += 2 * *m3;
+					*pixel1 += *inc1;
+					mult_element(TransA, ray, x_pixel, y_pixel, z_pixel, A.rv_x, A.rv_y, A.scale_factor, x, *vecs[threadnum]);
+				}
+			}
+		}
+	}
+	for(int i=1; i<numthreads; i++) {
+		cblas_axpy(y.len, 1, vecs[i]->data(), 1, y.data(), 1);
+		delete vecs[i];
+	}
+	delete vecs;
+}*/
+
+#endif /* MAT_MULT_H_ */
--- a/TVALCPU/src/timestamp.cpp
+++ b/TVALCPU/src/timestamp.cpp
@@ -0,0 +1,21 @@
+/*
+ * utility.cpp
+ *
+ *  Created on: Mar 22, 2011
+ *      Author: ditlevsen
+ */
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <cstdlib>
+
+// get time stamp in seconds using Native Linux Time Measurement
+double get_timestamp() {
+	struct timeval tp;
+	double sec, usec;
+	gettimeofday(&tp, NULL);
+	sec = static_cast<double>( tp.tv_sec );
+	usec = static_cast<double>( tp.tv_usec )/1E6;
+	return sec + usec;
+}
+
--- a/TVALCPU/src/timestamp.h
+++ b/TVALCPU/src/timestamp.h
@@ -0,0 +1,7 @@
+#ifndef TIMESTAMP_H_
+#define TIMESTAMP_H_
+
+// returns a time stamp in seconds, using native Linux time measurement
+extern double get_timestamp();
+
+#endif
--- a/TVALCPU/src/tval3.cpp
+++ b/TVALCPU/src/tval3.cpp
@@ -0,0 +1,861 @@
+/*
+ * tval3.cpp
+ *
+ *  Created on: Mar 21, 2011d
+ *      Author: ditlevsen
+ */
+
+#include <cstdlib>
+#include <cmath>
+#include <omp.h>
+#include <complex>
+#include <string>
+#include "tval3_types.h"
+#include "utility.h"
+#include "timestamp.h"
+#include "blas_wrap.h"
+#include "mat_mult.h"
+#include <mkl.h>
+#include <fstream>
+#include <cfloat>
+
+using namespace std;
+
+//#define PROFILING
+
+#ifdef PROFILING
+double t_mult, t_d, t_dt;
+bool rec_time;
+#endif
+
+// this struct holds various values, accessed by the functions get_g(), update_g(), update_W() and update_mlp()
+// the member names correspond to the variable names in the MATLAB-version
+template<class T_comp, class T_scal>
+struct tval3_data { // parameters and intermediate results...
+	// dimensions...
+	int p; // y-dimension of reconstruction volume
+	int q; // x-dimension of reconstruction volume
+	int r; // z-dimension of reconstruction volume
+	int n; // number of pixels (n = p*q*r)
+	int m; // number of measurements
+
+	// penalty parameters...
+	T_scal mu;
+	T_scal beta;
+	T_scal muDbeta; // mu/beta
+
+	// multiplier...
+	mat<T_comp> delta;
+	mat<T_comp> sigmaX;
+	mat<T_comp> sigmaY;
+	mat<T_comp> sigmaZ;
+
+	// lagrangian function and sub terms...
+	T_comp f;
+	T_scal lam1;
+	T_scal lam2;
+	T_scal lam3;
+	T_comp lam4;
+	T_comp lam5;
+
+	mat<T_comp> Up;
+	mat<T_comp> dU; // U-Up, after steepest descent
+	mat<T_comp> uup; // U-Up, after "backtracking"
+	mat<T_comp> Ux; // gradients in x-direction
+	mat<T_comp> Uxp;
+	mat<T_comp> dUx;
+	mat<T_comp> Uy; // gradients in y-direction
+	mat<T_comp> Uyp;
+	mat<T_comp> dUy;
+	mat<T_comp> Uz; // gradients in y-direction
+	mat<T_comp> Uzp;
+	mat<T_comp> dUz;
+	mat<T_comp> Wx;
+	mat<T_comp> Wy;
+	mat<T_comp> Wz;
+	mat<T_comp> Atb; // A'*b
+	mat<T_comp> Au; // A*u
+	mat<T_comp> Aup;
+	mat<T_comp> dAu;
+	mat<T_comp> d; // gradient of objective function (2.28)
+	mat<T_comp> g; // sub term of (2.28)
+	mat<T_comp> gp;
+	mat<T_comp> dg;
+	mat<T_comp> g2; // sub term of (2.28)
+	mat<T_comp> g2p;
+	mat<T_comp> dg2;
+	tval3_data<T_comp, T_scal>(const mat<T_comp> &U, const mat<T_comp> &b, T_scal mu0, T_scal beta0):
+		p(U.dim_y), q(U.dim_x), r(U.dim_z), n(U.len), m(b.len), mu(mu0), beta(beta0), delta(b.len), sigmaX(U.dim_y, U.dim_x, U.dim_z),
+		sigmaY(U.dim_y, U.dim_x, U.dim_z), sigmaZ(U.dim_y, U.dim_x, U.dim_z), Up(U.dim_y, U.dim_x, U.dim_z), dU(U.dim_y, U.dim_x, U.dim_z),
+		uup(U.len), Ux(U.dim_y, U.dim_x, U.dim_z), Uxp(U.dim_y, U.dim_x, U.dim_z), dUx(U.dim_y, U.dim_x, U.dim_z), Uy(U.dim_y, U.dim_x, U.dim_z),
+		Uyp(U.dim_y, U.dim_x, U.dim_z), dUy(U.dim_y, U.dim_x, U.dim_z), Uz(U.dim_y, U.dim_x, U.dim_z), Uzp(U.dim_y, U.dim_x, U.dim_z),
+		dUz(U.dim_y, U.dim_x, U.dim_z), Wx(U.dim_y, U.dim_x, U.dim_z), Wy(U.dim_y, U.dim_x, U.dim_z), Wz(U.dim_y, U.dim_x, U.dim_z), Atb(U.len),
+		Au(b.len), Aup(b.len), dAu(b.len), d(U.dim_y, U.dim_x, U.dim_z), g(U.len), gp(U.len), dg(U.len), g2(U.len), g2p(U.len), dg2(U.len) {}
+};
+
+// z <- x + alpha*y
+// N: number of elements
+template<class T_mat, class T_scal>
+void vec_add(const int N, const int stride, const T_mat *x, const T_scal alpha, const T_mat *y, T_mat *z) {
+	#pragma vector always
+	#pragma parallel always
+	for(int i=0; i<N; i++)
+		z[i*stride] = x[i*stride] + alpha*y[i*stride];
+}
+
+template<class T_mat, class T_scal>
+void vec_add(const int N, const int stride, const std::complex<T_mat> *x, const T_scal alpha, const std::complex<T_mat> *y, std::complex<T_mat> *z) {
+	T_mat *data_x = (T_mat *)x;
+	T_mat *data_y = (T_mat *)y;
+	T_mat *data_z = (T_mat *)z;
+	#pragma vector always
+	#pragma parallel always
+	for(int i=0; i<N; i++) {
+		data_z[2*i*stride] = data_x[2*i*stride] + alpha*data_y[2*i*stride];
+		data_z[2*i*stride + 1] = data_x[2*i*stride + 1] + alpha*data_y[2*i*stride + 1];
+	}
+}
+
+// Scales the measurement vector. Returns the scale factor.
+template<class T_comp, class T_scal>
+T_scal scaleb(mat<T_comp> &b) {
+	T_scal scl=1;
+	if(b.len > 0) {
+		T_scal threshold1 = 0.5;
+		T_scal threshold2 = 1.5;
+		scl=1;
+		T_comp val;
+		T_scal val_abs;
+		T_comp bmin = b[0];
+		T_comp bmax = bmin;
+		T_scal bmin_abs = abs(bmin);
+		T_scal bmax_abs = bmin_abs;
+		for(int i=0; i < b.len; i++) {
+			val = b[i];
+			val_abs = abs(val);
+			if(val_abs < bmin_abs) {
+				bmin = val;
+				bmin_abs = val_abs;
+			}
+			if(val_abs > bmax_abs) {
+				bmax = val;
+				bmax_abs = val_abs;
+			}
+		}
+		T_scal b_dif = abs(bmax - bmin);
+		if(b_dif < threshold1) {
+			scl = threshold1/b_dif;
+			cblas_scal(b.len, scl, b.data(), 1);
+		} else if(b_dif > threshold2) {
+			scl = threshold2/b_dif;
+			cblas_scal(b.len, scl, b.data(), 1);
+		}
+	}
+	return scl;
+}
+
+template<class T_comp, class T_scal>
+void D(mat<T_comp> &Ux, mat<T_comp> &Uy, mat<T_comp> &Uz, const mat<T_comp> &U) {
+#ifdef PROFILING
+	double start = get_timestamp();
+#endif
+	vec_add(U.dim_x*U.dim_y*U.dim_z - 1, 1, U.data()+1, -1, U.data(), Ux.data());
+	vec_add(U.dim_y*U.dim_z, U.dim_x, U.data(), -1, U.data()+U.dim_x-1, Ux.data()+U.dim_x-1);
+
+	vec_add((U.dim_y*U.dim_z-1)*U.dim_x, 1, U.data() + U.dim_x, -1, U.data(), Uy.data());
+	for(int z = 0; z < U.dim_z; z++)
+		vec_add(U.dim_x, 1, U.data() + z*U.dim_x*U.dim_y, -1, U.data() + (U.dim_y-1)*U.dim_x + z*U.dim_x*U.dim_y,
+				Uy.data() + (U.dim_y-1)*U.dim_x + z*U.dim_x*U.dim_y);
+
+	vec_add(U.dim_x*U.dim_y*(U.dim_z - 1), 1, U.data() + U.dim_x*U.dim_y, -1, U.data(), Uz.data());
+	vec_add(U.dim_x*U.dim_y, 1, U.data(), -1, U.data() + (U.dim_z - 1) * U.dim_x * U.dim_y,
+			Uz.data()  + (U.dim_z - 1) * U.dim_x * U.dim_y);
+#ifdef PROFILING
+	double stop = get_timestamp();
+	if(rec_time) t_d += stop - start;
+#endif
+}
+
+template<class T_comp, class T_scal>
+void Dt(mat<T_comp> &res, const mat<T_comp> &X, const mat<T_comp> &Y,  const mat<T_comp> &Z) {
+#ifdef PROFILING
+	double start = get_timestamp();
+#endif
+	vec_add(X.len-1, 1, X.data(), -1, X.data()+1, res.data()+1);
+	vec_add(X.dim_y * X.dim_z, X.dim_x, X.data()+X.dim_x-1, -1, X.data(), res.data());
+
+	cblas_axpy((Y.dim_y*Y.dim_z-1)*Y.dim_x, (T_comp)1, Y.data(), 1, res.data()+Y.dim_x, 1);
+	cblas_axpy((Y.dim_y*Y.dim_z-1)*Y.dim_x, (T_comp)-1, Y.data()+Y.dim_x, 1, res.data()+Y.dim_x, 1);
+	for(int z = 0; z < Y.dim_z; z++) {
+		cblas_axpy(Y.dim_x, (T_comp)1, Y.data() + (Y.dim_y-1)*Y.dim_x + z*Y.dim_x*Y.dim_y, 1, res.data() + z*Y.dim_x*Y.dim_y, 1);
+		cblas_axpy(Y.dim_x, (T_comp)-1, Y.data() + z*Y.dim_x*Y.dim_y, 1, res.data() + z*Y.dim_x*Y.dim_y, 1);
+	}
+
+	cblas_axpy(Z.dim_y*Z.dim_x*(Z.dim_z-1), (T_comp)1, Z.data(), 1, res.data()+Z.dim_y*Z.dim_x, 1);
+	cblas_axpy(Z.dim_y*Z.dim_x*(Z.dim_z-1), (T_comp)-1, Z.data()+Z.dim_y*Z.dim_x, 1, res.data()+Z.dim_y*Z.dim_x, 1);
+	cblas_axpy(Z.dim_y*Z.dim_x, (T_comp)1, Z.data() + (Z.dim_z-1)*Z.dim_x*Z.dim_y, 1, res.data(), 1);
+	cblas_axpy(Z.dim_y*Z.dim_x, (T_comp)-1, Z.data(), 1, res.data(), 1);
+#ifdef PROFILING
+	double stop = get_timestamp();
+	if(rec_time) t_dt += stop - start;
+#endif
+}
+
+// overloaded function get_g_mult calculates:
+// Au = A*U...
+// g = A'*Au...
+
+// standrad version (parameter submat_mb not used)
+template<class T_comp, class T_scal, class T_A>
+void get_g_mult(tval3_data<T_comp, T_scal> &data, const T_A &A, const mat<T_comp> &U, int submat_mb) {
+	gemv(CblasNoTrans, A, U, data.Au);
+	gemv(CblasConjTrans, A, data.Au, data.g);
+}
+
+// special version for dense matrices (cache blocking for single layer reconstructions....)
+// submat_mb: size of submatrices in MB
+template<class T_comp, class T_scal>
+void get_g_mult(tval3_data<T_comp, T_scal> &data, const mat<T_comp> &A, const mat<T_comp> &U, int submat_mb) {
+
+		int submat_elements = submat_mb*1024*1024 / sizeof(T_comp);
+
+		mat<T_comp> g_tmp(data.n);
+
+		int submat_dim_y = max(submat_elements / data.n, 1);
+
+		int n_y = (data.m + submat_dim_y - 1) / submat_dim_y;
+
+		int r;
+		memset(data.g.data(), 0, data.n * sizeof(T_comp));
+		for(int y=0; y < n_y; y++) {
+			r = min(submat_dim_y, data.m - y * submat_dim_y);
+
+			cblas_gemv(CblasRowMajor, CblasNoTrans, r, data.n, 1, A.data() + y * submat_dim_y * data.n, data.n, U.data(), 1, 0, data.Au.data() + y * submat_dim_y, 1);
+
+			cblas_gemv(CblasRowMajor, CblasConjTrans, r, data.n, 1, A.data() + y * submat_dim_y * data.n, data.n, data.Au.data() + y * submat_dim_y, 1, 0, g_tmp.data(), 1);
+
+			cblas_axpy(data.n, 1, g_tmp.data(), 1, data.g.data(), 1);
+		}
+}
+
+template<class T_comp, class T_scal, class T_A>
+void get_g(tval3_data<T_comp, T_scal> &data, const T_A &A, const mat<T_comp> &U, const mat<T_comp> &b, int submat_mb) {
+#ifdef PROFILING
+	double start = get_timestamp();
+#endif
+	get_g_mult(data, A, U, submat_mb);
+#ifdef PROFILING
+	double stop = get_timestamp();
+	if(rec_time) t_mult += stop - start;
+#endif
+
+	cblas_axpy(data.n, (T_comp)-1, data.Atb.data(), 1, data.g.data(), 1);
+
+	// update g2, lam2, lam4...
+	mat<T_comp> Vx(data.p, data.q, data.r);
+	mat<T_comp> Vy(data.p, data.q, data.r);
+	mat<T_comp> Vz(data.p, data.q, data.r);
+	vec_add(data.n, 1, data.Ux.data(), (T_scal)-1, data.Wx.data(), Vx.data()); // Vx = Ux - Wx
+	vec_add(data.n, 1, data.Uy.data(), (T_scal)-1, data.Wy.data(), Vy.data()); // Vy = Uy - Wy
+	vec_add(data.n, 1, data.Uz.data(), (T_scal)-1, data.Wz.data(), Vz.data()); // Vz = Uz - Wz
+	Dt<T_comp, T_scal>(data.g2, Vx, Vy, Vz);
+
+	data.lam2=real(cblas_dotc(data.n, Vx.data(), 1, Vx.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vy.data(), 1, Vy.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vz.data(), 1, Vz.data(), 1));
+	data.lam4=real(cblas_dotc(data.n, data.sigmaX.data(), 1, Vx.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaY.data(), 1, Vy.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaZ.data(), 1, Vz.data(), 1));
+
+	//update lam3, lam5...
+	mat<T_comp> Aub(data.m);
+	vec_add(data.m, 1, data.Au.data(), (T_scal)-1, b.data(), Aub.data()); // Aub = Au - b
+	data.lam3=real(cblas_dotc(data.m, Aub.data(), 1, Aub.data(), 1));
+	data.lam5=real(cblas_dotc(data.m, data.delta.data(), 1, Aub.data(), 1));
+
+	data.f = (data.lam1 + data.beta/2*data.lam2 + data.mu/2*data.lam3) - data.lam4 - data.lam5;
+}
+
+template<class T_comp, class T_scal, class T_A>
+T_scal get_tau(tval3_data<T_comp, T_scal> &data, const T_A &A, bool fst_iter) {
+	T_scal tau;
+
+	// calculate tau...
+	if(fst_iter) {
+		mat<T_comp> dx(data.p, data.q, data.r);
+		mat<T_comp> dy(data.p, data.q, data.r);
+		mat<T_comp> dz(data.p, data.q, data.r);
+		D<T_comp, T_scal>(dx, dy, dz, data.d);
+		T_comp dDd = cblas_dotc(data.n, dx.data(), 1, dx.data(), 1) +
+					cblas_dotc(data.n, dy.data(), 1, dy.data(), 1) +
+					cblas_dotc(data.n, dz.data(), 1, dz.data(), 1);
+		T_comp dd = cblas_dotc(data.n, data.d.data(), 1, data.d.data(), 1);
+		mat<T_comp> Ad(data.m);
+		// Ad=A*d...
+#ifdef PROFILING
+		double start = get_timestamp();
+#endif
+		gemv(CblasNoTrans, A, data.d, Ad);
+#ifdef PROFILING
+		double stop = get_timestamp();
+		if(rec_time) t_mult += stop - start;
+#endif
+		T_comp Add = cblas_dotc(data.m, Ad.data(), 1, Ad.data(), 1);
+
+		tau = abs(dd/(dDd + data.muDbeta*Add));
+	} else {
+		vec_add(data.n, 1, data.g.data(), -1, data.gp.data(), data.dg.data());
+		vec_add(data.n, 1, data.g2.data(), -1, data.g2p.data(), data.dg2.data());
+		T_comp ss = cblas_dotc(data.n, data.uup.data(), 1, data.uup.data(), 1);
+		mat<T_comp> tmp(data.dg2);
+		cblas_axpy(data.n, data.muDbeta, data.dg.data(), 1, tmp.data(), 1);
+		T_comp sy = cblas_dotc(data.n, data.uup.data(), 1, tmp.data(), 1);
+		tau = abs(ss/sy);
+	}
+	return tau;
+}
+
+template<class T_mat>
+void nonneg(mat<T_mat> &U) {
+	T_mat val;
+	#pragma parallel always
+	#pragma vector always
+	for(int i=0; i < U.len; i++) {
+		val = U[i];
+		U[i] = (val < 0) ? 0 : val;
+	}
+}
+
+template<class T_mat>
+void nonneg(mat<complex<T_mat> > &U) {
+	T_mat val;
+	T_mat *data = (T_mat *) U.data();
+	#pragma parallel always
+	#pragma vector always
+	for(int i=0; i < U.len; i++) {
+		val = data[2*i];
+		data[2*i] = (val < 0) ? 0 : val;
+		data[2*i + 1] = 0;
+	}
+}
+
+template<class T_comp, class T_scal, class T_A>
+void descend(mat<T_comp> &U, tval3_data<T_comp, T_scal> &data, const T_A &A, const mat<T_comp> &b, T_scal tau, bool non_neg,
+		int submat_mb) {
+	cblas_axpy(data.n, -1*tau, data.d.data(), 1, U.data(),1); // U = U - tau*d
+	if(non_neg) {
+		nonneg(U);
+	}
+	D<T_comp, T_scal>(data.Ux, data.Uy, data.Uz, U);
+	get_g<T_comp, T_scal>(data, A, U, b, submat_mb);
+}
+
+template<class T_comp, class T_scal, class T_A>
+void update_g(tval3_data<T_comp, T_scal> &data, const T_scal alpha, const T_A &A, mat<T_comp> &U, const mat<T_comp> &b) {
+	vec_add(data.n, 1, data.gp.data(), alpha, data.dg.data(), data.g.data());
+	vec_add(data.n, 1, data.g2p.data(), alpha, data.dg2.data(), data.g2.data());
+	vec_add(data.n, 1, data.Up.data(), alpha, data.dU.data(), U.data());
+	vec_add(data.m, 1, data.Aup.data(), alpha, data.dAu.data(), data.Au.data());
+	vec_add(data.n, 1, data.Uxp.data(), alpha, data.dUx.data(), data.Ux.data());
+	vec_add(data.n, 1, data.Uyp.data(), alpha, data.dUy.data(), data.Uy.data());
+	vec_add(data.n, 1, data.Uzp.data(), alpha, data.dUz.data(), data.Uz.data());
+
+	// update lam2, lam4...
+	mat<T_comp> Vx(data.p, data.q, data.r);
+	mat<T_comp> Vy(data.p, data.q, data.r);
+	mat<T_comp> Vz(data.p, data.q, data.r);
+	vec_add(data.n, 1, data.Ux.data(), (T_scal)-1, data.Wx.data(), Vx.data()); // Vx = Ux - Wx
+	vec_add(data.n, 1, data.Uy.data(), (T_scal)-1, data.Wy.data(), Vy.data()); // Vy = Uy - Wy
+	vec_add(data.n, 1, data.Uz.data(), (T_scal)-1, data.Wz.data(), Vz.data()); // Vz = Uz - Wz
+
+	data.lam2=real(cblas_dotc(data.n, Vx.data(), 1, Vx.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vy.data(), 1, Vy.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vz.data(), 1, Vz.data(), 1));
+	data.lam4=real(cblas_dotc(data.n, data.sigmaX.data(), 1, Vx.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaY.data(), 1, Vy.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaZ.data(), 1, Vz.data(), 1));
+
+	//update lam3, lam5...
+	mat<T_comp> Aub(data.m);
+	vec_add(data.m, 1, data.Au.data(), (T_scal)-1, b.data(), Aub.data()); // Aub = Au - b
+	data.lam3 =real(cblas_dotc(data.m, Aub.data(), 1, Aub.data(), 1));
+	data.lam5=real(cblas_dotc(data.m, data.delta.data(), 1, Aub.data(), 1));
+
+	data.f = data.lam1 + data.beta/2*data.lam2 + data.mu/2*data.lam3 - data.lam4 - data.lam5;
+}
+
+template<class T_comp, class T_scal, class T_A>
+void min_u(tval3_data<T_comp, T_scal> &data, mat<T_comp> &U, T_scal &gam, const T_A &A, const mat<T_comp> &b,
+		const tval3_options<T_scal> &opts, T_comp C, bool fst_iter, int submat_mb) {
+	T_scal tau, alpha, c_armin;
+
+	tau = get_tau<T_comp, T_scal, T_A>(data, A, fst_iter);
+
+	// keep previous values...
+	data.Up = U; data.gp = data.g; data.g2p = data.g2;
+	data.Aup = data.Au; data.Uxp = data.Ux; data.Uyp = data.Uy;
+	data.Uzp = data.Uz;
+
+	// one step steepest descend...
+	descend<T_comp, T_scal, T_A>(U, data, A, b, tau, opts.nonneg, submat_mb);
+
+	// NMLA...
+	alpha=1;
+	vec_add(data.n, 1, U.data(), -1, data.Up.data(), data.dU.data()); // Ud = U - Up
+	c_armin = real(cblas_dotc(data.n, data.d.data(), 1, data.d.data(), 1) * tau * opts.c * data.beta); // c_armin = d'*d*tau*c*beta
+	if(abs(data.f) > abs(C - alpha*c_armin)) {  // Arminjo condition...
+
+		vec_add(data.n, 1, data.g.data(), -1, data.gp.data(), data.dg.data()); // dg=g-gp
+		vec_add(data.n, 1, data.g2.data(), -1, data.g2p.data(), data.dg2.data()); // dg2=g2-g2p
+		vec_add(data.m, 1, data.Au.data(), -1, data.Aup.data(), data.dAu.data()); // dAu=Au-Aup
+		vec_add(data.n, 1, data.Ux.data(), -1, data.Uxp.data(), data.dUx.data()); // dUx=Ux-Uxp
+		vec_add(data.n, 1, data.Uy.data(), -1, data.Uyp.data(), data.dUy.data()); // Uy = Uy-Uyp
+		vec_add(data.n, 1, data.Uz.data(), -1, data.Uzp.data(), data.dUz.data()); // Uz = Uz-Uzp
+
+		int cnt=0;
+
+		while(abs(data.f) > abs(C - alpha*c_armin)) { // Arminjo condition...
+
+			if(cnt==5) { // "backtracking" not successful...
+				gam *= opts.rate_gam;
+				tau = get_tau<T_comp, T_scal, T_A>(data, A, true);
+				U = data.Up;
+				descend<T_comp, T_scal, T_A>(U, data, A, b, tau, opts.nonneg, submat_mb);
+				break;
+			}
+			alpha *= opts.gamma;
+
+			update_g<T_comp, T_scal, T_A>(data, alpha, A, U, b);
+
+			cnt++;
+		}
+	}
+}
+
+template<class T_comp, class T_scal>
+void get_gradient(tval3_data<T_comp, T_scal> &data, const mat<T_comp> &DtsAtd) {
+	// d = g2 + muDbeta*g + DtsAtd...   (DtsAtd has opposite sign, compared to the MATLAB-version!)
+	data.d = DtsAtd;
+	cblas_axpy(data.n, 1, data.g2.data(), 1, data.d.data(), 1);
+	cblas_axpy(data.n, data.muDbeta, data.g.data(), 1, data.d.data(), 1);
+}
+
+// scalar
+template<class T_mat, class T_scal>
+void shrinkage(tval3_data<T_mat, T_scal> &data) {
+	T_scal sum=0;
+	T_scal temp_wx, temp_wy, temp_wz;
+	T_mat Uxbar, Uybar, Uzbar;
+	#pragma parallel always
+	#pragma vector always
+	for(int i=0; i < data.n; i++) {
+		Uxbar = data.Ux[i] - data.sigmaX[i]/data.beta;
+		Uybar = data.Uy[i] - data.sigmaY[i]/data.beta;
+		Uzbar = data.Uz[i] - data.sigmaZ[i]/data.beta;
+
+		temp_wx = abs(Uxbar) - 1/data.beta;
+		temp_wy = abs(Uybar) - 1/data.beta;
+		temp_wz = abs(Uzbar) - 1/data.beta;
+		temp_wx = (temp_wx >= 0) ? temp_wx : 0;
+		temp_wy = (temp_wy >= 0) ? temp_wy : 0;
+		temp_wz = (temp_wz >= 0) ? temp_wz : 0;
+		sum += temp_wx + temp_wy + temp_wz;
+		data.Wx[i] = (Uxbar >= 0) ? temp_wx : -1 * temp_wx;
+		data.Wy[i] = (Uybar >= 0) ? temp_wy : -1 * temp_wy;
+		data.Wz[i] = (Uzbar >= 0) ? temp_wz : -1 * temp_wz;
+	}
+	data.lam1 = sum;
+}
+
+// complex
+template<class T_mat, class T_scal>
+void shrinkage(tval3_data<complex<T_mat>, T_scal> &data) {
+	T_mat sum=0;
+	T_mat temp_wx, temp_wy, temp_wz;
+	T_mat Uxbar_real, Uxbar_imag, Uxbar_abs, Uybar_real, Uybar_imag, Uybar_abs, Uzbar_real, Uzbar_imag, Uzbar_abs;
+	T_mat *Ux_data = (T_mat *) data.Ux.data();
+	T_mat *Uy_data = (T_mat *) data.Uy.data();
+	T_mat *Uz_data = (T_mat *) data.Uz.data();
+	T_mat *sigmaX_data = (T_mat *) data.sigmaX.data();
+	T_mat *sigmaY_data = (T_mat *) data.sigmaY.data();
+	T_mat *sigmaZ_data = (T_mat *) data.sigmaZ.data();
+	T_mat *Wx_data = (T_mat *) data.Wx.data();
+	T_mat *Wy_data = (T_mat *) data.Wy.data();
+	T_mat *Wz_data = (T_mat *) data.Wz.data();
+	const int len = data.n;
+	#pragma parallel always
+	#pragma vector always
+	for(int i=0; i < len; i++) {
+		Uxbar_real = Ux_data[2*i] - sigmaX_data[2*i]/data.beta;
+		Uxbar_imag = Ux_data[2*i + 1] - sigmaX_data[2*i + 1]/data.beta;
+		Uybar_real = Uy_data[2*i] - sigmaY_data[2*i]/data.beta;
+		Uybar_imag = Uy_data[2*i + 1] - sigmaY_data[2*i + 1]/data.beta;
+		Uzbar_real = Uz_data[2*i] - sigmaZ_data[2*i]/data.beta;
+		Uzbar_imag = Uz_data[2*i + 1] - sigmaZ_data[2*i + 1]/data.beta;
+
+		Uxbar_abs = sqrt(Uxbar_real*Uxbar_real + Uxbar_imag*Uxbar_imag);
+		Uybar_abs = sqrt(Uybar_real*Uybar_real + Uybar_imag*Uybar_imag);
+		Uzbar_abs = sqrt(Uzbar_real*Uzbar_real + Uzbar_imag*Uzbar_imag);
+
+		temp_wx = Uxbar_abs - 1/data.beta;
+		temp_wy = Uybar_abs - 1/data.beta;
+		temp_wz = Uzbar_abs - 1/data.beta;
+		temp_wx = (temp_wx >= 0) ? temp_wx : 0;
+		temp_wy = (temp_wy >= 0) ? temp_wy : 0;
+		temp_wz = (temp_wz >= 0) ? temp_wz : 0;
+		sum += temp_wx + temp_wy + temp_wz;
+
+		Wx_data[2*i] = (Uxbar_abs > 0) ? temp_wx*Uxbar_real/Uxbar_abs : 0;
+		Wx_data[2*i+1] = (Uxbar_abs > 0) ? temp_wx*Uxbar_imag/Uxbar_abs : 0;
+		Wy_data[2*i] = (Uybar_abs > 0) ? temp_wy*Uybar_real/Uybar_abs : 0;
+		Wy_data[2*i+1] = (Uybar_abs > 0) ? temp_wy*Uybar_imag/Uybar_abs : 0;
+		Wz_data[2*i] = (Uzbar_abs > 0) ? temp_wz*Uybar_real/Uzbar_abs : 0;
+		Wz_data[2*i+1] = (Uzbar_abs > 0) ? temp_wz*Uybar_imag/Uzbar_abs : 0;
+	}
+	data.lam1 = sum;
+}
+
+template<class T_comp, class T_scal>
+void update_W(tval3_data<T_comp, T_scal> &data) {
+
+	data.f -= (data.lam1 + data.beta/2*data.lam2 - data.lam4);
+
+	shrinkage(data);
+
+	// update g2, lam2, lam4...
+	mat<T_comp> Vx(data.p, data.q, data.r);
+	mat<T_comp> Vy(data.p, data.q, data.r);
+	mat<T_comp> Vz(data.p, data.q, data.r);
+	vec_add(data.n, 1, data.Ux.data(), (T_scal)-1, data.Wx.data(), Vx.data()); // Vx = Ux - Wx
+	vec_add(data.n, 1, data.Uy.data(), (T_scal)-1, data.Wy.data(), Vy.data()); // Vy = Uy - Wy
+	vec_add(data.n, 1, data.Uz.data(), (T_scal)-1, data.Wz.data(), Vz.data()); // Vz = Uz - Wz
+	Dt<T_comp, T_scal>(data.g2, Vx, Vy, Vz);
+
+	data.lam2=real(cblas_dotc(data.n, Vx.data(), 1, Vx.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vy.data(), 1, Vy.data(), 1));
+	data.lam2 += real(cblas_dotc(data.n, Vz.data(), 1, Vz.data(), 1));
+	data.lam4=real(cblas_dotc(data.n, data.sigmaX.data(), 1, Vx.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaY.data(), 1, Vy.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaZ.data(), 1, Vz.data(), 1));
+
+	data.f += (data.lam1 + data.beta/2*data.lam2 - data.lam4);
+}
+
+template<class T_comp, class T_scal>
+void update_mlp(tval3_data<T_comp, T_scal> &data, const mat<T_comp> &b) {
+	data.f += (data.lam4 + data.lam5);
+
+	mat<T_comp> Vx(data.p, data.q, data.r);
+	mat<T_comp> Vy(data.p, data.q, data.r);
+	mat<T_comp> Vz(data.p, data.q, data.r);
+	vec_add(data.n, 1, data.Ux.data(), (T_scal)-1, data.Wx.data(), Vx.data()); // Vx = Ux - Wx
+	vec_add(data.n, 1, data.Uy.data(), (T_scal)-1, data.Wy.data(), Vy.data()); // Vy = Uy - Wy
+	vec_add(data.n, 1, data.Uz.data(), (T_scal)-1, data.Wz.data(), Vz.data()); // Vz = Uz - Wz
+
+	cblas_axpy(data.n, -1*data.beta, Vx.data(), 1, data.sigmaX.data(), 1); // sigmaX -= beta*Vx
+	cblas_axpy(data.n, -1*data.beta, Vy.data(), 1, data.sigmaY.data(), 1); // sigmaY -= beta*Vy
+	cblas_axpy(data.n, -1*data.beta, Vz.data(), 1, data.sigmaZ.data(), 1); // sigmaz -= beta*Vz
+
+	data.lam4=real(cblas_dotc(data.n, data.sigmaX.data(), 1, Vx.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaY.data(), 1, Vy.data(), 1));
+	data.lam4 += real(cblas_dotc(data.n, data.sigmaZ.data(), 1, Vz.data(), 1));
+
+	mat<T_comp> Aub(data.m);
+	vec_add(data.m, 1, data.Au.data(), (T_scal)-1, b.data(), Aub.data()); // Aub = Au - b
+	cblas_axpy(data.m, -1*data.mu, Aub.data(), 1, data.delta.data(), 1); // delta -= mu*Aub
+	data.lam5=real(cblas_dotc(data.m, data.delta.data(), 1, Aub.data(), 1));
+
+	data.f -= (data.lam4 + data.lam5);
+}
+
+template<class T_comp>
+void check_params(mat<T_comp> &U, const mat<T_comp> &A, const mat<T_comp> &b, const mat<T_comp> &Ut) throw(tval3_exception) {
+	if(U.format != mat_row_major || Ut.format != mat_row_major || A.format != mat_row_major)
+		throw tval3_exception(string("Argument error: Matrices must be in row major format!"));
+	else if(Ut.len > 0 && Ut.len != U.len)
+		throw tval3_exception(string("Argument error: U and Ut must have the same size!"));
+	else if(U.len != A.dim_x)
+		throw tval3_exception("Argument error: the length of U must be equal to A.dim_x!");
+	else if(b.len != A.dim_y)
+		throw tval3_exception(string("Argument error: b.len must be equal to A.dim_y!"));
+}
+
+template<class T_comp>
+void check_params(mat<T_comp> &U, const sparse_mat<T_comp> &A, const mat<T_comp> &b, const mat<T_comp> &Ut) throw(tval3_exception) {
+	if(U.format != mat_row_major || Ut.format != mat_row_major)
+		throw tval3_exception(string("Argument error: Matrices must be in row major format!"));
+	if(A.format != sparse_mat_csc)
+		throw tval3_exception(string("Argument error: A must be in CSC format!"));
+	if(Ut.len > 0 && Ut.len != U.len)
+		throw tval3_exception(string("Argument error: U and Ut must have the same size!"));
+	else if(U.len != A.dim_x)
+		throw tval3_exception("Argument error: the length of U must be equal to A.dim_x!");
+	else if(b.len != A.dim_y)
+		throw tval3_exception(string("Argument error: b.len must be equal to A.dim_y!"));
+}
+
+template<class T_comp>
+void check_params(mat<T_comp> &U, const geometry &A, const mat<T_comp> &b, const mat<T_comp> &Ut) throw(tval3_exception) {
+	if(U.format != mat_row_major || Ut.format != mat_row_major)
+		throw tval3_exception(string("Argument error: Matrices must be in row major format!"));
+	else if(Ut.len > 0 && Ut.len != U.len)
+		throw tval3_exception(string("Argument error: U and Ut must have the same size!"));
+	else if(b.len != A.num_emitters * A.num_receivers)
+		throw tval3_exception(string("Argument error: b.len must be equal to A.num_emitters * A.num_receivers!"));
+}
+
+template<class T_comp, class T_scal, class T_A>
+const tval3_info<T_scal> tval3(mat<T_comp> &U, const T_A &A, const mat<T_comp> &b, const tval3_options<T_scal> &opts, const mat<T_comp> &Ut,
+		int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception) {
+
+	double start, stop;
+#ifdef PROFILING
+	double  start_loop, stop_loop;
+	t_mult = 0; t_d = 0; t_dt = 0; rec_time = false;
+#endif
+	tval3_info<T_scal> info;
+	start = get_timestamp();
+	try {
+
+		check_params (U, A, b, Ut);
+
+		omp_set_num_threads(omp_get_num_procs() / virtual_procs_per_core);
+
+		mat<T_comp> bs = b; // create a copy of b (scaling...)
+		tval3_data<T_comp, T_scal> data(U, bs, opts.mu0, opts.beta0);
+		T_scal muf=opts.mu;
+		T_scal betaf=opts.beta, beta0=0;
+		mat<T_comp> DtsAtd(data.n);
+		T_scal gam = opts.gam;
+		T_scal scl = scaleb<T_comp, T_scal>(bs);
+		gemv(CblasConjTrans, A, bs, data.Atb); // Atb = A'*bs
+		U = data.Atb; // initial guess: U=A'*b
+		if(data.mu > muf) data.mu = muf;
+		if(data.beta > betaf) data.beta = betaf;
+		data.muDbeta = data.mu/data.beta;
+		mat <T_comp> rcdU(U), UrcdU(data.n);
+		D<T_comp, T_scal>(data.Ux, data.Uy, data.Uz, U);
+		shrinkage(data);
+		get_g<T_comp, T_scal>(data, A, U, bs, submat_mb);
+		get_gradient<T_comp, T_scal>(data, DtsAtd);
+		T_scal Q=1, Qp;
+		T_comp C=data.f;
+		int count_outer=0, count_total=0;
+		bool fst_iter;
+		T_scal RelChg, RelChgOut=0, nrmup=0;
+
+#ifdef PROFILING
+		rec_time = true;
+		start_loop = get_timestamp();
+#endif
+
+		while((count_outer < opts.maxcnt) && (count_total < opts.maxit)) {
+
+			fst_iter = true;
+			while(count_total < opts.maxit) {
+				// u-subproblem...
+				min_u<T_comp, T_scal>(data, U, gam, A, bs, opts, C, fst_iter, submat_mb);
+
+				// shrinkage like step...
+				update_W<T_comp, T_scal>(data);
+
+				// update reference values...
+				Qp = Q, Q = gam*Qp + 1; C = (gam*Qp*C + data.f)/Q;
+				vec_add(data.n, 1, U.data(), -1, data.Up.data(), data.uup.data());
+
+				// compute gradient...
+				get_gradient<T_comp, T_scal>(data, DtsAtd);
+
+				count_total++;
+
+				// calculate relative change...
+				nrmup = cblas_nrm2(data.n, data.Up.data(), 1);
+				RelChg = cblas_nrm2(data.n, data.uup.data(), 1) / nrmup;
+
+				if((RelChg < opts.tol_inn) && (!fst_iter))
+					break;
+
+				fst_iter = false;
+
+			}
+			count_outer++;
+
+			// calculate relative change...
+			vec_add(data.n, 1, U.data(), -1, rcdU.data(), UrcdU.data()); // UrcdU = U - rcdU
+			RelChgOut = cblas_nrm2(data.n, UrcdU.data(), 1) / nrmup;
+			rcdU = U;
+
+			if(RelChgOut < opts.tol)
+				break;
+
+			// update multipliers...
+			update_mlp<T_comp, T_scal>(data, bs);
+
+			// update penalty parameters for continuation scheme...
+			beta0 = data.beta;
+			data.beta *= opts.rate_cnt;
+			data.mu *= opts.rate_cnt;
+			if(data.beta > betaf) data.beta = betaf;
+			if(data.mu > muf) data.mu = muf;
+			data.muDbeta = data.mu/data.beta;
+
+			// update f...
+			data.f = data.lam1 + data.beta/2*data.lam2 + data.mu/2*data.lam3 - data.lam4 - data.lam5;
+
+			// DtsAtd = beta0/beta*d
+			DtsAtd = data.d;
+			cblas_scal(data.n, beta0/data.beta, DtsAtd.data(), 1);
+
+			// compute gradient...
+			get_gradient<T_comp, T_scal>(data, DtsAtd);
+
+			// reset reference values...
+			gam=opts.gam; Q=1; C=data.f;
+		}
+#ifdef PROFILING
+		stop_loop = get_timestamp();
+		rec_time = false;
+
+		ofstream file;
+		file.open("profile.txt");
+		file << "loop [ms]: " << (stop_loop - start_loop)*1000 << "\n";
+		file << "mult [ms]: " << t_mult*1000 << "\n";
+		file << "d [ms]: " << t_d*1000 << "\n";
+		file << "dt [ms]: " << t_dt*1000 << "\n";
+		file.close();
+#endif
+
+		// scale U. Take real part only, if opts.isreal==true
+		for(int i=0; i < data.n; i++) {
+			if(opts.isreal)
+				U[i] = (T_comp)real(U[i]);
+			U[i] = U[i]/scl;
+		}
+
+		stop = get_timestamp();
+
+		// generate return value...
+		info.secs = stop - start;
+		info.outer_iters = count_outer;
+		info.total_iters = count_total;
+		info.rel_chg = RelChgOut;
+		if(Ut.len == U.len) {
+			mat<T_comp> deltaU = Ut;
+			cblas_axpy(Ut.len, (T_comp)-1, U.data(), 1, deltaU.data(), 1);
+			T_scal nrm_deltaU = cblas_nrm2(deltaU.len, deltaU.data(), 1);
+			T_scal nrm_Ut = cblas_nrm2(Ut.len, Ut.data(), 1);
+			info.rel_error = nrm_deltaU/nrm_Ut;
+		} else {
+			info.rel_error = 0;
+		}
+	} catch (tval3_exception) {
+		throw;
+	} catch (const std::exception &ex) {
+		string msg = "Internal error: ";
+		msg += ex.what();
+		throw tval3_exception(msg);
+	}
+
+	return  info;
+}
+
+
+
+
+const tval3_info<float> tval3(mat<float> &U, const mat<float> &A, const mat<float> &b, const tval3_options<float> &opts,
+		const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, float, mat<float> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<float> &U, const mat<float> &A, const mat<float> &b,
+		const tval3_options<double> &opts, const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, double, mat<float> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<double> &U, const mat<double> &A, const mat<double> &b,
+		const tval3_options<double> &opts, const mat<double> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+
+	return tval3<double, double, mat<double> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<float> tval3(mat<complex<float> > &U, const mat<complex<float> > &A, const mat<complex<float> > &b,
+							   const tval3_options<float> &opts, const mat<complex<float> > &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<float>, float, mat<std::complex<float> > >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<complex<double> > &U, const mat<complex<double> > &A, const mat<complex<double> > &b,
+								const tval3_options<double> &opts, const mat<complex<double> > &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<double>, double, mat<std::complex<double> > >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+
+const tval3_info<float> tval3(mat<float> &U, const sparse_mat<float> &A, const mat<float> &b,
+		const tval3_options<float> &opts, const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, float, sparse_mat<float> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<float> &U, const sparse_mat<float> &A, const mat<float> &b,
+		const tval3_options<double> &opts, const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, double, sparse_mat<float> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<double> &U, const sparse_mat<double> &A, const mat<double> &b,
+		const tval3_options<double> &opts, const mat<double> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<double, double, sparse_mat<double> >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<float> tval3(mat<complex<float> > &U, const sparse_mat<complex<float> > &A,
+		const mat<complex<float> > &b, const tval3_options<float> &opts, const mat<complex<float> > &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<float>, float, sparse_mat<std::complex<float> > >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<complex<double> > &U, const sparse_mat<complex<double> > &A,
+		const mat<complex<double> > &b, const tval3_options<double> &opts, const mat<complex<double> > &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<double>, double, sparse_mat<std::complex<double> > >(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+
+
+
+
+const tval3_info<float> tval3(mat<float> &U, const geometry &A, const mat<float> &b,
+		const tval3_options<float> &opts, const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, float, geometry>(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<float> &U, const geometry &A, const mat<float> &b,
+		const tval3_options<double> &opts, const mat<float> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<float, double, geometry>(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<double> &U, const geometry &A, const mat<double> &b,
+		const tval3_options<double> &opts, const mat<double> &Ut, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<double, double, geometry>(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<float> tval3(mat<complex<float> > &U, const geometry &A,
+		const mat<complex<float> > &b, const tval3_options<float> &opts, const mat<complex<float> > &Ut, int submat_mb=5,
+		int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<float>, float, geometry>(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
+
+const tval3_info<double> tval3(mat<complex<double> > &U, const geometry &A,
+		const mat<complex<double> > &b, const tval3_options<double> &opts, const mat<complex<double> > &Ut, int submat_mb=5,
+		int virtual_procs_per_core=2) throw(tval3_exception)
+{
+	return tval3<std::complex<double>, double, geometry>(U, A, b, opts, Ut, submat_mb, virtual_procs_per_core);
+}
--- a/TVALCPU/src/tval3.h
+++ b/TVALCPU/src/tval3.h
@@ -0,0 +1,67 @@
+/*
+ * tval3.h
+ *
+ *  Created on: Mar 14, 2011
+ *      Author: ditlevsen
+ */
+
+#ifndef TVAL3_H_
+#define TVAL3_H_
+
+#include "container.h"
+#include <complex>
+#include "tval3_types.h"
+
+extern const tval3_info<float> tval3(mat<float> &U, const mat<float> &A, const mat<float> &b, const tval3_options<float> &opts,
+	const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<float> &U, const mat<float> &A, const mat<float> &b, const tval3_options<double> &opts,
+	const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<double> &U, const mat<double> &A, const mat<double> &b, const tval3_options<double> &opts,
+		const mat<double> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<float> tval3(mat<std::complex<float> > &U, const mat<std::complex<float> > &A, const mat<std::complex<float> > &b,
+	const tval3_options<float> &opts,  const mat<std::complex<float> > &Ut=0, int submat_mb=5, int virtual_procs_per_core=2)
+	throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<std::complex<double> > &U, const mat<std::complex<double> > &A,
+		const mat<std::complex<double> > &b, const tval3_options<double> &opts, const mat<std::complex<double> > &Ut=0, int submat_mb=5,
+		int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<float> tval3(mat<float> &U, const sparse_mat<float> &A, const mat<float> &b, const tval3_options<float> &opts,
+		const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<float> &U, const sparse_mat<float> &A, const mat<float> &b, const tval3_options<double> &opts,
+		const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<double> &U, const sparse_mat<double> &A, const mat<double> &b, const tval3_options<double> &opts,
+		const mat<double> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<float> tval3(mat<std::complex<float> > &U, const sparse_mat<std::complex<float> > &A,
+		const mat<std::complex<float> > &b, const tval3_options<float> &opts,  const mat<std::complex<float> > &Ut=0, int submat_mb=5,
+		int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<std::complex<double> > &U, const sparse_mat<std::complex<double> > &A,
+		const mat<std::complex<double> > &b, const tval3_options<double> &opts, const mat<std::complex<double> > &Ut=0, int submat_mb=5,
+		int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<float> tval3(mat<float> &U, const geometry &A, const mat<float> &b, const tval3_options<float> &opts,
+		const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<float> &U, const geometry &A, const mat<float> &b, const tval3_options<double> &opts,
+		const mat<float> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<double> &U, const geometry &A, const mat<double> &b, const tval3_options<double> &opts,
+		const mat<double> &Ut=0, int submat_mb=5, int virtual_procs_per_core=2) throw(tval3_exception);
+
+extern const tval3_info<float> tval3(mat<std::complex<float> > &U, const geometry &A, const mat<std::complex<float> > &b,
+		const tval3_options<float> &opts, const mat<std::complex<float> > &Ut=0, int submat_mb=5, int virtual_procs_per_core=2)
+		throw(tval3_exception);
+
+extern const tval3_info<double> tval3(mat<std::complex<double> > &U, const geometry &A, const mat<std::complex<double> > &b,
+		const tval3_options<double> &opts, const mat<std::complex<double> > &Ut=0, int submat_mb=5, int virtual_procs_per_core=2)
+		throw(tval3_exception);
+
+
+#endif /* TVAL3_H_ */
--- a/TVALCPU/src/tval3_types.h
+++ b/TVALCPU/src/tval3_types.h
@@ -0,0 +1,49 @@
+#ifndef TVAL3_TYPES_H_
+#define TVAL3_TYPES_H_
+
+#include <stdexcept>
+#include <string>
+
+template<class T_scal>
+struct tval3_options {
+	T_scal mu; // maximum value for penalty parameter mu, see (2.12)
+		// mu is mainly decided by noise level. Set mu big when b is noise-free,
+		// set mu small when b is very noisy
+		// suggested values for mu: 2⁴..2¹³
+	T_scal mu0; // initial value for mu
+	T_scal beta; // maximum value for penalty parameter beta, see (2.12)
+	T_scal beta0; // initial value for beta
+	T_scal rate_cnt; // continuation parameter for both mu and beta
+	T_scal tol; // outer loop tolerance
+	T_scal tol_inn; // inner loop tolerance
+	int maxcnt; // maximum number of outer iterations
+	int maxit; // maximum number of iterations (total)
+	T_scal c; // corresponds to the parameter delta in (2.33) (arminjo condition)
+	T_scal gamma; // corresponds to rho "Algorithm 2" on page 29
+	T_scal gam; // corresponds to parameter eta in (2.34)
+		// Control the degree of nonmonotonicity. 0 corresponds to monotone line search.
+		// The best convergence is obtained by using values closer to 1 when the iterates
+		// are far from the optimum, and using values closer to 0 when near an optimum.
+	T_scal rate_gam; // shrinkage rate of gam
+	bool isreal;
+	bool nonneg;
+	tval3_options(): mu(256.0f), mu0(256.0f), beta(32.0f), beta0(32.0f), rate_cnt(2.0f),
+			tol(1e-6f), tol_inn(1e-3f), maxcnt(10), maxit(1025), c(1e-5f), gamma(0.6f),
+			gam(0.9995f), rate_gam(0.9f), isreal(true), nonneg(true) {};
+};
+
+template<class T_scal>
+struct tval3_info {
+	int total_iters;
+	int outer_iters;
+	T_scal rel_chg;
+	T_scal rel_error;
+	double secs;
+};
+
+class tval3_exception: public std::runtime_error {
+public:
+	tval3_exception(const std::string& message) : std::runtime_error(message) {};
+};
+
+#endif
--- a/TVALCPU/src/tval3cpp3d.cpp
+++ b/TVALCPU/src/tval3cpp3d.cpp
@@ -0,0 +1,653 @@
+#include "tval3cpp3d.h"
+#include <matrix.h>
+#include <mex.h>   
+#include <tval3.h>
+#include <complex>
+#include <exception>
+#include <mkl_cblas.h>
+
+using namespace std;
+
+// create mat-object from mxArray (row major storage)
+template<class Type>
+mat<Type> *getMatrix(const mxArray *mi) {
+
+    if(mi != NULL) {
+        const mwSize *dims = mxGetDimensions(mi);
+        int dim_y = dims[0];
+        int dim_x = dims[1];
+        int dim_z = (mxGetNumberOfDimensions(mi) > 2) ? dims[2] : 1;
+
+        Type *mi_data = (Type *)mxGetData(mi);
+
+        mat<Type> *mo = new mat<Type>(dim_y, dim_x, dim_z);
+
+        for(int z=0; z < dim_z; z++) {
+            for(int y=0; y < dim_y; y++) {
+                for(int x=0; x < dim_x; x++) {
+                    (*mo)[x + y*dim_x + z*dim_x*dim_y] =
+                            mi_data[x*dim_y + y + z*dim_x*dim_y];
+                }
+            }
+        }
+        return mo;
+    } else {
+        mat<Type> *mo = new mat<Type>(0);
+        return mo;
+    }
+}
+
+// create mat-object from complex mxArray (row major)
+template<class Type>
+mat<complex<Type> > *getCompMatrix(const mxArray *mi) {
+
+    if(mi != NULL) {
+        const mwSize *dims = mxGetDimensions(mi);
+        int dim_y = dims[0];
+        int dim_x = dims[1];
+        int dim_z = (mxGetNumberOfDimensions(mi) > 2) ? dims[2] : 1;
+
+        Type *mi_real = (Type *)mxGetData(mi);
+        Type *mi_imag = (Type *)mxGetImagData(mi);
+
+        mat<complex<Type> > *mo = new mat<complex<Type> >(dim_y, dim_x, dim_z);
+
+        for(int z=0; z < dim_z; z++) {
+            for(int y=0; y < dim_y; y++) {
+                for(int x=0; x < dim_x; x++) {
+                    (*mo)[x + y*dim_x].real(mi_real[x*dim_y + y]);
+                    if(mxIsComplex(mi))
+                        (*mo)[x + y*dim_x + z*dim_x*dim_y].imag(mi_imag[x*dim_y + y + z*dim_x*dim_y]);
+                }
+            }
+        }
+        return mo;
+    } else {
+        mat<complex<Type> > *mo = new mat<complex<Type> >(0);
+        return mo;
+    }
+}
+
+// create sparse_mat-object from mxArray
+template<class Type>
+sparse_mat<Type> *getSparseMatrix(const mxArray *mi) {
+
+    const mwSize *dims = mxGetDimensions(mi);
+    int dim_y = dims[0];
+    int dim_x = dims[1];
+    mwIndex *mi_row_ind = mxGetIr(mi);
+    mwIndex *mi_col_ptr = mxGetJc(mi);
+    int nnz = (int)mxGetNzmax(mi);
+    
+    double *mi_data = (double *)mxGetData(mi);
+    
+    sparse_mat<Type> *mo;
+
+    mo = new sparse_mat<Type>(dim_y, dim_x, nnz);
+
+    for(int i=0; i < (mo->dim_x + 1); i++) {
+        mo->ptr()[i] = (int) mi_col_ptr[i];
+    }
+
+    for(int i=0; i < mo->nnz; i++) {
+        mo->val()[i] = (Type) mi_data[i];
+        mo->ind()[i] = (int) mi_row_ind[i];
+    }
+    
+    return mo;
+}
+
+
+// create sparse_mat-object from complex mxArray (row major)
+template<class Type>
+sparse_mat<complex<Type> > *getSparseCompMatrix(const mxArray *mi) {
+
+    const mwSize *dims = mxGetDimensions(mi);
+    int dim_y = dims[0];
+    int dim_x = dims[1];
+    
+    double *mi_real = (double *)mxGetData(mi);
+    double *mi_imag = (double *)mxGetImagData(mi);
+    mwIndex *mi_row_ind = mxGetIr(mi);
+    mwIndex *mi_col_ptr = mxGetJc(mi);
+    int nnz = (int)mxGetNzmax(mi);
+
+    sparse_mat<complex<Type> > *mo = new sparse_mat<complex<Type> >(dim_y,
+            dim_x, nnz);
+    
+    for(int i=0; i < mo->nnz; i++) {
+        mo->ind()[i] = mi_row_ind[i];
+        mo->val()[i].real((Type)mi_real[i]);
+        if(mxIsComplex(mi))
+            mo->val()[i].imag((Type)mi_imag[i]);
+    }
+    
+    for(int i=0; i < dim_x + 1; i++)
+            mo->ptr()[i] = mi_col_ptr[i];
+    
+    return mo;
+}
+
+// get geometry structure from mxArray
+geometry *get_geometry(const mxArray *A) {
+    float scale_factor;
+    int *x_e, *y_e, *z_e, *x_r, *y_r, *z_r;
+    int ne, nr, rv_x, rv_y, rv_z;
+    mxArray *field;
+    
+    field = mxGetField(A, 0, "rv_x");
+    if(field == NULL) {
+        mexErrMsgTxt("no field rv_x in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.rv_x must be of type Int32!");
+        }
+       rv_x = *( (int *)mxGetData(field) );
+    }
+    
+    field = mxGetField(A, 0, "rv_y");
+    if(field == NULL) {
+        mexErrMsgTxt("no field rv_y in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.rv_y must be of type Int32!");
+        }
+       rv_y = *( (int *)mxGetData(field) );
+    }
+
+        field = mxGetField(A, 0, "rv_z");
+    if(field == NULL) {
+        mexErrMsgTxt("no field rv_z in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.rv_z must be of type Int32!");
+        }
+       rv_z = *( (int *)mxGetData(field) );
+    }
+        
+    field = mxGetField(A, 0, "scale_factor");
+    if(field == NULL) {
+        mexErrMsgTxt("no field scale_factor in struct A!");
+    } else {
+        if(!mxIsSingle(field)) {
+            mexErrMsgTxt("A.scale_factor must be of type single!");
+        }
+        scale_factor = *( (float *)mxGetData(field) );
+    }
+    
+    field = mxGetField(A, 0, "x_emitters");
+    if(field == NULL) {
+        mexErrMsgTxt("no field x_emitters in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.x_emitters must be of type int32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.x_emitters must be a column-vector!");
+        ne = dims[1];
+        x_e = (int *)mxGetData(field);
+    }
+
+    field = mxGetField(A, 0, "y_emitters");
+    if(field == NULL) {
+        mexErrMsgTxt("no field y_emitters in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.y_emitters must be of type in32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.y_emitters must be a column-vector!");
+        if(dims[1] != ne)
+            mexErrMsgTxt("A.y_emitters must be of the same size as A.x_emitters!");
+        y_e = (int *)mxGetData(field);
+    }
+    
+    field = mxGetField(A, 0, "z_emitters");
+    if(field == NULL) {
+        mexErrMsgTxt("no field z_emitters in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.z_emitters must be of type in32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.z_emitters must be a column-vector!");
+        if(dims[1] != ne)
+            mexErrMsgTxt("A.z_emitters must be of the same size as A.x_emitters!");
+        z_e = (int *)mxGetData(field);
+    }
+    
+
+    field = mxGetField(A, 0, "x_receivers");
+    if(field == NULL) {
+        mexErrMsgTxt("no field x_receivers in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.x_receivers must be of type in32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.x_receivers must be a column-vector!");
+        nr = dims[1];
+        x_r = (int *)mxGetData(field);
+    }
+
+    field = mxGetField(A, 0, "y_receivers");
+    if(field == NULL) {
+        mexErrMsgTxt("no field y_receivers in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.y_receivers must be of type in32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.y_receivers must be a column-vector!");
+        if(dims[1] != nr)
+            mexErrMsgTxt("A.y_receivers must be of the same size as A.x_receivers!");
+        y_r = (int *)mxGetData(field);
+    }
+    
+    field = mxGetField(A, 0, "z_receivers");
+    if(field == NULL) {
+        mexErrMsgTxt("no field z_receivers in struct A!");
+    } else {
+        if(!mxIsInt32(field)) {
+            mexErrMsgTxt("A.z_receivers must be of type int32!");
+        }
+        const mwSize *dims = mxGetDimensions(field);
+        if(dims[0] > 1)
+            mexErrMsgTxt("A.z_receivers must be a column-vector!");
+        if(dims[1] != nr)
+            mexErrMsgTxt("A.z_receivers must be of the same size as A.z_receivers!");
+        z_r = (int *)mxGetData(field);
+    }
+    
+    geometry *geom = new geometry;
+    geom->num_emitters = ne;
+    geom->num_receivers = nr;
+    geom->x_emitters = x_e;
+    geom->y_emitters = y_e;
+    geom->z_emitters = z_e;
+    geom->x_receivers = x_r;
+    geom->y_receivers = y_r;
+    geom->z_receivers = z_r;
+    geom->scale_factor = scale_factor;
+    geom->rv_x = rv_x;
+    geom->rv_y = rv_y;
+    geom->rv_z = rv_z;
+    
+    return geom;
+}
+
+// create tval3_options struct
+template<class Type>
+tval3_options<Type> *getOptions(const mxArray *optsi) {
+
+    tval3_options<Type> *optso = new tval3_options<Type>;
+    
+    if(!mxIsStruct(optsi)) {
+        mexPrintf("Warning: opts is not a structure.");
+        return optso;
+    }
+    
+    mxArray *field;
+    
+    field = mxGetField(optsi, 0, "mu");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.mu is not of type double (ignored).");
+        else
+            optso->mu = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "mu0");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.mu0 is not of type double (ignored).");
+        else
+            optso->mu0 = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "beta");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.beta is not of type double (ignored).");
+        else
+            optso->beta = (Type) *mxGetPr(field);
+    }
+    
+    field = mxGetField(optsi, 0, "beta0");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.beta0 is not of type double (ignored).");
+        else
+            optso->beta0 = (Type) *mxGetPr(field);
+    }
+    
+    field = mxGetField(optsi, 0, "rate_cnt");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.rate_cnt is not of type double (ignored).");
+        else
+            optso->rate_cnt = (Type) *mxGetPr(field);
+    }
+    
+    field = mxGetField(optsi, 0, "tol");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.tol is not of type double (ignored).");
+        else
+            optso->tol = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "tol_inn");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.tol_inn is not of type double (ignored).");
+        else
+
+            optso->tol_inn = (Type) *mxGetPr(field);
+    }
+    
+    field = mxGetField(optsi, 0, "maxcnt");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.maxcnt is not of type double (ignored).");
+        else
+            optso->maxcnt = (int) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "maxit");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.maxit is not of type double (ignored).");
+        else
+            optso->maxit = (int) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "c");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.c is not of type double (ignored).");
+        else
+            optso->c = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "gamma");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.gamma is not of type double (ignored).");
+        else
+            optso->gamma = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "gam");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.gam is not of type double (ignored).");
+        else
+            optso->gam = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "rate_gam");
+    if(field != NULL) {
+        if(!mxIsDouble(field))
+            mexPrintf("Warning: opts.rate_gam is not of type double (ignored).");
+        else
+            optso->rate_gam = (Type) *mxGetPr(field);
+    }
+
+    field = mxGetField(optsi, 0, "isreal");
+    if(field != NULL) {
+        if(!mxIsLogical(field))
+            mexPrintf("Warning: opts.isreal is not of type logical (ignored).");
+        else
+            optso->isreal = (bool) *mxGetLogicals(field);
+    }
+
+    field = mxGetField(optsi, 0, "nonneg");
+    if(field != NULL) {
+        if(!mxIsLogical(field))
+            mexPrintf("Warning: opts.nonneg is not of type logical (ignored).");
+        else
+            optso->nonneg = (bool) *mxGetLogicals(field);
+    }
+    
+    return optso;
+}
+
+// create mxArray from struct tval3_info
+template<class Type>
+mxArray *setInfo(tval3_info<Type> &info) {
+    const char *field_names[] = {"total_iters", "outer_iters", "rel_chg", "rel_error", "secs"};
+    
+    mxArray *output = mxCreateStructMatrix(1, 1, 5, field_names);
+    
+    mxArray *value;
+
+    value = mxCreateDoubleMatrix(1,1,mxREAL);
+    *mxGetPr(value)  = info.total_iters;
+    mxSetField(output, 0, "total_iters", value);
+
+    value = mxCreateDoubleMatrix(1,1,mxREAL);
+    *mxGetPr(value)  = info.outer_iters;
+    mxSetField(output, 0, "outer_iters", value);
+
+    value = mxCreateDoubleMatrix(1,1,mxREAL);
+    *mxGetPr(value)  = info.rel_chg;
+    mxSetField(output, 0, "rel_chg", value);
+
+    value = mxCreateDoubleMatrix(1,1,mxREAL);
+    *mxGetPr(value)  = info.rel_error;
+    mxSetField(output, 0, "rel_error", value);
+
+    value = mxCreateDoubleMatrix(1,1,mxREAL);
+    *mxGetPr(value)  = info.secs;
+    mxSetField(output, 0, "secs", value);
+    
+    return output;
+}
+
+// create mxArray from mat-object (row major)
+template<class Type>
+mxArray *setMatrix(const mat<Type> &mi) {
+
+    mwSize dims[] = {mi.dim_y, mi.dim_x, mi.dim_z};
+    mxArray *mo = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxREAL);
+    double *mo_data = mxGetPr(mo);
+    
+    
+    for(int z=0; z < mi.dim_z; z++) {
+        for(int y=0; y < mi.dim_y; y++) {
+            for(int x=0; x < mi.dim_x; x++) {
+                mo_data[x*mi.dim_y + y + z*mi.dim_x*mi.dim_y] = mi[x + y*mi.dim_x + z*mi.dim_x*mi.dim_y];
+            }
+        }
+    }
+    
+    return mo;
+}
+
+// create complex mxArray from mat-object (row major)
+template<class Type>
+mxArray *setCompMatrix(const mat<complex<Type> > &mi) {
+
+    mwSize dims[] = {mi.dim_y, mi.dim_x, mi.dim_z};
+    mxArray *mo = mxCreateNumericArray(3, dims, mxDOUBLE_CLASS, mxCOMPLEX);
+    double *mo_real = mxGetPr(mo);
+    double *mo_imag = mxGetPi(mo);
+    
+    for(int z=0; z < mi.dim_z; z++) {
+        for(int y=0; y < mi.dim_y; y++) {
+            for(int x=0; x < mi.dim_x; x++) {
+                mo_real[x*mi.dim_y + y + z*mi.dim_x*mi.dim_y] = mi[x + y*mi.dim_x + z*mi.dim_x*mi.dim_y].real();
+                mo_imag[x*mi.dim_y + y + z*mi.dim_x*mi.dim_y] = mi[x + y*mi.dim_x + z*mi.dim_x*mi.dim_y].imag();
+            }
+        }
+    }
+    return mo;
+}
+
+template<class T_mat, class T_scal>
+void call_tval3(const mxArray *A, const mxArray *b, const mxArray *Ut,
+        int ip, int iq, int ir, const mxArray *opts, mxArray **U, mxArray **info,
+        geometry *geo, int submat_mb, int virtual_procs_per_core)
+{
+    mat<T_mat> *mb = getMatrix<T_mat>(b);
+    mat<T_mat> mU(ip, iq, ir);
+    mat<T_mat> *mUt = getMatrix<T_mat>(Ut);
+    
+    tval3_options<T_scal> *options = getOptions<T_scal>(opts);
+    
+    tval3_info<T_scal> ti_info;
+    if(geo != NULL) {
+        ti_info = tval3(mU, *geo, *mb, *options, *mUt, submat_mb,
+                virtual_procs_per_core);
+    } else {
+        if(mxIsSparse(A)) {
+            sparse_mat<T_mat> *mA = getSparseMatrix<T_mat>(A);
+            ti_info = tval3(mU, *mA, *mb, *options, *mUt, submat_mb,
+                    virtual_procs_per_core);
+            delete mA;
+        } else {
+            mat<T_mat> *mA = getMatrix<T_mat>(A);
+            ti_info = tval3(mU, *mA, *mb, *options, *mUt, submat_mb,
+                    virtual_procs_per_core);
+            delete mA;
+        }
+    }
+                
+    *U = setMatrix(mU);
+    if(info != NULL) *info = setInfo(ti_info);
+    
+    delete mb; delete mUt; delete options;
+}
+
+template<class Type>
+void call_tval3_comp(const mxArray *A, const mxArray *b, const mxArray *Ut,
+        int ip, int iq, int ir, const mxArray *opts, mxArray **U, mxArray **info,
+        geometry *geo, int submat_mb, int virtual_procs_per_core)
+{
+    mat<complex<Type> > *mb = getCompMatrix<Type>(b);
+    mat<complex<Type> > mU(ip, iq, ir);
+    mat<complex<Type> > *mUt = getCompMatrix<Type>(Ut);
+           
+    tval3_options<Type> *options = getOptions<Type>(opts);
+
+    tval3_info<Type> ti_info;
+    
+    if(geo != NULL) {
+        ti_info = tval3(mU, *geo, *mb, *options, *mUt, submat_mb,
+                virtual_procs_per_core);
+    } else {
+        if(mxIsSparse(A)) {
+            sparse_mat<complex<Type> > *mA = getSparseCompMatrix<Type>(A);
+            ti_info = tval3(mU, *mA, *mb, *options, *mUt, submat_mb,
+                    virtual_procs_per_core);
+            delete mA;
+        } else {
+            mat<complex<Type> > *mA = getCompMatrix<Type>(A);
+            ti_info = tval3(mU, *mA, *mb, *options, *mUt, submat_mb,
+                    virtual_procs_per_core);
+            delete mA;
+        }
+    }
+                
+    *U = setCompMatrix(mU);
+    if(info != NULL) *info = setInfo(ti_info);
+    
+    delete mb; delete mUt; delete options;
+}
+
+// [U, out] = tval3cpp(A,b,p,q,opts,use_dbl)
+void TVALCPU(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
+
+    // check number of arguments...
+    if ((nrhs < 6) || (nrhs > 10))
+        mexErrMsgTxt("Wrong number of input arguments.");
+    if(nlhs > 2)
+        mexErrMsgTxt("Too many output arguments.");
+    
+    // assign input arguments...
+    const mxArray *A = prhs[0];
+    const mxArray *b = prhs[1];
+    const mxArray *p = prhs[2];
+    const mxArray *q = prhs[3];
+    const mxArray *r = prhs[4];
+    const mxArray *opts = prhs[5];
+    const mxArray *use_dbl = (nrhs < 7) ? NULL : prhs [6];
+    const mxArray *Ut = mxGetField(opts, 0, "Ut");
+    const mxArray *submat_mb = (nrhs < 8) ? NULL : prhs[7];
+    const mxArray *virtual_procs_per_core = (nrhs < 9) ? NULL : prhs[8];
+    
+    // assign output arguments...
+    mxArray **U = &plhs[0];
+    mxArray **info = (nlhs < 2) ? NULL : &plhs[1];
+    
+    // check data types and assign variables...
+    geometry *geo = NULL;
+    
+    if(!mxIsStruct(A) && !mxIsSingle(A) && !mxIsDouble(A))
+        mexErrMsgTxt("A must be single, double or struct.");
+    if(!mxIsSparse(A) && !mxIsStruct(A) && mxGetClassID(A) != mxGetClassID(b))
+        mexErrMsgTxt("A and b must be of the same type when using dense matrices.");
+    if(!mxIsSingle(b) && !mxIsDouble(b))
+        mexErrMsgTxt("b must be single or double.");
+    if((Ut != NULL) && (mxGetClassID(Ut) != mxGetClassID(b)))
+        mexErrMsgTxt("opts.Ut must be of same type as b.");
+    if(!mxIsDouble(p))
+        mexErrMsgTxt("p must be of type double.");
+    if(!mxIsDouble(q))
+        mexErrMsgTxt("q must be of type double.");
+    if(!mxIsDouble(r))
+        mexErrMsgTxt("r must be of type double.");
+    if((use_dbl != NULL) && (!mxIsLogical(use_dbl)))
+        mexErrMsgTxt("use_dbl is not a logical value.");
+    if((submat_mb != NULL) && (!mxIsDouble(submat_mb)))
+        mexErrMsgTxt("submat_mb must be of type double.");
+    if((virtual_procs_per_core != NULL) && (!mxIsDouble(virtual_procs_per_core)))
+        mexErrMsgTxt("virtual_procs_per_core must be of type double.");
+    
+    bool b_use_dbl = (use_dbl == NULL) ? false : (bool) mxGetLogicals(use_dbl)[0]; 
+    int ip = (int)(*mxGetPr(p));
+    int iq = (int)(*mxGetPr(q));
+    int ir = (int)(*mxGetPr(r));
+    int isubmat_mb = (submat_mb == NULL) ? 5 : (int)(*mxGetPr(submat_mb));
+    int ivirtual_procs_per_core = (virtual_procs_per_core == NULL) ? 2 :
+        (int)(*mxGetPr(virtual_procs_per_core));
+        
+    if(mxIsStruct(A)) {
+        geo = get_geometry(A);
+    }
+    
+    // call tval3()...
+    try{
+        if(!mxIsComplex(A) && !mxIsComplex(b)) { // scalar matrices...
+            if(mxIsSingle(b)) {
+                if(b_use_dbl) {
+                    call_tval3<float, double>(A, b, Ut, ip, iq, ir, opts, U,
+                            info, geo, isubmat_mb, ivirtual_procs_per_core);
+                } else {
+                    call_tval3<float, float>(A, b, Ut, ip, iq, ir, opts, U,
+                            info, geo, isubmat_mb, ivirtual_procs_per_core);
+                }
+            } else {
+                call_tval3<double, double>(A, b, Ut, ip, iq, ir, opts, U,
+                        info, geo, isubmat_mb, ivirtual_procs_per_core);
+            }
+        } else { // complex matrices...
+            if(mxIsSingle(b)) {
+               call_tval3_comp<float>(A, b, Ut, ip, iq, ir, opts, U, info,
+                       geo, isubmat_mb, ivirtual_procs_per_core);
+            } else {
+                call_tval3_comp<double>(A, b, Ut, ip, iq, ir, opts, U, info,
+                        geo, isubmat_mb, ivirtual_procs_per_core);
+            }
+        }
+    } catch(const std::exception &ex) {
+        mexErrMsgTxt(ex.what());
+    }
+    if(geo != NULL) delete geo;
+}
--- a/TVALCPU/src/tval3cpp3d.h
+++ b/TVALCPU/src/tval3cpp3d.h
@@ -0,0 +1,4 @@
+#include <mex.h>
+extern "C"{
+    void TVALCPU(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) ;
+}
--- a/TVALCPU/src/utility.h
+++ b/TVALCPU/src/utility.h
@@ -0,0 +1,45 @@
+/*
+ * utility.h
+ *
+ *  Created on: Mar 22, 2011
+ *      Author: ditlevsen
+ */
+
+#ifndef UTILITY_H_
+#define UTILITY_H_
+
+/**********************************************************************************/
+/* functions, etc. for handling different data types in mathematical expressions  */
+/**********************************************************************************/
+
+// overloaded versions of conj() and real() for scalar values...
+// (not needed when compiled with gcc and -std_c++0x)
+
+template <class Type>
+inline Type conj(Type f) {
+	return f;
+}
+
+template<class Type>
+inline Type real(Type f) {
+	return f;
+}
+
+template<class Type>
+inline Type imag(Type f) {
+	return 0;
+}
+
+
+// sign function
+
+template<class T_comp, class T_scal>
+inline T_comp sign(T_comp x) {
+	T_scal ax = std::abs(x);
+	if(ax > 0)
+		return x/ax;
+	else
+		return 0;
+}
+
+#endif /* UTILITY_H_ */