URDepends/TVALGPU/include/mat_vec_mul.h

/*
 * mat_vec_mul.h
 *
 *  Created on: Jul 13, 2011
 *      Author: ditlevsen
 */
#ifndef MAT_VEC_MUL_H_
#define MAT_VEC_MUL_H_

struct sparse_mm {
	sparse_mat_device A_csc;
	sparse_mat_device A_csr;
	cusparseHandle_t cs_handle;
	cusparseMatDescr_t descrA;
	sparse_mm(const sparse_mat_host &A, cusparseHandle_t handle, cusparseMatDescr_t descriptor) :
		A_csc(A), A_csr(A.dim_y, A.dim_x, A.nnz, sparse_mat_csr, false), cs_handle(handle),
		descrA(descriptor)
	{
		
		
		/* Old CUDA 4.0 implementation for reference
		HANDLE_ERROR(cusparseScsr2csc(cs_handle, A.dim_x, A.dim_y, A_csc.val(),
				A_csc.ptr(), A_csc.ind(), A_csr.val(), A_csr.ind(),
				A_csr.ptr(), 1, CUSPARSE_INDEX_BASE_ZERO));
		*/
		
		/* CUDA 10.1 adaption (Feb. 2021)
		cusparseStatus_t cusparseScsr2csc(cusparseHandle_t    handle,
                 int                 m,
                 int                 n,
                 int                 nnz,
                 const float*        csrVal,
                 const int*          csrRowPtr,
                 const int*          csrColInd,
                 float*              cscVal,
                 int*                cscRowInd,
                 int*                cscColPtr,
                 cusparseAction_t    copyValues,
                 cusparseIndexBase_t idxBase)

		*/ 
		
		HANDLE_ERROR(cusparseScsr2csc(cs_handle,      
					A.dim_x, 		// m
					A.dim_y,		// n 
					A.nnz, 			// nnz
					A_csc.val(),	// csrVal
					A_csc.ptr(), 	// csrRowPtr
					A_csc.ind(), 	// csrColInd
					A_csr.val(), 	// cscVal
					A_csr.ind(),	// cscRowInd
					A_csr.ptr(), 	// cscColPtr
					CUSPARSE_ACTION_NUMERIC, // copyValues
					CUSPARSE_INDEX_BASE_ZERO)); // idxBase
		
		
		/*  TO DO: adaption to CUDA 11. Above function was replaced with the following function
		
		cusparseStatus_t cusparseCsr2cscEx2(cusparseHandle_t     handle,
                   int                  m,
                   int                  n,
                   int                  nnz,
                   const void*          csrVal,
                   const int*           csrRowPtr,
                   const int*           csrColInd,
                   void*                cscVal,
                   int*                 cscColPtr,
                   int*                 cscRowInd,
                   cudaDataType         valType,
                   cusparseAction_t     copyValues,
                   cusparseIndexBase_t  idxBase,
                   cusparseCsr2CscAlg_t alg,
                   void*                buffer)
								
		*/
										
	};
};

struct dense_mm {
	mat_device A;
	cublasHandle_t cb_handle;
	dense_mm(const mat_host &A_host, cublasHandle_t handle) : A(A_host, true), cb_handle(handle) {};
};

inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A, const float *x, float *y,
		cudaStream_t stream=0)
{
	int n, m;
	const sparse_mat_device *mA;
	const float alpha = 1;
    const float beta = 0;
	

	if(transA == CUBLAS_OP_N) {
		n = A.A_csr.dim_x;
		m = A.A_csr.dim_y;
		mA = &A.A_csr;
	} else {
		n = A.A_csr.dim_y;
		m = A.A_csr.dim_x;
		mA = &A.A_csc; // implicit transponation...
	}

	/* Old CUDA 4 implementation 
	HANDLE_ERROR(cusparseSetKernelStream(A.cs_handle, stream));  // one problem here
	*/
	

    /* CUDA 10.1 adaption (Feb. 2021)*/
	HANDLE_ERROR(cusparseSetStream(A.cs_handle, stream)); 
    
	
	/* Old CUDA 4 implementation 
	return cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, 1, A.descrA, mA->val(),mA->ptr(), mA->ind(), x, 0, y);
	*/


	/* CUDA 10.1  adaption (Feb. 2021)
	cusparseStatus_tcusparseScsrmv(cusparseHandle_t handle,
							cusparseOperation_t transA,
							int m,
							int n,
							int nnz, 
							const float* alpha,
							const cusparseMatDescr_t descrA,
							const float* csrValA,
							const int* csrRowPtrA, 
							const int* csrColIndA,
							const float* x, 
							const float* beta,
							float* y);
	
	*/
    
    cusparseStatus_t status;
	status = cusparseScsrmv(A.cs_handle, 
						  CUSPARSE_OPERATION_NON_TRANSPOSE, 
						  m, 
						  n, 
						  A.A_csr.nnz,
						  &alpha,  
						  A.descrA, 
						  mA->val(),
						  mA->ptr(), 
						  mA->ind(), 
						  x, 
						  &beta, 
						  y); 
     
    return status;
}

inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const dense_mm &A, const float *x, float *y,
		cudaStream_t stream=0) {
	int n = A.A.dim_x;
	int m = A.A.dim_y;
	cublasStatus_t status;

	float alpha=1, beta=0;

	HANDLE_ERROR(cublasSetStream(A.cb_handle, stream));

	status =  cublasSgemv(A.cb_handle, trans, m, n, &alpha, A.A.data_dev_ptr(), A.A.leading_dim(), x, 1, &beta, y, 1);

	HANDLE_ERROR(cublasSetStream(A.cb_handle, 0));

	return status;
}

__device__ float atomicAdd_cc1(float* address, float val) {
	int* address_as_int = (int*)address;
	int old = *address_as_int, assumed;
	do {
		assumed = old;
		old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
	} while (assumed != old);
	return __int_as_float(old);
}

// To do: atomic operations...!!
__device__ void mult_element(cublasOperation_t transA, int ray, int x_pixel, int y_pixel, int z_pixel, int dim_x, int dim_y, float scale_factor, const float *x, float *y) {
	int lin_index = x_pixel*dim_y + y_pixel + z_pixel*dim_x*dim_y;
	if(transA == CUBLAS_OP_N) {
		y[ray] += x[lin_index] * scale_factor;
	} else {
#if __CUDA_ARCH__ < 200
		atomicAdd_cc1(y + lin_index, x[ray] * scale_factor);
#else
		atomicAdd(y + lin_index, x[ray] * scale_factor);
#endif
		//y[lin_index] += x[ray] * scale_factor;
	}
}


__global__ void dyn_calc_kernel(cublasOperation_t transA, int num_emitters, int num_receivers, int rv_x, int rv_y, int rv_z,
		float scale_factor, const int *x_receivers, const int *y_receivers, const int *z_receivers, const int *x_emitters,
		const int *y_emitters, const int *z_emitters, const float *x, float *y) {

	int index_r, index_e, inc_r, inc_e;
	int d1, d2, d3, inc1, m1, inc2, m2, inc3, m3, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
	int *pixel1, *pixel2, *pixel3;

	// different assignment of paths to threads for multiplication with or without transposition
	// (performance reasons)
	// no transposition: x-> receivers, y-> emitters
	// transposition: x-> emitters, y-> receivers
	if(transA == CUBLAS_OP_N) {
		index_r = threadIdx.x + blockIdx.x * blockDim.x;
		index_e = threadIdx.y + blockIdx.y * blockDim.y;
		inc_r = blockDim.x * gridDim.x;
		inc_e = blockDim.y * gridDim.y;
	} else {
		index_e = threadIdx.x + blockIdx.x * blockDim.x;
		index_r = threadIdx.y + blockIdx.y * blockDim.y;
		inc_e = blockDim.x * gridDim.x;
		inc_r = blockDim.y * gridDim.y;
	}

	for(int receiver=index_r; receiver < num_receivers; receiver+= inc_r) {
		for(int emitter=index_e; emitter < num_emitters; emitter+= inc_e) {

			ray = emitter*num_receivers + receiver;

			// trace path using Bresenham algorithm...

			x_pixel = x_emitters[emitter];
			y_pixel = y_emitters[emitter];
			z_pixel = z_emitters[emitter];

			mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);

			d1 = x_receivers[receiver] - x_pixel;
			d2 = y_receivers[receiver] - y_pixel;
			d3 = z_receivers[receiver] - z_pixel;

			m1 = abs(d1);
			m2 = abs(d2);
			m3 = abs(d3);

			if (m1 >= m2 && m1 >= m3) {
				pixel1 = &x_pixel;
				pixel2 = &y_pixel;
				pixel3 = &z_pixel;
			} else if(m2 >= m1 && m2 >= m3) {
				int tmp = d1;
				d1 = d2;
				d2 = tmp;

				pixel1 = &y_pixel;
				pixel2 = &x_pixel;
				pixel3 = &z_pixel;
			} else {
				int tmp = d1;
				d1 = d3;
				d3 = d2;
				d2 = tmp;

				pixel1 = &z_pixel;
				pixel2 = &x_pixel;
				pixel3 = &y_pixel;
			}

			inc1 = (d1 < 0) ? -1 : 1;
			inc2 = (d2 < 0) ? -1 : 1;
			inc3 = (d3 < 0) ? -1 : 1;

			m1 = abs(d1);
			m2 = abs(d2);
			m3 = abs(d3);

			err_1 = 2 * m2 - m1;
			err_2 = 2 * m3 - m1;
			for(int j = 1; j < m1+1; j++) {
				if (err_1 > 0) {
					*pixel2 += inc2;
					err_1 -= 2 * m1;
				}
				if (err_2 > 0) {
					*pixel3 += inc3;
					err_2 -= 2 * m1;
				}
				err_1 += 2 * m2;
				err_2 += 2 * m3;
				*pixel1 += inc1;
				mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);
			}
		}
	}
}

// matrix-vector multiplication with dynamic calculation of the measurement matrix...
inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const geometry_device &A, const float *x, float *y,
		cudaStream_t stream=0)
{
	int len_y = (trans == CUBLAS_OP_N) ? A.num_emitters * A.num_receivers : A.rv_x * A.rv_y * A.rv_z;
	cudaMemset(y, 0, len_y * sizeof(float));
	dim3 threads, blocks;

	// recht Willkürliche Dimensionierung (Performance ca. 10 % schlechter, als mit jeweils bester gefundener Dimensionierung
	threads.x = 64;
	threads.y = 4;
	if(trans == CUBLAS_OP_N) {
		blocks.x = max(A.num_receivers / threads.x, 1);
		blocks.y = max(A.num_emitters / threads.x, 1);
	} else {
		blocks.y = max(A.num_receivers / threads.x, 1);
		blocks.x = max(A.num_emitters / threads.x, 1);
	}

	// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 64x64x64:
/*	if(trans == CUBLAS_OP_N) {
		threads.x = 32;
		threads.y = 8;
		blocks.x = 40;
		blocks.y = 78;
	} else {
		threads.x = 128;
		threads.y = 2;
		blocks.x = 1;
		blocks.y =16;
	}*/

	// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 128x128x128:
/*	if(trans == CUBLAS_OP_N) {
		threads.x = 64;
		threads.y = 4;
		blocks.x = 22;
		blocks.y = 152;
	} else {
		threads.x = 64;
		threads.y = 8;
		blocks.x = 8;
		blocks.y = 26;
	}*/

	dyn_calc_kernel<<<blocks, threads, 0, stream>>>(trans, A.num_emitters, A.num_receivers, A.rv_x, A.rv_y, A.rv_z,
			A.scale_factor, A.x_re_dev_ptr(), A.y_re_dev_ptr(), A.z_re_dev_ptr(), A.x_em_dev_ptr(), A.y_em_dev_ptr(),
			A.z_em_dev_ptr(), x, y);
	return CUBLAS_STATUS_SUCCESS;

}


#endif /* MAT_VEC_MUL_H_ */
Commit source 2023-05-18 16:04:27 +08:00			`/*`
			`* mat_vec_mul.h`
			`*`
			`* Created on: Jul 13, 2011`
			`* Author: ditlevsen`
			`*/`
			`#ifndef MAT_VEC_MUL_H_`
			`#define MAT_VEC_MUL_H_`

			`struct sparse_mm {`
			`sparse_mat_device A_csc;`
			`sparse_mat_device A_csr;`
			`cusparseHandle_t cs_handle;`
			`cusparseMatDescr_t descrA;`
			`sparse_mm(const sparse_mat_host &A, cusparseHandle_t handle, cusparseMatDescr_t descriptor) :`
			`A_csc(A), A_csr(A.dim_y, A.dim_x, A.nnz, sparse_mat_csr, false), cs_handle(handle),`
			`descrA(descriptor)`
			`{`


			`/* Old CUDA 4.0 implementation for reference`
			`HANDLE_ERROR(cusparseScsr2csc(cs_handle, A.dim_x, A.dim_y, A_csc.val(),`
			`A_csc.ptr(), A_csc.ind(), A_csr.val(), A_csr.ind(),`
			`A_csr.ptr(), 1, CUSPARSE_INDEX_BASE_ZERO));`
			`*/`

			`/* CUDA 10.1 adaption (Feb. 2021)`
			`cusparseStatus_t cusparseScsr2csc(cusparseHandle_t handle,`
			`int m,`
			`int n,`
			`int nnz,`
			`const float* csrVal,`
			`const int* csrRowPtr,`
			`const int* csrColInd,`
			`float* cscVal,`
			`int* cscRowInd,`
			`int* cscColPtr,`
			`cusparseAction_t copyValues,`
			`cusparseIndexBase_t idxBase)`

			`*/`

			`HANDLE_ERROR(cusparseScsr2csc(cs_handle,`
			`A.dim_x, // m`
			`A.dim_y, // n`
			`A.nnz, // nnz`
			`A_csc.val(), // csrVal`
			`A_csc.ptr(), // csrRowPtr`
			`A_csc.ind(), // csrColInd`
			`A_csr.val(), // cscVal`
			`A_csr.ind(), // cscRowInd`
			`A_csr.ptr(), // cscColPtr`
			`CUSPARSE_ACTION_NUMERIC, // copyValues`
			`CUSPARSE_INDEX_BASE_ZERO)); // idxBase`




			`/* TO DO: adaption to CUDA 11. Above function was replaced with the following function`

			`cusparseStatus_t cusparseCsr2cscEx2(cusparseHandle_t handle,`
			`int m,`
			`int n,`
			`int nnz,`
			`const void* csrVal,`
			`const int* csrRowPtr,`
			`const int* csrColInd,`
			`void* cscVal,`
			`int* cscColPtr,`
			`int* cscRowInd,`
			`cudaDataType valType,`
			`cusparseAction_t copyValues,`
			`cusparseIndexBase_t idxBase,`
			`cusparseCsr2CscAlg_t alg,`
			`void* buffer)`

			`*/`

			`};`
			`};`

			`struct dense_mm {`
			`mat_device A;`
			`cublasHandle_t cb_handle;`
			`dense_mm(const mat_host &A_host, cublasHandle_t handle) : A(A_host, true), cb_handle(handle) {};`
			`};`

			`inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A, const float x, float y,`
			`cudaStream_t stream=0)`
			`{`
			`int n, m;`
			`const sparse_mat_device *mA;`
			`const float alpha = 1;`
			`const float beta = 0;`


			`if(transA == CUBLAS_OP_N) {`
			`n = A.A_csr.dim_x;`
			`m = A.A_csr.dim_y;`
			`mA = &A.A_csr;`
			`} else {`
			`n = A.A_csr.dim_y;`
			`m = A.A_csr.dim_x;`
			`mA = &A.A_csc; // implicit transponation...`
			`}`

			`/* Old CUDA 4 implementation`
			`HANDLE_ERROR(cusparseSetKernelStream(A.cs_handle, stream)); // one problem here`
			`*/`


			`/* CUDA 10.1 adaption (Feb. 2021)*/`
			`HANDLE_ERROR(cusparseSetStream(A.cs_handle, stream));`


			`/* Old CUDA 4 implementation`
			`return cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, 1, A.descrA, mA->val(),mA->ptr(), mA->ind(), x, 0, y);`
			`*/`



			`/* CUDA 10.1 adaption (Feb. 2021)`
			`cusparseStatus_tcusparseScsrmv(cusparseHandle_t handle,`
			`cusparseOperation_t transA,`
			`int m,`
			`int n,`
			`int nnz,`
			`const float* alpha,`
			`const cusparseMatDescr_t descrA,`
			`const float* csrValA,`
			`const int* csrRowPtrA,`
			`const int* csrColIndA,`
			`const float* x,`
			`const float* beta,`
			`float* y);`

			`*/`

			`cusparseStatus_t status;`
			`status = cusparseScsrmv(A.cs_handle,`
			`CUSPARSE_OPERATION_NON_TRANSPOSE,`
			`m,`
			`n,`
			`A.A_csr.nnz,`
			`&alpha,`
			`A.descrA,`
			`mA->val(),`
			`mA->ptr(),`
			`mA->ind(),`
			`x,`
			`&beta,`
			`y);`

			`return status;`
			`}`

			`inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const dense_mm &A, const float x, float y,`
			`cudaStream_t stream=0) {`
			`int n = A.A.dim_x;`
			`int m = A.A.dim_y;`
			`cublasStatus_t status;`

			`float alpha=1, beta=0;`

			`HANDLE_ERROR(cublasSetStream(A.cb_handle, stream));`

			`status = cublasSgemv(A.cb_handle, trans, m, n, &alpha, A.A.data_dev_ptr(), A.A.leading_dim(), x, 1, &beta, y, 1);`

			`HANDLE_ERROR(cublasSetStream(A.cb_handle, 0));`

			`return status;`
			`}`

			`__device__ float atomicAdd_cc1(float* address, float val) {`
			`int* address_as_int = (int*)address;`
			`int old = *address_as_int, assumed;`
			`do {`
			`assumed = old;`
			`old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));`
			`} while (assumed != old);`
			`return __int_as_float(old);`
			`}`

			`// To do: atomic operations...!!`
			`__device__ void mult_element(cublasOperation_t transA, int ray, int x_pixel, int y_pixel, int z_pixel, int dim_x, int dim_y, float scale_factor, const float x, float y) {`
			`int lin_index = x_pixeldim_y + y_pixel + z_pixeldim_x*dim_y;`
			`if(transA == CUBLAS_OP_N) {`
			`y[ray] += x[lin_index] * scale_factor;`
			`} else {`
			`#if __CUDA_ARCH__ < 200`
			`atomicAdd_cc1(y + lin_index, x[ray] * scale_factor);`
			`#else`
			`atomicAdd(y + lin_index, x[ray] * scale_factor);`
			`#endif`
			`//y[lin_index] += x[ray] * scale_factor;`
			`}`
			`}`


			`__global__ void dyn_calc_kernel(cublasOperation_t transA, int num_emitters, int num_receivers, int rv_x, int rv_y, int rv_z,`
			`float scale_factor, const int x_receivers, const int y_receivers, const int z_receivers, const int x_emitters,`
			`const int y_emitters, const int z_emitters, const float x, float y) {`

			`int index_r, index_e, inc_r, inc_e;`
			`int d1, d2, d3, inc1, m1, inc2, m2, inc3, m3, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;`
			`int pixel1, pixel2, *pixel3;`

			`// different assignment of paths to threads for multiplication with or without transposition`
			`// (performance reasons)`
			`// no transposition: x-> receivers, y-> emitters`
			`// transposition: x-> emitters, y-> receivers`
			`if(transA == CUBLAS_OP_N) {`
			`index_r = threadIdx.x + blockIdx.x * blockDim.x;`
			`index_e = threadIdx.y + blockIdx.y * blockDim.y;`
			`inc_r = blockDim.x * gridDim.x;`
			`inc_e = blockDim.y * gridDim.y;`
			`} else {`
			`index_e = threadIdx.x + blockIdx.x * blockDim.x;`
			`index_r = threadIdx.y + blockIdx.y * blockDim.y;`
			`inc_e = blockDim.x * gridDim.x;`
			`inc_r = blockDim.y * gridDim.y;`
			`}`

			`for(int receiver=index_r; receiver < num_receivers; receiver+= inc_r) {`
			`for(int emitter=index_e; emitter < num_emitters; emitter+= inc_e) {`

			`ray = emitter*num_receivers + receiver;`

			`// trace path using Bresenham algorithm...`

			`x_pixel = x_emitters[emitter];`
			`y_pixel = y_emitters[emitter];`
			`z_pixel = z_emitters[emitter];`

			`mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);`

			`d1 = x_receivers[receiver] - x_pixel;`
			`d2 = y_receivers[receiver] - y_pixel;`
			`d3 = z_receivers[receiver] - z_pixel;`

			`m1 = abs(d1);`
			`m2 = abs(d2);`
			`m3 = abs(d3);`

			`if (m1 >= m2 && m1 >= m3) {`
			`pixel1 = &x_pixel;`
			`pixel2 = &y_pixel;`
			`pixel3 = &z_pixel;`
			`} else if(m2 >= m1 && m2 >= m3) {`
			`int tmp = d1;`
			`d1 = d2;`
			`d2 = tmp;`

			`pixel1 = &y_pixel;`
			`pixel2 = &x_pixel;`
			`pixel3 = &z_pixel;`
			`} else {`
			`int tmp = d1;`
			`d1 = d3;`
			`d3 = d2;`
			`d2 = tmp;`

			`pixel1 = &z_pixel;`
			`pixel2 = &x_pixel;`
			`pixel3 = &y_pixel;`
			`}`

			`inc1 = (d1 < 0) ? -1 : 1;`
			`inc2 = (d2 < 0) ? -1 : 1;`
			`inc3 = (d3 < 0) ? -1 : 1;`

			`m1 = abs(d1);`
			`m2 = abs(d2);`
			`m3 = abs(d3);`

			`err_1 = 2 * m2 - m1;`
			`err_2 = 2 * m3 - m1;`
			`for(int j = 1; j < m1+1; j++) {`
			`if (err_1 > 0) {`
			`*pixel2 += inc2;`
			`err_1 -= 2 * m1;`
			`}`
			`if (err_2 > 0) {`
			`*pixel3 += inc3;`
			`err_2 -= 2 * m1;`
			`}`
			`err_1 += 2 * m2;`
			`err_2 += 2 * m3;`
			`*pixel1 += inc1;`
			`mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);`
			`}`
			`}`
			`}`
			`}`

			`// matrix-vector multiplication with dynamic calculation of the measurement matrix...`
			`inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const geometry_device &A, const float x, float y,`
			`cudaStream_t stream=0)`
			`{`
			`int len_y = (trans == CUBLAS_OP_N) ? A.num_emitters * A.num_receivers : A.rv_x * A.rv_y * A.rv_z;`
			`cudaMemset(y, 0, len_y * sizeof(float));`
			`dim3 threads, blocks;`

			`// recht Willkürliche Dimensionierung (Performance ca. 10 % schlechter, als mit jeweils bester gefundener Dimensionierung`
			`threads.x = 64;`
			`threads.y = 4;`
			`if(trans == CUBLAS_OP_N) {`
			`blocks.x = max(A.num_receivers / threads.x, 1);`
			`blocks.y = max(A.num_emitters / threads.x, 1);`
			`} else {`
			`blocks.y = max(A.num_receivers / threads.x, 1);`
			`blocks.x = max(A.num_emitters / threads.x, 1);`
			`}`

			`// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 64x64x64:`
			`/* if(trans == CUBLAS_OP_N) {`
			`threads.x = 32;`
			`threads.y = 8;`
			`blocks.x = 40;`
			`blocks.y = 78;`
			`} else {`
			`threads.x = 128;`
			`threads.y = 2;`
			`blocks.x = 1;`
			`blocks.y =16;`
			`}*/`

			`// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 128x128x128:`
			`/* if(trans == CUBLAS_OP_N) {`
			`threads.x = 64;`
			`threads.y = 4;`
			`blocks.x = 22;`
			`blocks.y = 152;`
			`} else {`
			`threads.x = 64;`
			`threads.y = 8;`
			`blocks.x = 8;`
			`blocks.y = 26;`
			`}*/`

			`dyn_calc_kernel<<<blocks, threads, 0, stream>>>(trans, A.num_emitters, A.num_receivers, A.rv_x, A.rv_y, A.rv_z,`
			`A.scale_factor, A.x_re_dev_ptr(), A.y_re_dev_ptr(), A.z_re_dev_ptr(), A.x_em_dev_ptr(), A.y_em_dev_ptr(),`
			`A.z_em_dev_ptr(), x, y);`
			`return CUBLAS_STATUS_SUCCESS;`

			`}`



			`#endif /* MAT_VEC_MUL_H_ */`