/*
 * mat_vec_mul.h
 *
 *  Created on: Jul 13, 2011
 *      Author: ditlevsen
 */

#ifndef MAT_VEC_MUL_H_
#define MAT_VEC_MUL_H_

#include <cublas_v2.h>
#include <cusparse.h>

// Note: sparse_mat_host/sparse_mat_device, mat_host/mat_device, geometry_device and
// HANDLE_ERROR are expected from the project's other headers.

struct sparse_mm
{
    sparse_mat_device A_csc;
    sparse_mat_device A_csr;
    cusparseHandle_t cs_handle;
    cusparseMatDescr_t descrA;

    sparse_mm(const sparse_mat_host &A, cusparseHandle_t handle, cusparseMatDescr_t descriptor) :
        A_csc(A), A_csr(A.dim_y, A.dim_x, A.nnz, sparse_mat_csr, false),
        cs_handle(handle), descrA(descriptor)
    {
        /* Old CUDA 4.0 implementation for reference
        HANDLE_ERROR(cusparseScsr2csc(cs_handle, A.dim_x, A.dim_y,
                                      A_csc.val(), A_csc.ptr(), A_csc.ind(),
                                      A_csr.val(), A_csr.ind(), A_csr.ptr(),
                                      1, CUSPARSE_INDEX_BASE_ZERO));
        */

        /* CUDA 10.1 adaption (Feb. 2021)
        cusparseStatus_t cusparseScsr2csc(cusparseHandle_t handle, int m, int n, int nnz,
                                          const float* csrVal, const int* csrRowPtr, const int* csrColInd,
                                          float* cscVal, int* cscRowInd, int* cscColPtr,
                                          cusparseAction_t copyValues, cusparseIndexBase_t idxBase)
        */
        HANDLE_ERROR(cusparseScsr2csc(cs_handle,
                                      A.dim_x,                    // m
                                      A.dim_y,                    // n
                                      A.nnz,                      // nnz
                                      A_csc.val(),                // csrVal
                                      A_csc.ptr(),                // csrRowPtr
                                      A_csc.ind(),                // csrColInd
                                      A_csr.val(),                // cscVal
                                      A_csr.ind(),                // cscRowInd
                                      A_csr.ptr(),                // cscColPtr
                                      CUSPARSE_ACTION_NUMERIC,    // copyValues
                                      CUSPARSE_INDEX_BASE_ZERO)); // idxBase

        /* TO DO: adaption to CUDA 11. The above function was replaced by
        cusparseStatus_t cusparseCsr2cscEx2(cusparseHandle_t handle, int m, int n, int nnz,
                                            const void* csrVal, const int* csrRowPtr, const int* csrColInd,
                                            void* cscVal, int* cscColPtr, int* cscRowInd,
                                            cudaDataType valType, cusparseAction_t copyValues,
                                            cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg,
                                            void* buffer)
        */
    }
};
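
/*
 * Possible CUDA 11 adaptation for the TO DO above (untested sketch, not called
 * anywhere in this file): cusparseScsr2csc was removed in CUDA 11 and replaced by
 * cusparseCsr2cscEx2, which additionally needs an explicit work buffer whose size
 * is queried with cusparseCsr2cscEx2_bufferSize. The helper name and its argument
 * order (mirroring the call in the sparse_mm constructor) are illustrative only.
 */
static inline void csr2csc_cuda11_sketch(cusparseHandle_t handle, int m, int n, int nnz,
                                         const float *csrVal, const int *csrRowPtr, const int *csrColInd,
                                         float *cscVal, int *cscRowInd, int *cscColPtr)
{
    size_t buffer_size = 0;
    void *buffer = 0;

    // query the size of the temporary work buffer required by the conversion
    HANDLE_ERROR(cusparseCsr2cscEx2_bufferSize(handle, m, n, nnz,
                                               csrVal, csrRowPtr, csrColInd,
                                               cscVal, cscColPtr, cscRowInd,
                                               CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
                                               CUSPARSE_INDEX_BASE_ZERO,
                                               CUSPARSE_CSR2CSC_ALG1, &buffer_size));
    cudaMalloc(&buffer, buffer_size);

    // actual CSR -> CSC conversion; note that cscColPtr now precedes cscRowInd,
    // unlike in the old cusparseScsr2csc parameter order
    HANDLE_ERROR(cusparseCsr2cscEx2(handle, m, n, nnz,
                                    csrVal, csrRowPtr, csrColInd,
                                    cscVal, cscColPtr, cscRowInd,
                                    CUDA_R_32F, CUSPARSE_ACTION_NUMERIC,
                                    CUSPARSE_INDEX_BASE_ZERO,
                                    CUSPARSE_CSR2CSC_ALG1, buffer));
    cudaFree(buffer);
}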

struct dense_mm
{
    mat_device A;
    cublasHandle_t cb_handle;

    dense_mm(const mat_host &A_host, cublasHandle_t handle) : A(A_host, true), cb_handle(handle) {}
};

inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A,
                                    const float *x, float *y, cudaStream_t stream=0)
{
    int n, m;
    const sparse_mat_device *mA;
    const float alpha = 1;
    const float beta = 0;

    if(transA == CUBLAS_OP_N)
    {
        n = A.A_csr.dim_x;
        m = A.A_csr.dim_y;
        mA = &A.A_csr;
    }
    else
    {
        n = A.A_csr.dim_y;
        m = A.A_csr.dim_x;
        mA = &A.A_csc; // implicit transposition...
    }

    /* Old CUDA 4 implementation
    HANDLE_ERROR(cusparseSetKernelStream(A.cs_handle, stream)); // one problem here
    */
    /* CUDA 10.1 adaption (Feb. 2021) */
    HANDLE_ERROR(cusparseSetStream(A.cs_handle, stream));

    /* Old CUDA 4 implementation
    return cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, 1, A.descrA,
                          mA->val(), mA->ptr(), mA->ind(), x, 0, y);
    */

    /* CUDA 10.1 adaption (Feb. 2021)
    cusparseStatus_t cusparseScsrmv(cusparseHandle_t handle, cusparseOperation_t transA,
                                    int m, int n, int nnz, const float* alpha,
                                    const cusparseMatDescr_t descrA, const float* csrValA,
                                    const int* csrRowPtrA, const int* csrColIndA,
                                    const float* x, const float* beta, float* y);
    */
    cusparseStatus_t status;
    status = cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, A.A_csr.nnz,
                            &alpha, A.descrA, mA->val(), mA->ptr(), mA->ind(), x, &beta, y);
    return status;
}

inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const dense_mm &A,
                                  const float *x, float *y, cudaStream_t stream=0)
{
    int n = A.A.dim_x;
    int m = A.A.dim_y;
    cublasStatus_t status;
    float alpha = 1, beta = 0;

    HANDLE_ERROR(cublasSetStream(A.cb_handle, stream));
    status = cublasSgemv(A.cb_handle, trans, m, n, &alpha, A.A.data_dev_ptr(), A.A.leading_dim(),
                         x, 1, &beta, y, 1);
    HANDLE_ERROR(cublasSetStream(A.cb_handle, 0));

    return status;
}

// Atomic float add via compare-and-swap for compute capability < 2.0,
// where hardware atomicAdd(float*, float) is not available.
__device__ float atomicAdd_cc1(float* address, float val)
{
    int* address_as_int = (int*)address;
    int old = *address_as_int, assumed;

    do
    {
        assumed = old;
        old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
    } while (assumed != old);

    return __int_as_float(old);
}

// To do: atomic operations...!!
__device__ void mult_element(cublasOperation_t transA, int ray, int x_pixel, int y_pixel, int z_pixel,
                             int dim_x, int dim_y, float scale_factor, const float *x, float *y)
{
    int lin_index = x_pixel*dim_y + y_pixel + z_pixel*dim_x*dim_y;

    if(transA == CUBLAS_OP_N)
    {
        y[ray] += x[lin_index] * scale_factor;
    }
    else
    {
#if __CUDA_ARCH__ < 200
        atomicAdd_cc1(y + lin_index, x[ray] * scale_factor);
#else
        atomicAdd(y + lin_index, x[ray] * scale_factor);
#endif
        //y[lin_index] += x[ray] * scale_factor;
    }
}

__global__ void dyn_calc_kernel(cublasOperation_t transA, int num_emitters, int num_receivers,
                                int rv_x, int rv_y, int rv_z, float scale_factor,
                                const int *x_receivers, const int *y_receivers, const int *z_receivers,
                                const int *x_emitters, const int *y_emitters, const int *z_emitters,
                                const float *x, float *y)
{
    int index_r, index_e, inc_r, inc_e;
    int d1, d2, d3, inc1, m1, inc2, m2, inc3, m3, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
    int *pixel1, *pixel2, *pixel3;

    // different assignment of paths to threads for multiplication with or without transposition
    // (performance reasons)
    // no transposition: x -> receivers, y -> emitters
    // transposition:    x -> emitters,  y -> receivers
    if(transA == CUBLAS_OP_N)
    {
        index_r = threadIdx.x + blockIdx.x * blockDim.x;
        index_e = threadIdx.y + blockIdx.y * blockDim.y;
        inc_r = blockDim.x * gridDim.x;
        inc_e = blockDim.y * gridDim.y;
    }
    else
    {
        index_e = threadIdx.x + blockIdx.x * blockDim.x;
        index_r = threadIdx.y + blockIdx.y * blockDim.y;
        inc_e = blockDim.x * gridDim.x;
        inc_r = blockDim.y * gridDim.y;
    }

    for(int receiver = index_r; receiver < num_receivers; receiver += inc_r)
    {
        for(int emitter = index_e; emitter < num_emitters; emitter += inc_e)
        {
            ray = emitter*num_receivers + receiver;

            // trace path using Bresenham algorithm...
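            // The emitter-receiver path is rasterized with a 3D Bresenham line:
            // the axis with the largest absolute distance becomes the driving axis
            // (pixel1), the error terms err_1/err_2 decide when the two minor axes
            // (pixel2, pixel3) advance, and every visited voxel contributes one
            // matrix element of value scale_factor via mult_element().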
            x_pixel = x_emitters[emitter];
            y_pixel = y_emitters[emitter];
            z_pixel = z_emitters[emitter];

            mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);

            d1 = x_receivers[receiver] - x_pixel;
            d2 = y_receivers[receiver] - y_pixel;
            d3 = z_receivers[receiver] - z_pixel;

            m1 = abs(d1);
            m2 = abs(d2);
            m3 = abs(d3);

            if (m1 >= m2 && m1 >= m3)
            {
                pixel1 = &x_pixel;
                pixel2 = &y_pixel;
                pixel3 = &z_pixel;
            }
            else if(m2 >= m1 && m2 >= m3)
            {
                int tmp = d1;
                d1 = d2;
                d2 = tmp;
                pixel1 = &y_pixel;
                pixel2 = &x_pixel;
                pixel3 = &z_pixel;
            }
            else
            {
                int tmp = d1;
                d1 = d3;
                d3 = d2;
                d2 = tmp;
                pixel1 = &z_pixel;
                pixel2 = &x_pixel;
                pixel3 = &y_pixel;
            }

            inc1 = (d1 < 0) ? -1 : 1;
            inc2 = (d2 < 0) ? -1 : 1;
            inc3 = (d3 < 0) ? -1 : 1;

            m1 = abs(d1);
            m2 = abs(d2);
            m3 = abs(d3);

            err_1 = 2 * m2 - m1;
            err_2 = 2 * m3 - m1;

            for(int j = 1; j < m1+1; j++)
            {
                if (err_1 > 0)
                {
                    *pixel2 += inc2;
                    err_1 -= 2 * m1;
                }
                if (err_2 > 0)
                {
                    *pixel3 += inc3;
                    err_2 -= 2 * m1;
                }
                err_1 += 2 * m2;
                err_2 += 2 * m3;
                *pixel1 += inc1;

                mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);
            }
        }
    }
}

// matrix-vector multiplication with dynamic calculation of the measurement matrix...
inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const geometry_device &A,
                                  const float *x, float *y, cudaStream_t stream=0)
{
    int len_y = (trans == CUBLAS_OP_N) ? A.num_emitters * A.num_receivers : A.rv_x * A.rv_y * A.rv_z;
    cudaMemset(y, 0, len_y * sizeof(float));

    dim3 threads, blocks;

    // rather arbitrary choice of dimensions (performance approx. 10 % worse than with the
    // best dimensions found for each case)
    threads.x = 64;
    threads.y = 4;
    if(trans == CUBLAS_OP_N)
    {
        blocks.x = max(A.num_receivers / threads.x, 1);
        blocks.y = max(A.num_emitters / threads.x, 1);
    }
    else
    {
        blocks.y = max(A.num_receivers / threads.x, 1);
        blocks.x = max(A.num_emitters / threads.x, 1);
    }

    // best dimensions found for the USCT2 geometry, volume 64x64x64:
    /*
    if(trans == CUBLAS_OP_N)
    {
        threads.x = 32; threads.y = 8;
        blocks.x = 40; blocks.y = 78;
    }
    else
    {
        threads.x = 128; threads.y = 2;
        blocks.x = 1; blocks.y = 16;
    }*/

    // best dimensions found for the USCT2 geometry, volume 128x128x128:
    /*
    if(trans == CUBLAS_OP_N)
    {
        threads.x = 64; threads.y = 4;
        blocks.x = 22; blocks.y = 152;
    }
    else
    {
        threads.x = 64; threads.y = 8;
        blocks.x = 8; blocks.y = 26;
    }*/

    dyn_calc_kernel<<<blocks, threads, 0, stream>>>(trans, A.num_emitters, A.num_receivers,
                                                    A.rv_x, A.rv_y, A.rv_z, A.scale_factor,
                                                    A.x_re_dev_ptr(), A.y_re_dev_ptr(), A.z_re_dev_ptr(),
                                                    A.x_em_dev_ptr(), A.y_em_dev_ptr(), A.z_em_dev_ptr(),
                                                    x, y);

    return CUBLAS_STATUS_SUCCESS;
}

#endif /* MAT_VEC_MUL_H_ */
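
/*
 * Usage sketch (illustrative only; `geom`, `volume_dev` and `rays_dev` are hypothetical
 * names for a geometry_device instance and device buffers of matching size):
 *
 *   // forward:    rays (num_emitters*num_receivers) = A * volume (rv_x*rv_y*rv_z voxels)
 *   mat_vec_mul(CUBLAS_OP_N, geom, volume_dev, rays_dev);
 *
 *   // transposed: volume (rv_x*rv_y*rv_z voxels) = A^T * rays (num_emitters*num_receivers)
 *   mat_vec_mul(CUBLAS_OP_T, geom, rays_dev, volume_dev);
 *
 * The sparse_mm and dense_mm overloads are called the same way with their matrix wrappers.
 */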