/*
 * mat_vec_mul.h
 *
 *  Created on: Jul 13, 2011
 *      Author: ditlevsen
 */

#ifndef MAT_VEC_MUL_H_
#define MAT_VEC_MUL_H_

#include <cublas_v2.h>
#include <cusparse.h>

// Note: the project-specific types used below (sparse_mat_host, sparse_mat_device,
// mat_host, mat_device, geometry_device) and the HANDLE_ERROR macro are expected
// to be declared before this header is included.

struct sparse_mm {
	sparse_mat_device A_csc;
	sparse_mat_device A_csr;
	cusparseHandle_t cs_handle;
	cusparseMatDescr_t descrA;

	sparse_mm(const sparse_mat_host &A, cusparseHandle_t handle, cusparseMatDescr_t descriptor) :
		A_csc(A), A_csr(A.dim_y, A.dim_x, A.nnz, sparse_mat_csr, false), cs_handle(handle),
		descrA(descriptor)
	{
		/* Old CUDA 4.0 implementation for reference:
		HANDLE_ERROR(cusparseScsr2csc(cs_handle, A.dim_x, A.dim_y, A_csc.val(),
				A_csc.ptr(), A_csc.ind(), A_csr.val(), A_csr.ind(),
				A_csr.ptr(), 1, CUSPARSE_INDEX_BASE_ZERO));
		*/

		/* CUDA 10.1 adaptation (Feb. 2021):
		cusparseStatus_t cusparseScsr2csc(cusparseHandle_t handle,
		                                  int m,
		                                  int n,
		                                  int nnz,
		                                  const float* csrVal,
		                                  const int* csrRowPtr,
		                                  const int* csrColInd,
		                                  float* cscVal,
		                                  int* cscRowInd,
		                                  int* cscColPtr,
		                                  cusparseAction_t copyValues,
		                                  cusparseIndexBase_t idxBase)
		*/

		HANDLE_ERROR(cusparseScsr2csc(cs_handle,
				A.dim_x,                    // m
				A.dim_y,                    // n
				A.nnz,                      // nnz
				A_csc.val(),                // csrVal
				A_csc.ptr(),                // csrRowPtr
				A_csc.ind(),                // csrColInd
				A_csr.val(),                // cscVal
				A_csr.ind(),                // cscRowInd
				A_csr.ptr(),                // cscColPtr
				CUSPARSE_ACTION_NUMERIC,    // copyValues
				CUSPARSE_INDEX_BASE_ZERO)); // idxBase

		/* TODO: adaptation to CUDA 11. The function above was removed and replaced by:

		cusparseStatus_t cusparseCsr2cscEx2(cusparseHandle_t handle,
		                                    int m,
		                                    int n,
		                                    int nnz,
		                                    const void* csrVal,
		                                    const int* csrRowPtr,
		                                    const int* csrColInd,
		                                    void* cscVal,
		                                    int* cscColPtr,
		                                    int* cscRowInd,
		                                    cudaDataType valType,
		                                    cusparseAction_t copyValues,
		                                    cusparseIndexBase_t idxBase,
		                                    cusparseCsr2CscAlg_t alg,
		                                    void* buffer)
		*/
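
		/* A possible CUDA 11 adaptation (untested sketch): cusparseCsr2cscEx2 needs an
		   explicit workspace, sized via cusparseCsr2cscEx2_bufferSize, and takes the
		   CSC column-pointer array *before* the row-index array (reversed relative to
		   cusparseScsr2csc). Workspace handling via plain cudaMalloc/cudaFree is an
		   assumption here; a cached buffer would avoid the per-call allocation.

		size_t buffer_size = 0;
		void *buffer = NULL;
		HANDLE_ERROR(cusparseCsr2cscEx2_bufferSize(cs_handle, A.dim_x, A.dim_y, A.nnz,
				A_csc.val(), A_csc.ptr(), A_csc.ind(),
				A_csr.val(), A_csr.ptr(), A_csr.ind(),
				CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
				CUSPARSE_CSR2CSC_ALG1, &buffer_size));
		HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));
		HANDLE_ERROR(cusparseCsr2cscEx2(cs_handle, A.dim_x, A.dim_y, A.nnz,
				A_csc.val(), A_csc.ptr(), A_csc.ind(),
				A_csr.val(), A_csr.ptr(), A_csr.ind(),
				CUDA_R_32F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
				CUSPARSE_CSR2CSC_ALG1, buffer));
		HANDLE_ERROR(cudaFree(buffer));
		*/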
	};
};

struct dense_mm {
	mat_device A;
	cublasHandle_t cb_handle;

	dense_mm(const mat_host &A_host, cublasHandle_t handle) : A(A_host, true), cb_handle(handle) {};
};

inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A, const float *x, float *y,
		cudaStream_t stream=0)
{
	int n, m;
	const sparse_mat_device *mA;
	const float alpha = 1;
	const float beta = 0;

	if(transA == CUBLAS_OP_N) {
		n = A.A_csr.dim_x;
		m = A.A_csr.dim_y;
		mA = &A.A_csr;
	} else {
		n = A.A_csr.dim_y;
		m = A.A_csr.dim_x;
		mA = &A.A_csc; // implicit transposition: the CSC copy of A is the CSR form of A^T
	}

	/* Old CUDA 4 implementation (cusparseSetKernelStream was replaced by cusparseSetStream):
	HANDLE_ERROR(cusparseSetKernelStream(A.cs_handle, stream));
	*/

	/* CUDA 10.1 adaptation (Feb. 2021): */
	HANDLE_ERROR(cusparseSetStream(A.cs_handle, stream));

	/* Old CUDA 4 implementation:
	return cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, 1, A.descrA, mA->val(), mA->ptr(), mA->ind(), x, 0, y);
	*/

	/* CUDA 10.1 adaptation (Feb. 2021):
	cusparseStatus_t cusparseScsrmv(cusparseHandle_t handle,
	                                cusparseOperation_t transA,
	                                int m,
	                                int n,
	                                int nnz,
	                                const float* alpha,
	                                const cusparseMatDescr_t descrA,
	                                const float* csrValA,
	                                const int* csrRowPtrA,
	                                const int* csrColIndA,
	                                const float* x,
	                                const float* beta,
	                                float* y);
	*/

	cusparseStatus_t status;
	status = cusparseScsrmv(A.cs_handle,
			CUSPARSE_OPERATION_NON_TRANSPOSE,
			m,
			n,
			A.A_csr.nnz,
			&alpha,
			A.descrA,
			mA->val(),
			mA->ptr(),
			mA->ind(),
			x,
			&beta,
			y);
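
	/* Possible CUDA 11 adaptation (untested sketch): cusparseScsrmv was removed in
	   CUDA 11 in favor of the generic cusparseSpMV API. The descriptors are created
	   inline here for illustration only; in a real port they would be cached in
	   sparse_mm the same way descrA is now. On CUDA 11.0/11.1 the algorithm enum is
	   CUSPARSE_MV_ALG_DEFAULT instead of CUSPARSE_SPMV_ALG_DEFAULT.

	cusparseSpMatDescr_t matA;
	cusparseDnVecDescr_t vecX, vecY;
	size_t buffer_size = 0;
	void *buffer = NULL;
	HANDLE_ERROR(cusparseCreateCsr(&matA, m, n, A.A_csr.nnz,
			(void*)mA->ptr(), (void*)mA->ind(), (void*)mA->val(),
			CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
			CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F));
	HANDLE_ERROR(cusparseCreateDnVec(&vecX, n, (void*)x, CUDA_R_32F));
	HANDLE_ERROR(cusparseCreateDnVec(&vecY, m, (void*)y, CUDA_R_32F));
	HANDLE_ERROR(cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
			&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
			CUSPARSE_SPMV_ALG_DEFAULT, &buffer_size));
	HANDLE_ERROR(cudaMalloc(&buffer, buffer_size));
	status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
			&alpha, matA, vecX, &beta, vecY, CUDA_R_32F,
			CUSPARSE_SPMV_ALG_DEFAULT, buffer);
	HANDLE_ERROR(cudaFree(buffer));
	cusparseDestroySpMat(matA);
	cusparseDestroyDnVec(vecX);
	cusparseDestroyDnVec(vecY);
	*/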
	return status;
}

inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const dense_mm &A, const float *x, float *y,
		cudaStream_t stream=0) {
	int n = A.A.dim_x;
	int m = A.A.dim_y;
	cublasStatus_t status;

	float alpha=1, beta=0;

	HANDLE_ERROR(cublasSetStream(A.cb_handle, stream));

	// y = alpha * op(A) * x + beta * y
	status = cublasSgemv(A.cb_handle, trans, m, n, &alpha, A.A.data_dev_ptr(), A.A.leading_dim(), x, 1, &beta, y, 1);

	HANDLE_ERROR(cublasSetStream(A.cb_handle, 0));

	return status;
}

// CAS-based atomicAdd for floats, needed on devices of compute capability < 2.0,
// which have no native float atomicAdd.
__device__ float atomicAdd_cc1(float* address, float val) {
	int* address_as_int = (int*)address;
	int old = *address_as_int, assumed;
	do {
		assumed = old;
		// reinterpret the bits, add, and try to swap; retry if another thread interfered
		old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
	} while (assumed != old);
	return __int_as_float(old);
}

__device__ void mult_element(cublasOperation_t transA, int ray, int x_pixel, int y_pixel, int z_pixel, int dim_x, int dim_y, float scale_factor, const float *x, float *y) {
	// linear voxel index (y varies fastest, then x, then z)
	int lin_index = x_pixel*dim_y + y_pixel + z_pixel*dim_x*dim_y;
	if(transA == CUBLAS_OP_N) {
		// each ray is handled by exactly one thread, so no atomics are needed here
		y[ray] += x[lin_index] * scale_factor;
	} else {
		// transposed case: several rays may hit the same voxel, so the update must be atomic
#if __CUDA_ARCH__ < 200
		atomicAdd_cc1(y + lin_index, x[ray] * scale_factor);
#else
		atomicAdd(y + lin_index, x[ray] * scale_factor);
#endif
	}
}

__global__ void dyn_calc_kernel(cublasOperation_t transA, int num_emitters, int num_receivers, int rv_x, int rv_y, int rv_z,
		float scale_factor, const int *x_receivers, const int *y_receivers, const int *z_receivers, const int *x_emitters,
		const int *y_emitters, const int *z_emitters, const float *x, float *y) {

	int index_r, index_e, inc_r, inc_e;
	int d1, d2, d3, inc1, m1, inc2, m2, inc3, m3, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
	int *pixel1, *pixel2, *pixel3;

	// Paths are assigned to threads differently for the multiplication with and
	// without transposition (for performance reasons):
	// no transposition: x -> receivers, y -> emitters
	// transposition:    x -> emitters,  y -> receivers
	if(transA == CUBLAS_OP_N) {
		index_r = threadIdx.x + blockIdx.x * blockDim.x;
		index_e = threadIdx.y + blockIdx.y * blockDim.y;
		inc_r = blockDim.x * gridDim.x;
		inc_e = blockDim.y * gridDim.y;
	} else {
		index_e = threadIdx.x + blockIdx.x * blockDim.x;
		index_r = threadIdx.y + blockIdx.y * blockDim.y;
		inc_e = blockDim.x * gridDim.x;
		inc_r = blockDim.y * gridDim.y;
	}

	// grid-stride loops over all emitter/receiver pairs
	for(int receiver=index_r; receiver < num_receivers; receiver+= inc_r) {
		for(int emitter=index_e; emitter < num_emitters; emitter+= inc_e) {

			ray = emitter*num_receivers + receiver;

			// trace the path from emitter to receiver using the 3D Bresenham algorithm
			x_pixel = x_emitters[emitter];
			y_pixel = y_emitters[emitter];
			z_pixel = z_emitters[emitter];

			mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);

			d1 = x_receivers[receiver] - x_pixel;
			d2 = y_receivers[receiver] - y_pixel;
			d3 = z_receivers[receiver] - z_pixel;

			m1 = abs(d1);
			m2 = abs(d2);
			m3 = abs(d3);

			// pick the driving axis (largest extent) as axis 1 and permute the
			// deltas and pixel pointers accordingly
			if (m1 >= m2 && m1 >= m3) {
				pixel1 = &x_pixel;
				pixel2 = &y_pixel;
				pixel3 = &z_pixel;
			} else if(m2 >= m1 && m2 >= m3) {
				int tmp = d1;
				d1 = d2;
				d2 = tmp;

				pixel1 = &y_pixel;
				pixel2 = &x_pixel;
				pixel3 = &z_pixel;
			} else {
				int tmp = d1;
				d1 = d3;
				d3 = d2;
				d2 = tmp;

				pixel1 = &z_pixel;
				pixel2 = &x_pixel;
				pixel3 = &y_pixel;
			}

			inc1 = (d1 < 0) ? -1 : 1;
			inc2 = (d2 < 0) ? -1 : 1;
			inc3 = (d3 < 0) ? -1 : 1;

			m1 = abs(d1);
			m2 = abs(d2);
			m3 = abs(d3);

			err_1 = 2 * m2 - m1;
			err_2 = 2 * m3 - m1;
			// step along the driving axis, accumulating one matrix element per visited voxel
			for(int j = 1; j < m1+1; j++) {
				if (err_1 > 0) {
					*pixel2 += inc2;
					err_1 -= 2 * m1;
				}
				if (err_2 > 0) {
					*pixel3 += inc3;
					err_2 -= 2 * m1;
				}
				err_1 += 2 * m2;
				err_2 += 2 * m3;
				*pixel1 += inc1;
				mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);
			}
		}
	}
}

// matrix-vector multiplication with on-the-fly calculation of the measurement matrix...
inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const geometry_device &A, const float *x, float *y,
		cudaStream_t stream=0)
{
	int len_y = (trans == CUBLAS_OP_N) ? A.num_emitters * A.num_receivers : A.rv_x * A.rv_y * A.rv_z;
	cudaMemset(y, 0, len_y * sizeof(float)); // note: runs on the default stream, not on `stream`
	dim3 threads, blocks;

	// Fairly arbitrary dimensioning (performance roughly 10% worse than with the best
	// dimensioning found for each case, see below). Since the kernel uses grid-stride
	// loops, a grid that does not cover all pairs exactly is still correct.
	threads.x = 64;
	threads.y = 4;
	if(trans == CUBLAS_OP_N) {
		blocks.x = max(A.num_receivers / threads.x, 1);
		blocks.y = max(A.num_emitters / threads.x, 1);
	} else {
		blocks.y = max(A.num_receivers / threads.x, 1);
		blocks.x = max(A.num_emitters / threads.x, 1);
	}

	// best dimensioning found for the USCT2 geometry, volume 64x64x64:
	/* if(trans == CUBLAS_OP_N) {
		threads.x = 32;
		threads.y = 8;
		blocks.x = 40;
		blocks.y = 78;
	} else {
		threads.x = 128;
		threads.y = 2;
		blocks.x = 1;
		blocks.y = 16;
	}*/

	// best dimensioning found for the USCT2 geometry, volume 128x128x128:
	/* if(trans == CUBLAS_OP_N) {
		threads.x = 64;
		threads.y = 4;
		blocks.x = 22;
		blocks.y = 152;
	} else {
		threads.x = 64;
		threads.y = 8;
		blocks.x = 8;
		blocks.y = 26;
	}*/

	dyn_calc_kernel<<<blocks, threads, 0, stream>>>(trans, A.num_emitters, A.num_receivers, A.rv_x, A.rv_y, A.rv_z,
			A.scale_factor, A.x_re_dev_ptr(), A.y_re_dev_ptr(), A.z_re_dev_ptr(), A.x_em_dev_ptr(), A.y_em_dev_ptr(),
			A.z_em_dev_ptr(), x, y);
	return CUBLAS_STATUS_SUCCESS;
}
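
/* Usage sketch (hypothetical names: cs_handle, descrA, A_host, d_x, d_y, d_rays and
   d_vol; handle creation, device upload and vector allocation are assumed to happen
   elsewhere in the application):

	sparse_mm A(A_host, cs_handle, descrA);        // device CSC + CSR copies of A
	mat_vec_mul(CUBLAS_OP_N, A, d_x, d_y);         // y = A * x, via the CSR copy
	mat_vec_mul(CUBLAS_OP_T, A, d_y, d_x);         // x = A^T * y, via the CSC copy

	geometry_device geom(...);                     // matrix-free variant:
	mat_vec_mul(CUBLAS_OP_N, geom, d_vol, d_rays); // rays = M * volume
	mat_vec_mul(CUBLAS_OP_T, geom, d_rays, d_vol); // volume = M^T * rays
*/
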
#endif /* MAT_VEC_MUL_H_ */