Commit source
This commit is contained in:
387
TVALGPU/include/container_device.h
Normal file
387
TVALGPU/include/container_device.h
Normal file
@@ -0,0 +1,387 @@
|
||||
/*
|
||||
* container_gpu.h
|
||||
*
|
||||
* Created on: Jun 9, 2011
|
||||
* Author: ditlevsen
|
||||
*/
|
||||
|
||||
#ifndef CONTAINER_DEVICE_H_
|
||||
#define CONTAINER_DEVICE_H_
|
||||
|
||||
#include "container_host.h"
|
||||
|
||||
class mat_device {
|
||||
private:
|
||||
float *pbuf;
|
||||
int ld;
|
||||
public:
|
||||
const int dim_x;
|
||||
const int dim_y;
|
||||
const int dim_z;
|
||||
const int len;
|
||||
const T_mat_format format;
|
||||
|
||||
mat_device(int l_y, int l_x, int l_z, bool init=true, bool pitch=false, T_mat_format storage=mat_col_major);
|
||||
mat_device(int num_elements, bool init=true);
|
||||
mat_device(const mat_device &m);
|
||||
mat_device(const mat_host &m, bool pitch=false);
|
||||
~mat_device() { if(pbuf) HANDLE_ERROR(cudaFree(pbuf)); }
|
||||
|
||||
int leading_dim() const { return ld; }
|
||||
|
||||
float *data_dev_ptr() { return pbuf; }
|
||||
const float *data_dev_ptr() const { return pbuf; }
|
||||
|
||||
mat_device &operator=(const mat_device &m) throw(illegal_mat_gpu_assignment);
|
||||
mat_device &operator=(const mat_host &m) throw(illegal_mat_gpu_assignment);
|
||||
};
|
||||
|
||||
class sparse_mat_device {
|
||||
private:
|
||||
int *p_ind;
|
||||
int *p_ptr;
|
||||
float *p_val;
|
||||
public:
|
||||
const int dim_y;
|
||||
const int dim_x;
|
||||
const int nnz;
|
||||
T_sparse_mat_format format;
|
||||
|
||||
sparse_mat_device(int num_dim_y, int num_dim_x, int n_nonzero, T_sparse_mat_format storage_format, bool init=true);
|
||||
sparse_mat_device(const sparse_mat_device &m);
|
||||
sparse_mat_device(const sparse_mat_host &m);
|
||||
~sparse_mat_device();
|
||||
|
||||
float *val() { return p_val; }
|
||||
const float *val() const { return p_val; }
|
||||
|
||||
int *ptr() { return p_ptr; }
|
||||
const int *ptr() const { return p_ptr; }
|
||||
|
||||
int *ind() { return p_ind; }
|
||||
const int *ind() const { return p_ind; }
|
||||
|
||||
sparse_mat_device &operator=(const sparse_mat_device &m) throw(illegal_mat_gpu_assignment);
|
||||
sparse_mat_device &operator=(const sparse_mat_host &m) throw(illegal_mat_gpu_assignment);
|
||||
};
|
||||
|
||||
//---------------------------------------
|
||||
// class geomety_device
|
||||
//---------------------------------------
|
||||
|
||||
class geometry_device {
|
||||
private:
|
||||
int *x_emitters;
|
||||
int *y_emitters;
|
||||
int *z_emitters;
|
||||
int *x_receivers;
|
||||
int *y_receivers;
|
||||
int *z_receivers;
|
||||
|
||||
public:
|
||||
int *x_em_dev_ptr() { return x_emitters; }
|
||||
const int *x_em_dev_ptr() const { return x_emitters; }
|
||||
int *y_em_dev_ptr() { return y_emitters; }
|
||||
const int *y_em_dev_ptr() const { return y_emitters; }
|
||||
int *z_em_dev_ptr() { return z_emitters; }
|
||||
const int *z_em_dev_ptr() const { return z_emitters; }
|
||||
int *x_re_dev_ptr() { return x_receivers; }
|
||||
const int *x_re_dev_ptr() const { return x_receivers; }
|
||||
int *y_re_dev_ptr() { return y_receivers; }
|
||||
const int *y_re_dev_ptr() const { return y_receivers; }
|
||||
int *z_re_dev_ptr() { return z_receivers; }
|
||||
const int *z_re_dev_ptr() const { return z_receivers; }
|
||||
|
||||
const int num_emitters;
|
||||
const int num_receivers;
|
||||
const int rv_x;
|
||||
const int rv_y;
|
||||
const int rv_z;
|
||||
const float scale_factor;
|
||||
|
||||
geometry_device(const geometry_host &geom_host);
|
||||
geometry_device(const geometry_device &geom_dev);
|
||||
~geometry_device();
|
||||
};
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// member functions, etc. class mat_device
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
//member functions...
|
||||
mat_device::mat_device(int l_y, int l_x, int l_z, bool init, bool pitch, T_mat_format storage): pbuf(NULL), dim_y(l_y),
|
||||
dim_z(l_z), dim_x(l_x), len(l_y*l_x*l_z), format(storage) {
|
||||
if(len > 0) {
|
||||
if(pitch) {
|
||||
size_t width = (format == mat_row_major) ? dim_x*sizeof(float) : dim_y*sizeof(float);
|
||||
size_t height = (format == mat_row_major) ? dim_y : dim_x;
|
||||
size_t p;
|
||||
HANDLE_ERROR(cudaMallocPitch(&pbuf, &p, width, height * dim_z));
|
||||
ld = p / sizeof(float);
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset2D(pbuf, p, 0, width, height * dim_z));
|
||||
} else {
|
||||
HANDLE_ERROR(cudaMalloc(&pbuf, len*sizeof(float)));
|
||||
ld = (format == mat_row_major) ? dim_x : dim_y;
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset(pbuf, 0, len*sizeof(float)));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
mat_device::mat_device(int num_elements, bool init): pbuf(NULL), dim_y(num_elements), dim_x(1), dim_z(1), len(num_elements),
|
||||
format(mat_col_major) {
|
||||
if(len > 0) {
|
||||
HANDLE_ERROR(cudaMalloc(&pbuf, len*sizeof(float)));
|
||||
ld = (format == mat_row_major) ? dim_x : dim_y;
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset(pbuf, 0, len*sizeof(float)));
|
||||
}
|
||||
};
|
||||
|
||||
mat_device::mat_device(const mat_device &m): pbuf(NULL), dim_y(m.dim_y), dim_x(m.dim_x), dim_z(m.dim_z), len(m.len),
|
||||
format(m.format), ld(m.ld) {
|
||||
if(len > 0) {
|
||||
size_t width = (format == mat_row_major) ? dim_x*sizeof(float) : dim_y*sizeof(float);
|
||||
size_t height = (format == mat_row_major) ? dim_y : dim_x;
|
||||
int total_len = (format == mat_row_major) ? ld*dim_y*dim_z : ld*dim_x*dim_z;
|
||||
HANDLE_ERROR(cudaMalloc(&pbuf, total_len*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy2D(pbuf, ld*sizeof(float), m.pbuf, m.ld*sizeof(float), width, height * dim_z, cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
};
|
||||
|
||||
mat_device::mat_device(const mat_host &m, bool pitch): pbuf(NULL), dim_y(m.dim_y), dim_x(m.dim_x), dim_z(m.dim_z), len(m.len),
|
||||
format(m.format) {
|
||||
if(len > 0) {
|
||||
if(pitch) {
|
||||
size_t width = (format == mat_row_major) ? dim_x*sizeof(float) : dim_y*sizeof(float);
|
||||
size_t height = (format == mat_row_major) ? dim_y : dim_x;
|
||||
size_t p;
|
||||
|
||||
HANDLE_ERROR(cudaMallocPitch(&pbuf, &p, width, height * dim_z));
|
||||
ld = p / sizeof(float);
|
||||
HANDLE_ERROR(cudaMemcpy2D(pbuf, ld*sizeof(float), m.data(), width, width, height * dim_z, cudaMemcpyHostToDevice));
|
||||
} else {
|
||||
HANDLE_ERROR(cudaMalloc(&pbuf, len*sizeof(float)));
|
||||
ld = (format == mat_row_major) ? dim_x : dim_y;
|
||||
HANDLE_ERROR(cudaMemcpy(pbuf, m.data(), len*sizeof(float), cudaMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
mat_device &mat_device::operator=(const mat_device &m) throw(illegal_mat_gpu_assignment) {
|
||||
int w_m = (m.format == mat_row_major) ? m.dim_x : m.dim_y;
|
||||
int w = (format == mat_row_major) ? dim_x : dim_y;
|
||||
if(w_m == m.ld && w == ld && len == m.len) {
|
||||
HANDLE_ERROR(cudaMemcpy(pbuf, m.data_dev_ptr(), len*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
} else if((m.dim_y == dim_y) && (m.dim_x == dim_x) && (m.dim_z == dim_z) && (m.format == format)) {
|
||||
size_t width = (format == mat_row_major) ? dim_x*sizeof(float) : dim_y*sizeof(float);
|
||||
size_t height = (format == mat_row_major) ? dim_y : dim_x;
|
||||
HANDLE_ERROR(cudaMemcpy2D(pbuf, ld*sizeof(float), m.pbuf, m.ld*sizeof(float), width, height * dim_z, cudaMemcpyDeviceToDevice));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment("Illegale Zuweisung von mat_device-Objekten!");
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
mat_device &mat_device::operator=(const mat_host &m) throw(illegal_mat_gpu_assignment) {
|
||||
if((m.dim_y == dim_y) && (m.dim_x == dim_x) && (m.dim_z == dim_z) && (m.format == format)) {
|
||||
size_t width = (format == mat_row_major) ? dim_x*sizeof(float) : dim_y*sizeof(float);
|
||||
size_t height = (format == mat_row_major) ? dim_y : dim_x;
|
||||
HANDLE_ERROR(cudaMemcpy2D(pbuf, ld*sizeof(float), m.data(), width, width, height * dim_z, cudaMemcpyHostToDevice));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment("Illegale Zuweisung mat_device <- mat_host!");
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// function for conversion mat_device -> mat_host
|
||||
void mat_gpu_to_host(mat_host &m_host, const mat_device &m) throw(illegal_mat_gpu_assignment) {
|
||||
if((m.dim_y == m_host.dim_y) && (m.dim_x == m_host.dim_x) && (m.dim_z == m_host.dim_z) && (m.format == m_host.format)) {
|
||||
size_t width = (m.format == mat_row_major) ? m.dim_x*sizeof(float) : m.dim_y*sizeof(float);
|
||||
size_t height = (m.format == mat_row_major) ? m.dim_y : m.dim_x;
|
||||
HANDLE_ERROR(cudaMemcpy2D(m_host.data(), width, m.data_dev_ptr(), m.leading_dim()*sizeof(float), width, height * m.dim_z, cudaMemcpyDeviceToHost));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment("Illegale Zuweisung mat_host <- mat_device!");
|
||||
}
|
||||
};
|
||||
|
||||
// output operator...
|
||||
std::ostream &operator<<(std::ostream &stream, const mat_device &m) {
|
||||
mat_host m_host(m.dim_y, m.dim_x, m.dim_z, m.format, false, false);
|
||||
|
||||
mat_gpu_to_host(m_host, m);
|
||||
|
||||
stream << m_host;
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// member functions, etc. class sparse_mat_device
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
//member functions...
|
||||
sparse_mat_device::sparse_mat_device(int num_dim_y, int num_dim_x, int n_nonzero, T_sparse_mat_format storage_format, bool init): p_val(NULL), p_ptr(NULL), p_ind(NULL),
|
||||
dim_y(num_dim_y), dim_x(num_dim_x), nnz(n_nonzero), format(storage_format)
|
||||
{
|
||||
if(nnz > 0) {
|
||||
int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMalloc(&p_val, nnz*sizeof(float)));
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset(p_val, 0, nnz*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ptr, len_ptr*sizeof(float)));
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset(p_ptr, 0, len_ptr*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ind, nnz*sizeof(float)));
|
||||
if(init)
|
||||
HANDLE_ERROR(cudaMemset(p_ind, 0, nnz*sizeof(float)));
|
||||
}
|
||||
};
|
||||
sparse_mat_device::sparse_mat_device(const sparse_mat_device &m): p_val(NULL), p_ptr(NULL), p_ind(NULL), dim_y(m.dim_y),
|
||||
dim_x(m.dim_x), nnz(m.nnz), format(m.format)
|
||||
{
|
||||
if(nnz > 0) {
|
||||
int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMalloc(&p_val, nnz*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_val, m.p_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ptr, len_ptr*sizeof(int)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ptr, m.p_ptr, len_ptr*sizeof(int), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ind, nnz*sizeof(int)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ind, m.p_ind, nnz*sizeof(int), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
};
|
||||
|
||||
sparse_mat_device::sparse_mat_device(const sparse_mat_host &m): p_val(NULL), p_ptr(NULL), p_ind(NULL), dim_y(m.dim_y),
|
||||
dim_x(m.dim_x), nnz(m.nnz), format(m.format)
|
||||
{
|
||||
if(nnz > 0) {
|
||||
int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMalloc(&p_val, nnz*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_val, m.val(), nnz*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ptr, len_ptr*sizeof(int)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ptr, m.ptr(), len_ptr*sizeof(int), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&p_ind, nnz*sizeof(int)));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ind, m.ind(), nnz*sizeof(int), cudaMemcpyHostToDevice));
|
||||
}
|
||||
};
|
||||
|
||||
sparse_mat_device::~sparse_mat_device() {
|
||||
if(p_val) HANDLE_ERROR(cudaFree(p_val));
|
||||
if(p_ptr) HANDLE_ERROR(cudaFree(p_ptr));
|
||||
if(p_ind) HANDLE_ERROR(cudaFree(p_ind));
|
||||
}
|
||||
|
||||
sparse_mat_device &sparse_mat_device::operator=(const sparse_mat_device &m) throw(illegal_mat_gpu_assignment) {
|
||||
if(m.nnz == nnz && m.format == format && m.dim_x == dim_x && m.dim_y == dim_y) {
|
||||
int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMemcpy(p_val, m.p_val, nnz*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ptr, m.p_ptr, len_ptr*sizeof(int), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ind, m.p_ind, nnz*sizeof(int), cudaMemcpyDeviceToDevice));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment("Illegale Zuweisung von sparse_mat_device-Objekten!");
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
sparse_mat_device &sparse_mat_device::operator=(const sparse_mat_host &m) throw(illegal_mat_gpu_assignment) {
|
||||
if(m.nnz == nnz && m.format == format && m.dim_x == dim_x && m.dim_y == dim_y) {
|
||||
int len_ptr = (format == sparse_mat_csc) ? dim_x + 1 : dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMemcpy(p_val, m.val(), nnz*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ptr, m.ptr(), len_ptr*sizeof(int), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMemcpy(p_ind, m.ind(), nnz*sizeof(int), cudaMemcpyHostToDevice));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment(("Illegale Zuweisung sparse_ mat_device <- sparse_mat_host!"));
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
// function for conversion sparse_mat_device -> sparse_mat_host
|
||||
void sparse_mat_gpu_to_host(sparse_mat_host &lhs, const sparse_mat_device &rhs) throw(illegal_mat_gpu_assignment) {
|
||||
if(lhs.nnz == rhs.nnz && lhs.format == rhs.format && lhs.dim_x == rhs.dim_x && lhs.dim_y == rhs.dim_y) {
|
||||
int len_ptr = (lhs.format == sparse_mat_csc) ? lhs.dim_x + 1 : lhs.dim_y + 1;
|
||||
|
||||
HANDLE_ERROR(cudaMemcpy(lhs.val(), rhs.val(), lhs.nnz*sizeof(float), cudaMemcpyDeviceToHost));
|
||||
HANDLE_ERROR(cudaMemcpy(lhs.ptr(), rhs.ptr(), len_ptr*sizeof(int), cudaMemcpyDeviceToHost));
|
||||
HANDLE_ERROR(cudaMemcpy(lhs.ind(), rhs.ind(), lhs.nnz*sizeof(int), cudaMemcpyDeviceToHost));
|
||||
} else {
|
||||
throw illegal_mat_gpu_assignment(("Illegale Zuweisung sparse_ mat_host <- sparse_mat_device!"));
|
||||
}
|
||||
}
|
||||
|
||||
// output operator...
|
||||
std::ostream &operator<<(std::ostream &stream, const sparse_mat_device &m) {
|
||||
sparse_mat_host tmp(m.dim_y, m.dim_x, m.nnz, m.format);
|
||||
|
||||
sparse_mat_gpu_to_host(tmp, m);
|
||||
|
||||
stream << tmp;
|
||||
|
||||
return stream;
|
||||
}
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// member functions, etc. struct geometry
|
||||
// ------------------------------------------------------------------------
|
||||
|
||||
geometry_device::geometry_device(const geometry_host &geom_host): num_emitters(geom_host.num_emitters),
|
||||
num_receivers(geom_host.num_receivers), rv_x(geom_host.rv_x), rv_y(geom_host.rv_y), rv_z(geom_host.rv_z),
|
||||
scale_factor(geom_host.scale_factor), x_emitters(NULL), y_emitters(NULL), z_emitters(NULL), x_receivers(NULL),
|
||||
y_receivers(NULL), z_receivers(NULL)
|
||||
{
|
||||
if(num_emitters > 0) {
|
||||
HANDLE_ERROR(cudaMalloc(&x_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(x_emitters, geom_host.x_emitters, num_emitters*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&y_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(y_emitters, geom_host.y_emitters, num_emitters*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&z_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(z_emitters, geom_host.z_emitters, num_emitters*sizeof(float), cudaMemcpyHostToDevice));
|
||||
}
|
||||
if(num_receivers > 0) {
|
||||
HANDLE_ERROR(cudaMalloc(&x_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(x_receivers, geom_host.x_receivers, num_receivers*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&y_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(y_receivers, geom_host.y_receivers, num_receivers*sizeof(float), cudaMemcpyHostToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&z_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(z_receivers, geom_host.z_receivers, num_receivers*sizeof(float), cudaMemcpyHostToDevice));
|
||||
}
|
||||
}
|
||||
|
||||
geometry_device::geometry_device(const geometry_device &geom_dev): num_emitters(geom_dev.num_emitters),
|
||||
num_receivers(geom_dev.num_receivers), rv_x(geom_dev.rv_x), rv_y(geom_dev.rv_y), rv_z(geom_dev.rv_z),
|
||||
scale_factor(geom_dev.scale_factor), x_emitters(NULL), y_emitters(NULL), z_emitters(NULL), x_receivers(NULL),
|
||||
y_receivers(NULL), z_receivers(NULL)
|
||||
{
|
||||
if(num_emitters > 0) {
|
||||
HANDLE_ERROR(cudaMalloc(&x_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(x_emitters, geom_dev.x_emitters, num_emitters*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&y_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(y_emitters, geom_dev.y_emitters, num_emitters*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&z_emitters, num_emitters*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(z_emitters, geom_dev.z_emitters, num_emitters*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
if(num_receivers > 0) {
|
||||
HANDLE_ERROR(cudaMalloc(&x_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(x_receivers, geom_dev.x_receivers, num_receivers*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&y_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(y_receivers, geom_dev.y_receivers, num_receivers*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
HANDLE_ERROR(cudaMalloc(&z_receivers, num_receivers*sizeof(float)));
|
||||
HANDLE_ERROR(cudaMemcpy(z_receivers, geom_dev.z_receivers, num_receivers*sizeof(float), cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
}
|
||||
|
||||
geometry_device::~geometry_device() {
|
||||
if(x_emitters) cudaFree(x_emitters);
|
||||
if(y_emitters) cudaFree(y_emitters);
|
||||
if(x_receivers) cudaFree(x_receivers);
|
||||
if(y_receivers) cudaFree(y_receivers);
|
||||
}
|
||||
|
||||
|
||||
#endif /* CONTAINER_DEVICE_H_ */
|
||||
131
TVALGPU/include/container_host.h
Normal file
131
TVALGPU/include/container_host.h
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
* container.h
|
||||
*
|
||||
* Created on: Mar 21, 2011
|
||||
* Author: ditlevsen
|
||||
*
|
||||
* Class mat_host for handling matrices/vectors in conjunction with C-BLAS
|
||||
*/
|
||||
|
||||
#ifndef CONTAINER_HOST_H_
|
||||
#define CONTAINER_HOST_H_
|
||||
#pragma once
|
||||
#include <cuda_runtime.h>
|
||||
#include "handle_error.h"
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
// enumerations...
|
||||
enum T_alloc {allocated_in_constructor, preallocated_buffer};
|
||||
enum T_mat_format {mat_col_major, mat_row_major};
|
||||
enum T_sparse_mat_format {sparse_mat_csc, sparse_mat_csr};
|
||||
|
||||
// exceptions...
|
||||
|
||||
class illegal_mat_gpu_assignment: public std::runtime_error {
|
||||
public:
|
||||
illegal_mat_gpu_assignment(const std::string& message) : std::runtime_error(message) {};
|
||||
};
|
||||
|
||||
//---------------------------------------
|
||||
// class mat_host...
|
||||
//---------------------------------------
|
||||
|
||||
class mat_host {
|
||||
private:
|
||||
float *pbuf;
|
||||
T_alloc alloc_info;
|
||||
public:
|
||||
const int dim_x;
|
||||
const int dim_y;
|
||||
const int dim_z;
|
||||
const int len;
|
||||
const T_mat_format format;
|
||||
const bool pagelocked;
|
||||
const unsigned int alloc_flags;
|
||||
|
||||
mat_host(int l_y, int l_x, int l_z, T_mat_format storage=mat_col_major, bool init=true, bool mem_pagelocked=false,
|
||||
unsigned int flags=cudaHostAllocDefault);
|
||||
mat_host(int num_elements, bool init=true, bool mem_pagelocked=false, unsigned int flags=cudaHostAllocDefault);
|
||||
mat_host(int l_y, int l_x, int l_z, float *buffer, T_mat_format storage=mat_col_major,
|
||||
bool mem_pagelocked=false, unsigned int flags=cudaHostAllocDefault);
|
||||
mat_host(int num_elements, float *buffer, bool mem_pagelocked=false, unsigned int flags=cudaHostAllocDefault);
|
||||
mat_host(const mat_host &m);
|
||||
~mat_host();
|
||||
|
||||
float *data() { return pbuf; }
|
||||
const float *data() const { return pbuf; }
|
||||
|
||||
float &operator[](int i) { return pbuf[i]; }
|
||||
const float &operator[](int i) const { return pbuf[i]; }
|
||||
|
||||
mat_host &operator=(const mat_host &m) throw(illegal_mat_gpu_assignment);
|
||||
|
||||
void randomFill(float scl);
|
||||
};
|
||||
|
||||
|
||||
//---------------------------------------
|
||||
// class sparse_mat_host...
|
||||
//---------------------------------------
|
||||
|
||||
class sparse_mat_host {
|
||||
private:
|
||||
int *p_ptr;
|
||||
int *p_ind;
|
||||
float *p_val;
|
||||
T_alloc alloc_info;
|
||||
public:
|
||||
const int dim_y;
|
||||
const int dim_x;
|
||||
const int nnz;
|
||||
const T_sparse_mat_format format;
|
||||
const bool pagelocked;
|
||||
const unsigned int alloc_flags;
|
||||
|
||||
sparse_mat_host(int num_dim_y, int num_dim_x, int n_nonzero, T_sparse_mat_format storage_format=sparse_mat_csc, bool init=true,
|
||||
bool mem_pagelocked=false, unsigned int flags=cudaHostAllocDefault);
|
||||
sparse_mat_host(int num_dim_y, int num_dim_x, int n_nonzero, float *buff_val, int *buff_ptr, int *buff_ind,
|
||||
T_sparse_mat_format storage_format=sparse_mat_csc,
|
||||
bool mem_pagelocked=false, unsigned int flags=cudaHostAllocDefault);
|
||||
sparse_mat_host(const sparse_mat_host &m);
|
||||
~sparse_mat_host();
|
||||
|
||||
float *val() { return p_val; }
|
||||
const float *val() const { return p_val; }
|
||||
|
||||
int *ptr() { return p_ptr; }
|
||||
const int *ptr() const { return p_ptr; }
|
||||
|
||||
int *ind() { return p_ind; }
|
||||
const int *ind() const { return p_ind; }
|
||||
|
||||
sparse_mat_host &operator=(const sparse_mat_host &m) throw(illegal_mat_gpu_assignment);
|
||||
};
|
||||
|
||||
//---------------------------------------
|
||||
// Struct geomety_host
|
||||
//---------------------------------------
|
||||
|
||||
struct geometry_host {
|
||||
int *x_emitters;
|
||||
int *y_emitters;
|
||||
int *z_emitters;
|
||||
int *x_receivers;
|
||||
int *y_receivers;
|
||||
int *z_receivers;
|
||||
int num_emitters;
|
||||
int num_receivers;
|
||||
int rv_x;
|
||||
int rv_y;
|
||||
int rv_z;
|
||||
float scale_factor;
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* CONTAINER_HOST_H_ */
|
||||
40
TVALGPU/include/handle_error.h
Normal file
40
TVALGPU/include/handle_error.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#ifndef HANDLE_ERROR_H_
|
||||
#define HANDLE_ERROR_H_
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
|
||||
class cuda_exception: public std::runtime_error {
|
||||
public:
|
||||
cuda_exception(const std::string& message) : std::runtime_error(message) {};
|
||||
};
|
||||
|
||||
static void handle_error(cudaError_t error, const char *file, int line ) {
|
||||
if (error != cudaSuccess) {
|
||||
std::stringstream ss;
|
||||
ss << file << ", line " << line << ": " << cudaGetErrorString(error) << "\n";
|
||||
throw cuda_exception(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
static void handle_error(cublasStatus_t error, const char *file, int line ) {
|
||||
if (error != CUBLAS_STATUS_SUCCESS) {
|
||||
std::stringstream ss;
|
||||
ss << file << ", line " << line << ": cublas error " << error << "\n";
|
||||
throw cuda_exception(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
static void handle_error(cusparseStatus_t error, const char *file, int line ) {
|
||||
if (error != CUSPARSE_STATUS_SUCCESS) {
|
||||
std::stringstream ss;
|
||||
ss << file << ", line " << line << ": cusparse error " << error << "\n";
|
||||
throw cuda_exception(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
#define HANDLE_ERROR(error) (handle_error(error, __FILE__, __LINE__ ))
|
||||
|
||||
|
||||
#endif /* HANDLE_ERROR_H_ */
|
||||
350
TVALGPU/include/mat_vec_mul.h
Normal file
350
TVALGPU/include/mat_vec_mul.h
Normal file
@@ -0,0 +1,350 @@
|
||||
/*
|
||||
* mat_vec_mul.h
|
||||
*
|
||||
* Created on: Jul 13, 2011
|
||||
* Author: ditlevsen
|
||||
*/
|
||||
#ifndef MAT_VEC_MUL_H_
|
||||
#define MAT_VEC_MUL_H_
|
||||
|
||||
struct sparse_mm {
|
||||
sparse_mat_device A_csc;
|
||||
sparse_mat_device A_csr;
|
||||
cusparseHandle_t cs_handle;
|
||||
cusparseMatDescr_t descrA;
|
||||
sparse_mm(const sparse_mat_host &A, cusparseHandle_t handle, cusparseMatDescr_t descriptor) :
|
||||
A_csc(A), A_csr(A.dim_y, A.dim_x, A.nnz, sparse_mat_csr, false), cs_handle(handle),
|
||||
descrA(descriptor)
|
||||
{
|
||||
|
||||
|
||||
/* Old CUDA 4.0 implementation for reference
|
||||
HANDLE_ERROR(cusparseScsr2csc(cs_handle, A.dim_x, A.dim_y, A_csc.val(),
|
||||
A_csc.ptr(), A_csc.ind(), A_csr.val(), A_csr.ind(),
|
||||
A_csr.ptr(), 1, CUSPARSE_INDEX_BASE_ZERO));
|
||||
*/
|
||||
|
||||
/* CUDA 10.1 adaption (Feb. 2021)
|
||||
cusparseStatus_t cusparseScsr2csc(cusparseHandle_t handle,
|
||||
int m,
|
||||
int n,
|
||||
int nnz,
|
||||
const float* csrVal,
|
||||
const int* csrRowPtr,
|
||||
const int* csrColInd,
|
||||
float* cscVal,
|
||||
int* cscRowInd,
|
||||
int* cscColPtr,
|
||||
cusparseAction_t copyValues,
|
||||
cusparseIndexBase_t idxBase)
|
||||
|
||||
*/
|
||||
|
||||
HANDLE_ERROR(cusparseScsr2csc(cs_handle,
|
||||
A.dim_x, // m
|
||||
A.dim_y, // n
|
||||
A.nnz, // nnz
|
||||
A_csc.val(), // csrVal
|
||||
A_csc.ptr(), // csrRowPtr
|
||||
A_csc.ind(), // csrColInd
|
||||
A_csr.val(), // cscVal
|
||||
A_csr.ind(), // cscRowInd
|
||||
A_csr.ptr(), // cscColPtr
|
||||
CUSPARSE_ACTION_NUMERIC, // copyValues
|
||||
CUSPARSE_INDEX_BASE_ZERO)); // idxBase
|
||||
|
||||
|
||||
|
||||
|
||||
/* TO DO: adaption to CUDA 11. Above function was replaced with the following function
|
||||
|
||||
cusparseStatus_t cusparseCsr2cscEx2(cusparseHandle_t handle,
|
||||
int m,
|
||||
int n,
|
||||
int nnz,
|
||||
const void* csrVal,
|
||||
const int* csrRowPtr,
|
||||
const int* csrColInd,
|
||||
void* cscVal,
|
||||
int* cscColPtr,
|
||||
int* cscRowInd,
|
||||
cudaDataType valType,
|
||||
cusparseAction_t copyValues,
|
||||
cusparseIndexBase_t idxBase,
|
||||
cusparseCsr2CscAlg_t alg,
|
||||
void* buffer)
|
||||
|
||||
*/
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
struct dense_mm {
|
||||
mat_device A;
|
||||
cublasHandle_t cb_handle;
|
||||
dense_mm(const mat_host &A_host, cublasHandle_t handle) : A(A_host, true), cb_handle(handle) {};
|
||||
};
|
||||
|
||||
inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A, const float *x, float *y,
|
||||
cudaStream_t stream=0)
|
||||
{
|
||||
int n, m;
|
||||
const sparse_mat_device *mA;
|
||||
const float alpha = 1;
|
||||
const float beta = 0;
|
||||
|
||||
|
||||
if(transA == CUBLAS_OP_N) {
|
||||
n = A.A_csr.dim_x;
|
||||
m = A.A_csr.dim_y;
|
||||
mA = &A.A_csr;
|
||||
} else {
|
||||
n = A.A_csr.dim_y;
|
||||
m = A.A_csr.dim_x;
|
||||
mA = &A.A_csc; // implicit transponation...
|
||||
}
|
||||
|
||||
/* Old CUDA 4 implementation
|
||||
HANDLE_ERROR(cusparseSetKernelStream(A.cs_handle, stream)); // one problem here
|
||||
*/
|
||||
|
||||
|
||||
/* CUDA 10.1 adaption (Feb. 2021)*/
|
||||
HANDLE_ERROR(cusparseSetStream(A.cs_handle, stream));
|
||||
|
||||
|
||||
/* Old CUDA 4 implementation
|
||||
return cusparseScsrmv(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, 1, A.descrA, mA->val(),mA->ptr(), mA->ind(), x, 0, y);
|
||||
*/
|
||||
|
||||
|
||||
|
||||
/* CUDA 10.1 adaption (Feb. 2021)
|
||||
cusparseStatus_tcusparseScsrmv(cusparseHandle_t handle,
|
||||
cusparseOperation_t transA,
|
||||
int m,
|
||||
int n,
|
||||
int nnz,
|
||||
const float* alpha,
|
||||
const cusparseMatDescr_t descrA,
|
||||
const float* csrValA,
|
||||
const int* csrRowPtrA,
|
||||
const int* csrColIndA,
|
||||
const float* x,
|
||||
const float* beta,
|
||||
float* y);
|
||||
|
||||
*/
|
||||
|
||||
cusparseStatus_t status;
|
||||
status = cusparseScsrmv(A.cs_handle,
|
||||
CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
m,
|
||||
n,
|
||||
A.A_csr.nnz,
|
||||
&alpha,
|
||||
A.descrA,
|
||||
mA->val(),
|
||||
mA->ptr(),
|
||||
mA->ind(),
|
||||
x,
|
||||
&beta,
|
||||
y);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const dense_mm &A, const float *x, float *y,
|
||||
cudaStream_t stream=0) {
|
||||
int n = A.A.dim_x;
|
||||
int m = A.A.dim_y;
|
||||
cublasStatus_t status;
|
||||
|
||||
float alpha=1, beta=0;
|
||||
|
||||
HANDLE_ERROR(cublasSetStream(A.cb_handle, stream));
|
||||
|
||||
status = cublasSgemv(A.cb_handle, trans, m, n, &alpha, A.A.data_dev_ptr(), A.A.leading_dim(), x, 1, &beta, y, 1);
|
||||
|
||||
HANDLE_ERROR(cublasSetStream(A.cb_handle, 0));
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
__device__ float atomicAdd_cc1(float* address, float val) {
|
||||
int* address_as_int = (int*)address;
|
||||
int old = *address_as_int, assumed;
|
||||
do {
|
||||
assumed = old;
|
||||
old = atomicCAS(address_as_int, assumed, __float_as_int(val + __int_as_float(assumed)));
|
||||
} while (assumed != old);
|
||||
return __int_as_float(old);
|
||||
}
|
||||
|
||||
// To do: atomic operations...!!
|
||||
__device__ void mult_element(cublasOperation_t transA, int ray, int x_pixel, int y_pixel, int z_pixel, int dim_x, int dim_y, float scale_factor, const float *x, float *y) {
|
||||
int lin_index = x_pixel*dim_y + y_pixel + z_pixel*dim_x*dim_y;
|
||||
if(transA == CUBLAS_OP_N) {
|
||||
y[ray] += x[lin_index] * scale_factor;
|
||||
} else {
|
||||
#if __CUDA_ARCH__ < 200
|
||||
atomicAdd_cc1(y + lin_index, x[ray] * scale_factor);
|
||||
#else
|
||||
atomicAdd(y + lin_index, x[ray] * scale_factor);
|
||||
#endif
|
||||
//y[lin_index] += x[ray] * scale_factor;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
__global__ void dyn_calc_kernel(cublasOperation_t transA, int num_emitters, int num_receivers, int rv_x, int rv_y, int rv_z,
|
||||
float scale_factor, const int *x_receivers, const int *y_receivers, const int *z_receivers, const int *x_emitters,
|
||||
const int *y_emitters, const int *z_emitters, const float *x, float *y) {
|
||||
|
||||
int index_r, index_e, inc_r, inc_e;
|
||||
int d1, d2, d3, inc1, m1, inc2, m2, inc3, m3, x_pixel, y_pixel, z_pixel, ray, err_1, err_2;
|
||||
int *pixel1, *pixel2, *pixel3;
|
||||
|
||||
// different assignment of paths to threads for multiplication with or without transposition
|
||||
// (performance reasons)
|
||||
// no transposition: x-> receivers, y-> emitters
|
||||
// transposition: x-> emitters, y-> receivers
|
||||
if(transA == CUBLAS_OP_N) {
|
||||
index_r = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
index_e = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
inc_r = blockDim.x * gridDim.x;
|
||||
inc_e = blockDim.y * gridDim.y;
|
||||
} else {
|
||||
index_e = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
index_r = threadIdx.y + blockIdx.y * blockDim.y;
|
||||
inc_e = blockDim.x * gridDim.x;
|
||||
inc_r = blockDim.y * gridDim.y;
|
||||
}
|
||||
|
||||
for(int receiver=index_r; receiver < num_receivers; receiver+= inc_r) {
|
||||
for(int emitter=index_e; emitter < num_emitters; emitter+= inc_e) {
|
||||
|
||||
ray = emitter*num_receivers + receiver;
|
||||
|
||||
// trace path using Bresenham algorithm...
|
||||
|
||||
x_pixel = x_emitters[emitter];
|
||||
y_pixel = y_emitters[emitter];
|
||||
z_pixel = z_emitters[emitter];
|
||||
|
||||
mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);
|
||||
|
||||
d1 = x_receivers[receiver] - x_pixel;
|
||||
d2 = y_receivers[receiver] - y_pixel;
|
||||
d3 = z_receivers[receiver] - z_pixel;
|
||||
|
||||
m1 = abs(d1);
|
||||
m2 = abs(d2);
|
||||
m3 = abs(d3);
|
||||
|
||||
if (m1 >= m2 && m1 >= m3) {
|
||||
pixel1 = &x_pixel;
|
||||
pixel2 = &y_pixel;
|
||||
pixel3 = &z_pixel;
|
||||
} else if(m2 >= m1 && m2 >= m3) {
|
||||
int tmp = d1;
|
||||
d1 = d2;
|
||||
d2 = tmp;
|
||||
|
||||
pixel1 = &y_pixel;
|
||||
pixel2 = &x_pixel;
|
||||
pixel3 = &z_pixel;
|
||||
} else {
|
||||
int tmp = d1;
|
||||
d1 = d3;
|
||||
d3 = d2;
|
||||
d2 = tmp;
|
||||
|
||||
pixel1 = &z_pixel;
|
||||
pixel2 = &x_pixel;
|
||||
pixel3 = &y_pixel;
|
||||
}
|
||||
|
||||
inc1 = (d1 < 0) ? -1 : 1;
|
||||
inc2 = (d2 < 0) ? -1 : 1;
|
||||
inc3 = (d3 < 0) ? -1 : 1;
|
||||
|
||||
m1 = abs(d1);
|
||||
m2 = abs(d2);
|
||||
m3 = abs(d3);
|
||||
|
||||
err_1 = 2 * m2 - m1;
|
||||
err_2 = 2 * m3 - m1;
|
||||
for(int j = 1; j < m1+1; j++) {
|
||||
if (err_1 > 0) {
|
||||
*pixel2 += inc2;
|
||||
err_1 -= 2 * m1;
|
||||
}
|
||||
if (err_2 > 0) {
|
||||
*pixel3 += inc3;
|
||||
err_2 -= 2 * m1;
|
||||
}
|
||||
err_1 += 2 * m2;
|
||||
err_2 += 2 * m3;
|
||||
*pixel1 += inc1;
|
||||
mult_element(transA, ray, x_pixel, y_pixel, z_pixel, rv_x, rv_y, scale_factor, x, y);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// matrix-vector multiplication with dynamic calculation of the measurement matrix...
|
||||
inline cublasStatus_t mat_vec_mul(cublasOperation_t trans, const geometry_device &A, const float *x, float *y,
|
||||
cudaStream_t stream=0)
|
||||
{
|
||||
int len_y = (trans == CUBLAS_OP_N) ? A.num_emitters * A.num_receivers : A.rv_x * A.rv_y * A.rv_z;
|
||||
cudaMemset(y, 0, len_y * sizeof(float));
|
||||
dim3 threads, blocks;
|
||||
|
||||
// recht Willkürliche Dimensionierung (Performance ca. 10 % schlechter, als mit jeweils bester gefundener Dimensionierung
|
||||
threads.x = 64;
|
||||
threads.y = 4;
|
||||
if(trans == CUBLAS_OP_N) {
|
||||
blocks.x = max(A.num_receivers / threads.x, 1);
|
||||
blocks.y = max(A.num_emitters / threads.x, 1);
|
||||
} else {
|
||||
blocks.y = max(A.num_receivers / threads.x, 1);
|
||||
blocks.x = max(A.num_emitters / threads.x, 1);
|
||||
}
|
||||
|
||||
// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 64x64x64:
|
||||
/* if(trans == CUBLAS_OP_N) {
|
||||
threads.x = 32;
|
||||
threads.y = 8;
|
||||
blocks.x = 40;
|
||||
blocks.y = 78;
|
||||
} else {
|
||||
threads.x = 128;
|
||||
threads.y = 2;
|
||||
blocks.x = 1;
|
||||
blocks.y =16;
|
||||
}*/
|
||||
|
||||
// beste gefundene Dimensionierung für USCT2-Geometrie, Volumen: 128x128x128:
|
||||
/* if(trans == CUBLAS_OP_N) {
|
||||
threads.x = 64;
|
||||
threads.y = 4;
|
||||
blocks.x = 22;
|
||||
blocks.y = 152;
|
||||
} else {
|
||||
threads.x = 64;
|
||||
threads.y = 8;
|
||||
blocks.x = 8;
|
||||
blocks.y = 26;
|
||||
}*/
|
||||
|
||||
dyn_calc_kernel<<<blocks, threads, 0, stream>>>(trans, A.num_emitters, A.num_receivers, A.rv_x, A.rv_y, A.rv_z,
|
||||
A.scale_factor, A.x_re_dev_ptr(), A.y_re_dev_ptr(), A.z_re_dev_ptr(), A.x_em_dev_ptr(), A.y_em_dev_ptr(),
|
||||
A.z_em_dev_ptr(), x, y);
|
||||
return CUBLAS_STATUS_SUCCESS;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* MAT_VEC_MUL_H_ */
|
||||
25
TVALGPU/include/tval3_gpu.h
Normal file
25
TVALGPU/include/tval3_gpu.h
Normal file
@@ -0,0 +1,25 @@
|
||||
/*
|
||||
* tval3.h
|
||||
*
|
||||
* Created on: Mar 14, 2011
|
||||
* Author: ditlevsen
|
||||
*/
|
||||
|
||||
#ifndef TVAL3_GPU_H_
|
||||
#define TVAL3_GPU_H_
|
||||
|
||||
#include "container_host.h"
|
||||
#include <complex>
|
||||
|
||||
#include "tval3_types.h"
|
||||
|
||||
extern const tval3_info tval3_gpu(mat_host &U, const sparse_mat_host &A, const mat_host &b,
|
||||
const tval3_options &opts, const mat_host &Ut=0, bool pagelocked=true);
|
||||
|
||||
extern const tval3_info tval3_gpu(mat_host &U, const mat_host &A, const mat_host &b,
|
||||
const tval3_options &opts, const mat_host &Ut=0, bool pagelocked=true);
|
||||
|
||||
extern const tval3_info tval3_gpu(mat_host &U, const geometry_host &A, const mat_host &b,
|
||||
const tval3_options &opts, const mat_host &Ut, bool pagelocked);
|
||||
|
||||
#endif /* TVAL3_GPU_H_ */
|
||||
54
TVALGPU/include/tval3_types.h
Normal file
54
TVALGPU/include/tval3_types.h
Normal file
@@ -0,0 +1,54 @@
|
||||
/*
|
||||
* tval3_floats.h
|
||||
*
|
||||
* Created on: Jun 9, 2011
|
||||
* Author: ditlevsen
|
||||
*/
|
||||
|
||||
#ifndef TVAL3_floatS_H_
|
||||
#define TVAL3_floatS_H_
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
struct tval3_options {
|
||||
float mu; // maximum value for penalty parameter mu, see (2.12)
|
||||
// mu is mainly decided by noise level. Set mu big when b is noise-free,
|
||||
// set mu small when b is very noisy
|
||||
// suggested values for mu: 2⁴..2¹³
|
||||
float mu0; // initial value for mu
|
||||
float beta; // maximum value for penalty parameter beta, see (2.12)
|
||||
float beta0; // initial value for beta
|
||||
float rate_cnt; // continuation parameter for both mu and beta
|
||||
float tol; // outer loop tolerance
|
||||
float tol_inn; // inner loop tolerance
|
||||
int maxcnt; // maximum number of outer iterations
|
||||
int maxit; // maximum number of iterations (total)
|
||||
float c; // corresponds to the parameter delta in (2.33) (arminjo condition)
|
||||
float gamma; // corresponds to rho "Algorithm 2" on page 29
|
||||
float gam; // corresponds to parameter eta in (2.34)
|
||||
// Control the degree of nonmonotonicity. 0 corresponds to monotone line search.
|
||||
// The best convergence is obtained by using values closer to 1 when the iterates
|
||||
// are far from the optimum, and using values closer to 0 when near an optimum.
|
||||
float rate_gam; // shrinkage rate of gam
|
||||
bool isreal;
|
||||
bool nonneg;
|
||||
tval3_options(): mu(256.0f), mu0(256.0f), beta(32.0f), beta0(32.0f), rate_cnt(2.0f),
|
||||
tol(1e-6f), tol_inn(1e-3f), maxcnt(10), maxit(1025), c(1e-5f), gamma(0.6f),
|
||||
gam(0.9995f), rate_gam(0.9f), isreal(true), nonneg(true) {};
|
||||
};
|
||||
|
||||
struct tval3_info {
|
||||
int total_iters;
|
||||
int outer_iters;
|
||||
float rel_chg;
|
||||
float rel_error;
|
||||
double secs;
|
||||
};
|
||||
|
||||
class tval3_gpu_exception: public std::runtime_error {
|
||||
public:
|
||||
tval3_gpu_exception(const std::string& message) : std::runtime_error(message) {};
|
||||
};
|
||||
|
||||
#endif /* TVAL3_floatS_H_ */
|
||||
4
TVALGPU/include/tval3gpu3d.h
Normal file
4
TVALGPU/include/tval3gpu3d.h
Normal file
@@ -0,0 +1,4 @@
|
||||
#include <mex.h>
|
||||
extern "C"{
|
||||
void TVALGPU(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) ;
|
||||
}
|
||||
Reference in New Issue
Block a user