feat: make TVAL support CUDA 12
This commit is contained in:
@@ -4,7 +4,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include "cuda_runtime.h"
|
#include "cuda_runtime.h"
|
||||||
#include "cublas.h"
|
#include "cublas_v2.h"
|
||||||
#include "cusparse.h"
|
#include "cusparse.h"
|
||||||
|
|
||||||
class cuda_exception: public std::runtime_error {
|
class cuda_exception: public std::runtime_error {
|
||||||
|
|||||||
@@ -7,6 +7,7 @@
|
|||||||
#ifndef MAT_VEC_MUL_H_
|
#ifndef MAT_VEC_MUL_H_
|
||||||
#define MAT_VEC_MUL_H_
|
#define MAT_VEC_MUL_H_
|
||||||
|
|
||||||
|
#include <cuda.h>
|
||||||
#include "container_device.h"
|
#include "container_device.h"
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
struct sparse_mm {
|
struct sparse_mm {
|
||||||
@@ -190,6 +191,20 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
|
|||||||
cusparseCreateDnVec(&vecX,n, (void*)x, cudaDataType::CUDA_R_32F);
|
cusparseCreateDnVec(&vecX,n, (void*)x, cudaDataType::CUDA_R_32F);
|
||||||
cusparseCreateDnVec(&vecY,m, (void*)y, cudaDataType::CUDA_R_32F);
|
cusparseCreateDnVec(&vecY,m, (void*)y, cudaDataType::CUDA_R_32F);
|
||||||
unsigned long bufferSize = 0;
|
unsigned long bufferSize = 0;
|
||||||
|
//CUDA VERSION > 11, use new API
|
||||||
|
#if CUDA_VERSION >= 11000
|
||||||
|
status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
|
||||||
|
vecX,
|
||||||
|
&beta,
|
||||||
|
vecY,
|
||||||
|
cudaDataType::CUDA_R_32F,
|
||||||
|
CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
|
||||||
|
void* buffer = NULL;
|
||||||
|
cudaMalloc(&buffer,bufferSize);
|
||||||
|
status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||||
|
&alpha, matA, vecX, &beta, vecY,
|
||||||
|
CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, buffer);
|
||||||
|
#else
|
||||||
status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
|
status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
|
||||||
vecX,
|
vecX,
|
||||||
&beta,
|
&beta,
|
||||||
@@ -201,6 +216,7 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
|
|||||||
status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||||
&alpha, matA, vecX, &beta, vecY,
|
&alpha, matA, vecX, &beta, vecY,
|
||||||
CUDA_R_32F, CUSPARSE_CSRMV_ALG1, buffer);
|
CUDA_R_32F, CUSPARSE_CSRMV_ALG1, buffer);
|
||||||
|
#endif
|
||||||
cusparseDestroySpMat(matA);
|
cusparseDestroySpMat(matA);
|
||||||
cusparseDestroyDnVec(vecX);
|
cusparseDestroyDnVec(vecX);
|
||||||
cusparseDestroyDnVec(vecY);
|
cusparseDestroyDnVec(vecY);
|
||||||
|
|||||||
@@ -17,12 +17,6 @@ cudaEvent_t start_part, stop_part;
|
|||||||
// dynamically allocated shared memory array
|
// dynamically allocated shared memory array
|
||||||
extern __shared__ float buffer[];
|
extern __shared__ float buffer[];
|
||||||
|
|
||||||
// texture references...
|
|
||||||
texture<float, 1> texRef;
|
|
||||||
texture<float, 1> texRefX;
|
|
||||||
texture<float, 1> texRefY;
|
|
||||||
texture<float, 1> texRefZ;
|
|
||||||
|
|
||||||
cudaChannelFormatDesc channelDesc;
|
cudaChannelFormatDesc channelDesc;
|
||||||
|
|
||||||
cudaStream_t stream1, stream2, stream3;
|
cudaStream_t stream1, stream2, stream3;
|
||||||
@@ -222,47 +216,6 @@ __global__ void D_kernel_no_tex(const float *U, float *Ux, float *Uy, float *Uz,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void D_kernel_tex(const float *U, float *Ux, float *Uy, float *Uz, int dim_x, int dim_y, int dim_z) {
|
|
||||||
|
|
||||||
int x, y, z, lin_index;
|
|
||||||
|
|
||||||
float tmp;
|
|
||||||
|
|
||||||
y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
while(y < dim_y) {
|
|
||||||
x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
while(x < dim_x) {
|
|
||||||
z = blockIdx.z * blockDim.z + threadIdx.z;
|
|
||||||
while(z < dim_z) {
|
|
||||||
lin_index = z * dim_x * dim_y + y * dim_x + x;
|
|
||||||
|
|
||||||
tmp = tex1Dfetch(texRef, lin_index);
|
|
||||||
|
|
||||||
if (x < dim_x - 1) {
|
|
||||||
Ux[lin_index] = tex1Dfetch(texRef, lin_index + 1) - tmp;
|
|
||||||
} else {
|
|
||||||
Ux[lin_index] = tex1Dfetch(texRef, lin_index - x) - tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (y < dim_y - 1) {
|
|
||||||
Uy[lin_index] = tex1Dfetch(texRef, lin_index + dim_x) - tmp;
|
|
||||||
} else {
|
|
||||||
Uy[lin_index] = tex1Dfetch(texRef, lin_index - y * dim_x) - tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (z < dim_z - 1) {
|
|
||||||
Uz[lin_index] = tex1Dfetch(texRef, lin_index + dim_x * dim_y) - tmp;
|
|
||||||
} else {
|
|
||||||
Uz[lin_index] = tex1Dfetch(texRef, lin_index - z * dim_x * dim_y) - tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
z += gridDim.z * blockDim.z;
|
|
||||||
}
|
|
||||||
x += gridDim.x * blockDim.x;
|
|
||||||
}
|
|
||||||
y += gridDim.y * blockDim.y;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// matrices stored in column major format
|
// matrices stored in column major format
|
||||||
void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) {
|
void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) {
|
||||||
@@ -270,7 +223,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
|
|||||||
HANDLE_ERROR(cudaEventRecord(start_part));
|
HANDLE_ERROR(cudaEventRecord(start_part));
|
||||||
#endif
|
#endif
|
||||||
dim3 threads, blocks;
|
dim3 threads, blocks;
|
||||||
if(majorRevision >= 2) {
|
|
||||||
threads.x = 32;
|
threads.x = 32;
|
||||||
threads.y = 4;
|
threads.y = 4;
|
||||||
threads.z = 1;
|
threads.z = 1;
|
||||||
@@ -278,21 +230,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
|
|||||||
blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
|
blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
|
||||||
blocks.z = max((int)round((double)U.dim_z / threads.z), 1);
|
blocks.z = max((int)round((double)U.dim_z / threads.z), 1);
|
||||||
D_kernel_no_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
|
D_kernel_no_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
|
||||||
} else {
|
|
||||||
threads.x = 64;
|
|
||||||
threads.y = 4;
|
|
||||||
threads.z = 1;
|
|
||||||
blocks.x = max((int)round((double)U.dim_x / threads.x), 1);
|
|
||||||
blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
|
|
||||||
blocks.z = 1;
|
|
||||||
|
|
||||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRef, U.data_dev_ptr(), &channelDesc,
|
|
||||||
U.len * sizeof(float)));
|
|
||||||
|
|
||||||
D_kernel_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
|
|
||||||
|
|
||||||
HANDLE_ERROR(cudaUnbindTexture(texRef));
|
|
||||||
}
|
|
||||||
#ifdef PROFILING
|
#ifdef PROFILING
|
||||||
HANDLE_ERROR(cudaEventRecord(stop_part));
|
HANDLE_ERROR(cudaEventRecord(stop_part));
|
||||||
float elapsedTime;
|
float elapsedTime;
|
||||||
@@ -330,42 +267,11 @@ __global__ void Dt_kernel_no_tex(const float *X, const float *Y, const float *Z,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// texture mem...
|
|
||||||
__global__ void Dt_kernel_tex(const float *X, const float *Y, const float *Z, float *res, int dim_x, int dim_y, int dim_z) {
|
|
||||||
|
|
||||||
int x, y, z, lin_index;
|
|
||||||
float xp, yp, zp;
|
|
||||||
|
|
||||||
y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
||||||
while(y < dim_y) {
|
|
||||||
x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
||||||
while(x < dim_x) {
|
|
||||||
z = blockIdx.z * blockDim.z + threadIdx.z;
|
|
||||||
while(z < dim_y) {
|
|
||||||
|
|
||||||
lin_index = z * dim_x * dim_y + y * dim_x + x;
|
|
||||||
|
|
||||||
xp = (x == 0) ? tex1Dfetch(texRefX, lin_index + dim_x - 1) : tex1Dfetch(texRefX, lin_index - 1);
|
|
||||||
yp = (y == 0) ? tex1Dfetch(texRefY, lin_index + (dim_y - 1) * dim_x) : tex1Dfetch(texRefY, lin_index - dim_x);
|
|
||||||
yp = (z == 0) ? tex1Dfetch(texRefZ, lin_index + (dim_z - 1) * dim_x * dim_y) : tex1Dfetch(texRefZ, lin_index - dim_x * dim_y);
|
|
||||||
|
|
||||||
res[lin_index] = xp - tex1Dfetch(texRefX, lin_index) + yp - tex1Dfetch(texRefY, lin_index) + zp -
|
|
||||||
tex1Dfetch(texRefZ, lin_index);
|
|
||||||
|
|
||||||
z += blockDim.z * gridDim.z;
|
|
||||||
}
|
|
||||||
x += blockDim.x * gridDim.x;
|
|
||||||
}
|
|
||||||
y += blockDim.y * gridDim.y;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat_device &Z, cudaStream_t stream = 0) {
|
void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat_device &Z, cudaStream_t stream = 0) {
|
||||||
#ifdef PROFILING
|
#ifdef PROFILING
|
||||||
HANDLE_ERROR(cudaEventRecord(start_part));
|
HANDLE_ERROR(cudaEventRecord(start_part));
|
||||||
#endif
|
#endif
|
||||||
dim3 threads, blocks;
|
dim3 threads, blocks;
|
||||||
if(majorRevision >= 2) {
|
|
||||||
threads.x = 32;
|
threads.x = 32;
|
||||||
threads.y = 4;
|
threads.y = 4;
|
||||||
threads.z = 1;
|
threads.z = 1;
|
||||||
@@ -373,24 +279,6 @@ void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat
|
|||||||
blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
|
blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
|
||||||
blocks.z = max((int)round((double)X.dim_z / threads.z), 1);
|
blocks.z = max((int)round((double)X.dim_z / threads.z), 1);
|
||||||
Dt_kernel_no_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
|
Dt_kernel_no_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
|
||||||
} else {
|
|
||||||
threads.x = 32;
|
|
||||||
threads.y = 2;
|
|
||||||
threads.z = 8;
|
|
||||||
blocks.x = max((int)round((double)X.dim_x / threads.x), 1);
|
|
||||||
blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
|
|
||||||
blocks.z = 1;
|
|
||||||
|
|
||||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefX, X.data_dev_ptr(), &channelDesc, X.len * sizeof(float)));
|
|
||||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefY, Y.data_dev_ptr(), &channelDesc, Y.len * sizeof(float)));
|
|
||||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefZ, Z.data_dev_ptr(), &channelDesc, Z.len * sizeof(float)));
|
|
||||||
|
|
||||||
Dt_kernel_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
|
|
||||||
|
|
||||||
HANDLE_ERROR(cudaUnbindTexture(texRefX));
|
|
||||||
HANDLE_ERROR(cudaUnbindTexture(texRefY));
|
|
||||||
HANDLE_ERROR(cudaUnbindTexture(texRefZ));
|
|
||||||
}
|
|
||||||
#ifdef PROFILING
|
#ifdef PROFILING
|
||||||
HANDLE_ERROR(cudaEventRecord(stop_part));
|
HANDLE_ERROR(cudaEventRecord(stop_part));
|
||||||
float elapsedTime;
|
float elapsedTime;
|
||||||
|
|||||||
Reference in New Issue
Block a user