feat: make TVAL support CUDA 12
This commit is contained in:
@@ -4,7 +4,7 @@
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "cuda_runtime.h"
|
||||
#include "cublas.h"
|
||||
#include "cublas_v2.h"
|
||||
#include "cusparse.h"
|
||||
|
||||
class cuda_exception: public std::runtime_error {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#ifndef MAT_VEC_MUL_H_
|
||||
#define MAT_VEC_MUL_H_
|
||||
|
||||
#include <cuda.h>
|
||||
#include "container_device.h"
|
||||
#include <cublas_v2.h>
|
||||
struct sparse_mm {
|
||||
@@ -190,6 +191,20 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
|
||||
cusparseCreateDnVec(&vecX,n, (void*)x, cudaDataType::CUDA_R_32F);
|
||||
cusparseCreateDnVec(&vecY,m, (void*)y, cudaDataType::CUDA_R_32F);
|
||||
unsigned long bufferSize = 0;
|
||||
//CUDA VERSION > 11, use new API
|
||||
#if CUDA_VERSION >= 11000
|
||||
status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
|
||||
vecX,
|
||||
&beta,
|
||||
vecY,
|
||||
cudaDataType::CUDA_R_32F,
|
||||
CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
|
||||
void* buffer = NULL;
|
||||
cudaMalloc(&buffer,bufferSize);
|
||||
status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
&alpha, matA, vecX, &beta, vecY,
|
||||
CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, buffer);
|
||||
#else
|
||||
status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
|
||||
vecX,
|
||||
&beta,
|
||||
@@ -201,6 +216,7 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
|
||||
status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
|
||||
&alpha, matA, vecX, &beta, vecY,
|
||||
CUDA_R_32F, CUSPARSE_CSRMV_ALG1, buffer);
|
||||
#endif
|
||||
cusparseDestroySpMat(matA);
|
||||
cusparseDestroyDnVec(vecX);
|
||||
cusparseDestroyDnVec(vecY);
|
||||
|
||||
@@ -17,12 +17,6 @@ cudaEvent_t start_part, stop_part;
|
||||
// dynamically allocated shared memory array
|
||||
extern __shared__ float buffer[];
|
||||
|
||||
// texture references...
|
||||
texture<float, 1> texRef;
|
||||
texture<float, 1> texRefX;
|
||||
texture<float, 1> texRefY;
|
||||
texture<float, 1> texRefZ;
|
||||
|
||||
cudaChannelFormatDesc channelDesc;
|
||||
|
||||
cudaStream_t stream1, stream2, stream3;
|
||||
@@ -222,47 +216,6 @@ __global__ void D_kernel_no_tex(const float *U, float *Ux, float *Uy, float *Uz,
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void D_kernel_tex(const float *U, float *Ux, float *Uy, float *Uz, int dim_x, int dim_y, int dim_z) {
|
||||
|
||||
int x, y, z, lin_index;
|
||||
|
||||
float tmp;
|
||||
|
||||
y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
while(y < dim_y) {
|
||||
x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
while(x < dim_x) {
|
||||
z = blockIdx.z * blockDim.z + threadIdx.z;
|
||||
while(z < dim_z) {
|
||||
lin_index = z * dim_x * dim_y + y * dim_x + x;
|
||||
|
||||
tmp = tex1Dfetch(texRef, lin_index);
|
||||
|
||||
if (x < dim_x - 1) {
|
||||
Ux[lin_index] = tex1Dfetch(texRef, lin_index + 1) - tmp;
|
||||
} else {
|
||||
Ux[lin_index] = tex1Dfetch(texRef, lin_index - x) - tmp;
|
||||
}
|
||||
|
||||
if (y < dim_y - 1) {
|
||||
Uy[lin_index] = tex1Dfetch(texRef, lin_index + dim_x) - tmp;
|
||||
} else {
|
||||
Uy[lin_index] = tex1Dfetch(texRef, lin_index - y * dim_x) - tmp;
|
||||
}
|
||||
|
||||
if (z < dim_z - 1) {
|
||||
Uz[lin_index] = tex1Dfetch(texRef, lin_index + dim_x * dim_y) - tmp;
|
||||
} else {
|
||||
Uz[lin_index] = tex1Dfetch(texRef, lin_index - z * dim_x * dim_y) - tmp;
|
||||
}
|
||||
|
||||
z += gridDim.z * blockDim.z;
|
||||
}
|
||||
x += gridDim.x * blockDim.x;
|
||||
}
|
||||
y += gridDim.y * blockDim.y;
|
||||
}
|
||||
}
|
||||
|
||||
// matrices stored in column major format
|
||||
void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) {
|
||||
@@ -270,7 +223,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
|
||||
HANDLE_ERROR(cudaEventRecord(start_part));
|
||||
#endif
|
||||
dim3 threads, blocks;
|
||||
if(majorRevision >= 2) {
|
||||
threads.x = 32;
|
||||
threads.y = 4;
|
||||
threads.z = 1;
|
||||
@@ -278,21 +230,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
|
||||
blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
|
||||
blocks.z = max((int)round((double)U.dim_z / threads.z), 1);
|
||||
D_kernel_no_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
|
||||
} else {
|
||||
threads.x = 64;
|
||||
threads.y = 4;
|
||||
threads.z = 1;
|
||||
blocks.x = max((int)round((double)U.dim_x / threads.x), 1);
|
||||
blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
|
||||
blocks.z = 1;
|
||||
|
||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRef, U.data_dev_ptr(), &channelDesc,
|
||||
U.len * sizeof(float)));
|
||||
|
||||
D_kernel_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
|
||||
|
||||
HANDLE_ERROR(cudaUnbindTexture(texRef));
|
||||
}
|
||||
#ifdef PROFILING
|
||||
HANDLE_ERROR(cudaEventRecord(stop_part));
|
||||
float elapsedTime;
|
||||
@@ -330,42 +267,11 @@ __global__ void Dt_kernel_no_tex(const float *X, const float *Y, const float *Z,
|
||||
}
|
||||
}
|
||||
|
||||
// texture mem...
|
||||
__global__ void Dt_kernel_tex(const float *X, const float *Y, const float *Z, float *res, int dim_x, int dim_y, int dim_z) {
|
||||
|
||||
int x, y, z, lin_index;
|
||||
float xp, yp, zp;
|
||||
|
||||
y = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
while(y < dim_y) {
|
||||
x = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
while(x < dim_x) {
|
||||
z = blockIdx.z * blockDim.z + threadIdx.z;
|
||||
while(z < dim_y) {
|
||||
|
||||
lin_index = z * dim_x * dim_y + y * dim_x + x;
|
||||
|
||||
xp = (x == 0) ? tex1Dfetch(texRefX, lin_index + dim_x - 1) : tex1Dfetch(texRefX, lin_index - 1);
|
||||
yp = (y == 0) ? tex1Dfetch(texRefY, lin_index + (dim_y - 1) * dim_x) : tex1Dfetch(texRefY, lin_index - dim_x);
|
||||
yp = (z == 0) ? tex1Dfetch(texRefZ, lin_index + (dim_z - 1) * dim_x * dim_y) : tex1Dfetch(texRefZ, lin_index - dim_x * dim_y);
|
||||
|
||||
res[lin_index] = xp - tex1Dfetch(texRefX, lin_index) + yp - tex1Dfetch(texRefY, lin_index) + zp -
|
||||
tex1Dfetch(texRefZ, lin_index);
|
||||
|
||||
z += blockDim.z * gridDim.z;
|
||||
}
|
||||
x += blockDim.x * gridDim.x;
|
||||
}
|
||||
y += blockDim.y * gridDim.y;
|
||||
}
|
||||
}
|
||||
|
||||
void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat_device &Z, cudaStream_t stream = 0) {
|
||||
#ifdef PROFILING
|
||||
HANDLE_ERROR(cudaEventRecord(start_part));
|
||||
#endif
|
||||
dim3 threads, blocks;
|
||||
if(majorRevision >= 2) {
|
||||
threads.x = 32;
|
||||
threads.y = 4;
|
||||
threads.z = 1;
|
||||
@@ -373,24 +279,6 @@ void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat
|
||||
blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
|
||||
blocks.z = max((int)round((double)X.dim_z / threads.z), 1);
|
||||
Dt_kernel_no_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
|
||||
} else {
|
||||
threads.x = 32;
|
||||
threads.y = 2;
|
||||
threads.z = 8;
|
||||
blocks.x = max((int)round((double)X.dim_x / threads.x), 1);
|
||||
blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
|
||||
blocks.z = 1;
|
||||
|
||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefX, X.data_dev_ptr(), &channelDesc, X.len * sizeof(float)));
|
||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefY, Y.data_dev_ptr(), &channelDesc, Y.len * sizeof(float)));
|
||||
HANDLE_ERROR(cudaBindTexture(NULL, &texRefZ, Z.data_dev_ptr(), &channelDesc, Z.len * sizeof(float)));
|
||||
|
||||
Dt_kernel_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
|
||||
|
||||
HANDLE_ERROR(cudaUnbindTexture(texRefX));
|
||||
HANDLE_ERROR(cudaUnbindTexture(texRefY));
|
||||
HANDLE_ERROR(cudaUnbindTexture(texRefZ));
|
||||
}
|
||||
#ifdef PROFILING
|
||||
HANDLE_ERROR(cudaEventRecord(stop_part));
|
||||
float elapsedTime;
|
||||
|
||||
Reference in New Issue
Block a user