feat: make TVAL support CUDA 12

kradchen
2024-12-16 16:32:19 +08:00
parent 3f69b3bb98
commit 587986ef6c
3 changed files with 17 additions and 113 deletions

View File

@@ -4,7 +4,7 @@
 #include <string>
 #include <sstream>
 #include "cuda_runtime.h"
-#include "cublas.h"
+#include "cublas_v2.h"
 #include "cusparse.h"
 class cuda_exception: public std::runtime_error {
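Note: cublas_v2.h exposes the handle-based cuBLAS API that superseded the implicit-context legacy API in cublas.h. A minimal sketch of the v2 calling convention, for context only (saxpy_v2 is an illustrative name, not code from this repo):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    // y = alpha * x + y on device arrays, v2 style: explicit handle,
    // scalars passed by pointer, a status code returned from every call
    void saxpy_v2(int n, float alpha, const float* d_x, float* d_y) {
        cublasHandle_t handle;
        cublasCreate(&handle);
        cublasSaxpy(handle, n, &alpha, d_x, 1, d_y, 1);
        cublasDestroy(handle);
    }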

View File

@@ -7,6 +7,7 @@
 #ifndef MAT_VEC_MUL_H_
 #define MAT_VEC_MUL_H_
+#include <cuda.h>
 #include "container_device.h"
 #include <cublas_v2.h>
 struct sparse_mm {
@@ -190,6 +191,20 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
 cusparseCreateDnVec(&vecX,n, (void*)x, cudaDataType::CUDA_R_32F);
 cusparseCreateDnVec(&vecY,m, (void*)y, cudaDataType::CUDA_R_32F);
 unsigned long bufferSize = 0;
+// CUDA version >= 11: use the new generic SpMV API
+#if CUDA_VERSION >= 11000
+status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
+                                 vecX,
+                                 &beta,
+                                 vecY,
+                                 cudaDataType::CUDA_R_32F,
+                                 CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
+void* buffer = NULL;
+cudaMalloc(&buffer, bufferSize);
+status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                      &alpha, matA, vecX, &beta, vecY,
+                      CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, buffer);
+#else
 status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA,
 vecX,
 &beta,
@@ -201,6 +216,7 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A
 status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
 &alpha, matA, vecX, &beta, vecY,
 CUDA_R_32F, CUSPARSE_CSRMV_ALG1, buffer);
+#endif
 cusparseDestroySpMat(matA);
 cusparseDestroyDnVec(vecX);
 cusparseDestroyDnVec(vecY);
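For reference, the new #if branch is the generic SpMV flow that cuSPARSE introduced in CUDA 11; CUDA 12 drops the deprecated CUSPARSE_CSRMV_ALG1 name in favor of CUSPARSE_SPMV_CSR_ALG1, which is what forces the version guard here. A self-contained sketch of that flow; names like spmv_csr, d_rowPtr, d_colInd, and d_vals are placeholders, not this repo's:

    #include <cusparse.h>
    #include <cuda_runtime.h>

    // y = alpha * A * x + beta * y, A in CSR with 32-bit indices, float values
    void spmv_csr(cusparseHandle_t handle, int m, int n, int nnz,
                  int* d_rowPtr, int* d_colInd, float* d_vals,
                  const float* d_x, float* d_y, float alpha, float beta) {
        cusparseSpMatDescr_t matA;
        cusparseDnVecDescr_t vecX, vecY;
        cusparseCreateCsr(&matA, m, n, nnz, d_rowPtr, d_colInd, d_vals,
                          CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                          CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
        cusparseCreateDnVec(&vecX, n, (void*)d_x, CUDA_R_32F);
        cusparseCreateDnVec(&vecY, m, d_y, CUDA_R_32F);

        size_t bufferSize = 0;                 // workspace sized by the library
        cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                &alpha, matA, vecX, &beta, vecY,
                                CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, &bufferSize);
        void* dBuffer = NULL;
        cudaMalloc(&dBuffer, bufferSize);
        cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                     &alpha, matA, vecX, &beta, vecY,
                     CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, dBuffer);

        cudaFree(dBuffer);
        cusparseDestroySpMat(matA);
        cusparseDestroyDnVec(vecX);
        cusparseDestroyDnVec(vecY);
    }

One design note: unlike the removed legacy path, the generic API takes an explicit workspace; allocating it once and freeing it when done (the new branch above cudaMallocs buffer on every call and never frees it) would be the natural next refinement.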

View File

@@ -17,12 +17,6 @@ cudaEvent_t start_part, stop_part;
 // dynamically allocated shared memory array
 extern __shared__ float buffer[];
-// texture references...
-texture<float, 1> texRef;
-texture<float, 1> texRefX;
-texture<float, 1> texRefY;
-texture<float, 1> texRefZ;
 cudaChannelFormatDesc channelDesc;
 cudaStream_t stream1, stream2, stream3;
@@ -222,47 +216,6 @@ __global__ void D_kernel_no_tex(const float *U, float *Ux, float *Uy, float *Uz,
 }
 }
-__global__ void D_kernel_tex(const float *U, float *Ux, float *Uy, float *Uz, int dim_x, int dim_y, int dim_z) {
-    int x, y, z, lin_index;
-    float tmp;
-    y = blockIdx.y * blockDim.y + threadIdx.y;
-    while(y < dim_y) {
-        x = blockIdx.x * blockDim.x + threadIdx.x;
-        while(x < dim_x) {
-            z = blockIdx.z * blockDim.z + threadIdx.z;
-            while(z < dim_z) {
-                lin_index = z * dim_x * dim_y + y * dim_x + x;
-                tmp = tex1Dfetch(texRef, lin_index);
-                if (x < dim_x - 1) {
-                    Ux[lin_index] = tex1Dfetch(texRef, lin_index + 1) - tmp;
-                } else {
-                    Ux[lin_index] = tex1Dfetch(texRef, lin_index - x) - tmp;
-                }
-                if (y < dim_y - 1) {
-                    Uy[lin_index] = tex1Dfetch(texRef, lin_index + dim_x) - tmp;
-                } else {
-                    Uy[lin_index] = tex1Dfetch(texRef, lin_index - y * dim_x) - tmp;
-                }
-                if (z < dim_z - 1) {
-                    Uz[lin_index] = tex1Dfetch(texRef, lin_index + dim_x * dim_y) - tmp;
-                } else {
-                    Uz[lin_index] = tex1Dfetch(texRef, lin_index - z * dim_x * dim_y) - tmp;
-                }
-                z += gridDim.z * blockDim.z;
-            }
-            x += gridDim.x * blockDim.x;
-        }
-        y += gridDim.y * blockDim.y;
-    }
-}
 // matrices stored in column major format
 void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) {
@@ -270,7 +223,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
 HANDLE_ERROR(cudaEventRecord(start_part));
 #endif
 dim3 threads, blocks;
-if(majorRevision >= 2) {
 threads.x = 32;
 threads.y = 4;
 threads.z = 1;
@@ -278,21 +230,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U)
 blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
 blocks.z = max((int)round((double)U.dim_z / threads.z), 1);
 D_kernel_no_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
-} else {
-    threads.x = 64;
-    threads.y = 4;
-    threads.z = 1;
-    blocks.x = max((int)round((double)U.dim_x / threads.x), 1);
-    blocks.y = max((int)round((double)U.dim_y / threads.y), 1);
-    blocks.z = 1;
-    HANDLE_ERROR(cudaBindTexture(NULL, &texRef, U.data_dev_ptr(), &channelDesc,
-                 U.len * sizeof(float)));
-    D_kernel_tex<<<blocks, threads>>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z);
-    HANDLE_ERROR(cudaUnbindTexture(texRef));
-}
 #ifdef PROFILING
 HANDLE_ERROR(cudaEventRecord(stop_part));
 float elapsedTime;
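Background on the deletions in this file: texture references (texture<...>, cudaBindTexture, tex1Dfetch) were removed from CUDA 12, so the compute-capability dispatch and the *_tex kernels no longer compile, and the no-tex path becomes unconditional. If cached read-only loads were ever wanted back, __ldg() gives a similar effect with no bind/unbind step. A hedged sketch covering only the x-direction difference (D_kernel_ldg is an illustrative name, not part of this commit):

    // Forward difference along x with wrap-around at the row end,
    // using the read-only data cache instead of a bound texture
    __global__ void D_kernel_ldg(const float* __restrict__ U, float* Ux,
                                 int dim_x, int len) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < len) {
            int x = i % dim_x;
            float tmp = __ldg(&U[i]);
            int right = (x < dim_x - 1) ? i + 1 : i - x;  // wrap to row start
            Ux[i] = __ldg(&U[right]) - tmp;
        }
    }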
@@ -330,42 +267,11 @@ __global__ void Dt_kernel_no_tex(const float *X, const float *Y, const float *Z,
 }
 }
-// texture mem...
-__global__ void Dt_kernel_tex(const float *X, const float *Y, const float *Z, float *res, int dim_x, int dim_y, int dim_z) {
-    int x, y, z, lin_index;
-    float xp, yp, zp;
-    y = blockIdx.y * blockDim.y + threadIdx.y;
-    while(y < dim_y) {
-        x = blockIdx.x * blockDim.x + threadIdx.x;
-        while(x < dim_x) {
-            z = blockIdx.z * blockDim.z + threadIdx.z;
-            while(z < dim_y) {
-                lin_index = z * dim_x * dim_y + y * dim_x + x;
-                xp = (x == 0) ? tex1Dfetch(texRefX, lin_index + dim_x - 1) : tex1Dfetch(texRefX, lin_index - 1);
-                yp = (y == 0) ? tex1Dfetch(texRefY, lin_index + (dim_y - 1) * dim_x) : tex1Dfetch(texRefY, lin_index - dim_x);
-                yp = (z == 0) ? tex1Dfetch(texRefZ, lin_index + (dim_z - 1) * dim_x * dim_y) : tex1Dfetch(texRefZ, lin_index - dim_x * dim_y);
-                res[lin_index] = xp - tex1Dfetch(texRefX, lin_index) + yp - tex1Dfetch(texRefY, lin_index) + zp -
-                                 tex1Dfetch(texRefZ, lin_index);
-                z += blockDim.z * gridDim.z;
-            }
-            x += blockDim.x * gridDim.x;
-        }
-        y += blockDim.y * gridDim.y;
-    }
-}
 void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat_device &Z, cudaStream_t stream = 0) {
 #ifdef PROFILING
 HANDLE_ERROR(cudaEventRecord(start_part));
 #endif
 dim3 threads, blocks;
-if(majorRevision >= 2) {
 threads.x = 32;
 threads.y = 4;
 threads.z = 1;
@@ -373,24 +279,6 @@ void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat
 blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
 blocks.z = max((int)round((double)X.dim_z / threads.z), 1);
 Dt_kernel_no_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
-} else {
-    threads.x = 32;
-    threads.y = 2;
-    threads.z = 8;
-    blocks.x = max((int)round((double)X.dim_x / threads.x), 1);
-    blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1);
-    blocks.z = 1;
-    HANDLE_ERROR(cudaBindTexture(NULL, &texRefX, X.data_dev_ptr(), &channelDesc, X.len * sizeof(float)));
-    HANDLE_ERROR(cudaBindTexture(NULL, &texRefY, Y.data_dev_ptr(), &channelDesc, Y.len * sizeof(float)));
-    HANDLE_ERROR(cudaBindTexture(NULL, &texRefZ, Z.data_dev_ptr(), &channelDesc, Z.len * sizeof(float)));
-    Dt_kernel_tex<<<blocks, threads, 0, stream>>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z);
-    HANDLE_ERROR(cudaUnbindTexture(texRefX));
-    HANDLE_ERROR(cudaUnbindTexture(texRefY));
-    HANDLE_ERROR(cudaUnbindTexture(texRefZ));
-}
 #ifdef PROFILING
 HANDLE_ERROR(cudaEventRecord(stop_part));
 float elapsedTime;
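If a texture-backed path were ever reinstated, CUDA 12 still supports texture objects, which replace the global texture references deleted above and are passed to kernels by value. A hedged sketch of wrapping a linear float buffer (make_tex is an illustrative helper, not part of this commit):

    // Create a texture object over a linear device buffer; kernels then
    // read it with tex1Dfetch<float>(tex, index)
    cudaTextureObject_t make_tex(const float* d_data, size_t len) {
        cudaResourceDesc resDesc = {};
        resDesc.resType = cudaResourceTypeLinear;
        resDesc.res.linear.devPtr = const_cast<float*>(d_data);
        resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
        resDesc.res.linear.sizeInBytes = len * sizeof(float);

        cudaTextureDesc texDesc = {};
        texDesc.readMode = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
        return tex;  // release with cudaDestroyTextureObject(tex) when done
    }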