diff --git a/TVALGPU/src/handle_error.h b/TVALGPU/src/handle_error.h index 2ca9609..843e5ae 100644 --- a/TVALGPU/src/handle_error.h +++ b/TVALGPU/src/handle_error.h @@ -4,7 +4,7 @@ #include #include #include "cuda_runtime.h" -#include "cublas.h" +#include "cublas_v2.h" #include "cusparse.h" class cuda_exception: public std::runtime_error { diff --git a/TVALGPU/src/mat_vec_mul.h b/TVALGPU/src/mat_vec_mul.h index b9cdde1..ece63b1 100644 --- a/TVALGPU/src/mat_vec_mul.h +++ b/TVALGPU/src/mat_vec_mul.h @@ -7,6 +7,7 @@ #ifndef MAT_VEC_MUL_H_ #define MAT_VEC_MUL_H_ +#include #include "container_device.h" #include struct sparse_mm { @@ -190,6 +191,20 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A cusparseCreateDnVec(&vecX,n, (void*)x, cudaDataType::CUDA_R_32F); cusparseCreateDnVec(&vecY,m, (void*)y, cudaDataType::CUDA_R_32F); unsigned long bufferSize = 0; + //CUDA VERSION > 11, use new API + #if CUDA_VERSION >= 11000 + status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA, + vecX, + &beta, + vecY, + cudaDataType::CUDA_R_32F, + CUSPARSE_SPMV_CSR_ALG1, &bufferSize); + void* buffer = NULL; + cudaMalloc(&buffer,bufferSize); + status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + &alpha, matA, vecX, &beta, vecY, + CUDA_R_32F, CUSPARSE_SPMV_CSR_ALG1, buffer); + #else status = cusparseSpMV_bufferSize(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE,&alpha,matA, vecX, &beta, @@ -201,6 +216,7 @@ inline cusparseStatus_t mat_vec_mul(cublasOperation_t transA, const sparse_mm &A status = cusparseSpMV(A.cs_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY, CUDA_R_32F, CUSPARSE_CSRMV_ALG1, buffer); + #endif cusparseDestroySpMat(matA); cusparseDestroyDnVec(vecX); cusparseDestroyDnVec(vecY); diff --git a/TVALGPU/src/tval3.cu b/TVALGPU/src/tval3.cu index 7dc012c..066d047 100644 --- a/TVALGPU/src/tval3.cu +++ b/TVALGPU/src/tval3.cu @@ -17,12 +17,6 @@ cudaEvent_t start_part, stop_part; // dynamically allocated shared memory array extern __shared__ float buffer[]; -// texture references... -texture texRef; -texture texRefX; -texture texRefY; -texture texRefZ; - cudaChannelFormatDesc channelDesc; cudaStream_t stream1, stream2, stream3; @@ -222,47 +216,6 @@ __global__ void D_kernel_no_tex(const float *U, float *Ux, float *Uy, float *Uz, } } -__global__ void D_kernel_tex(const float *U, float *Ux, float *Uy, float *Uz, int dim_x, int dim_y, int dim_z) { - - int x, y, z, lin_index; - - float tmp; - - y = blockIdx.y * blockDim.y + threadIdx.y; - while(y < dim_y) { - x = blockIdx.x * blockDim.x + threadIdx.x; - while(x < dim_x) { - z = blockIdx.z * blockDim.z + threadIdx.z; - while(z < dim_z) { - lin_index = z * dim_x * dim_y + y * dim_x + x; - - tmp = tex1Dfetch(texRef, lin_index); - - if (x < dim_x - 1) { - Ux[lin_index] = tex1Dfetch(texRef, lin_index + 1) - tmp; - } else { - Ux[lin_index] = tex1Dfetch(texRef, lin_index - x) - tmp; - } - - if (y < dim_y - 1) { - Uy[lin_index] = tex1Dfetch(texRef, lin_index + dim_x) - tmp; - } else { - Uy[lin_index] = tex1Dfetch(texRef, lin_index - y * dim_x) - tmp; - } - - if (z < dim_z - 1) { - Uz[lin_index] = tex1Dfetch(texRef, lin_index + dim_x * dim_y) - tmp; - } else { - Uz[lin_index] = tex1Dfetch(texRef, lin_index - z * dim_x * dim_y) - tmp; - } - - z += gridDim.z * blockDim.z; - } - x += gridDim.x * blockDim.x; - } - y += gridDim.y * blockDim.y; - } -} // matrices stored in column major format void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) { @@ -270,7 +223,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) HANDLE_ERROR(cudaEventRecord(start_part)); #endif dim3 threads, blocks; - if(majorRevision >= 2) { threads.x = 32; threads.y = 4; threads.z = 1; @@ -278,21 +230,6 @@ void D_gpu(mat_device &Ux, mat_device &Uy, mat_device &Uz, const mat_device &U) blocks.y = max((int)round((double)U.dim_y / threads.y), 1); blocks.z = max((int)round((double)U.dim_z / threads.z), 1); D_kernel_no_tex<<>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z); - } else { - threads.x = 64; - threads.y = 4; - threads.z = 1; - blocks.x = max((int)round((double)U.dim_x / threads.x), 1); - blocks.y = max((int)round((double)U.dim_y / threads.y), 1); - blocks.z = 1; - - HANDLE_ERROR(cudaBindTexture(NULL, &texRef, U.data_dev_ptr(), &channelDesc, - U.len * sizeof(float))); - - D_kernel_tex<<>>(U.data_dev_ptr(), Uy.data_dev_ptr(), Ux.data_dev_ptr(), Uz.data_dev_ptr(), U.dim_y, U.dim_x, U.dim_z); - - HANDLE_ERROR(cudaUnbindTexture(texRef)); - } #ifdef PROFILING HANDLE_ERROR(cudaEventRecord(stop_part)); float elapsedTime; @@ -330,42 +267,11 @@ __global__ void Dt_kernel_no_tex(const float *X, const float *Y, const float *Z, } } -// texture mem... -__global__ void Dt_kernel_tex(const float *X, const float *Y, const float *Z, float *res, int dim_x, int dim_y, int dim_z) { - - int x, y, z, lin_index; - float xp, yp, zp; - - y = blockIdx.y * blockDim.y + threadIdx.y; - while(y < dim_y) { - x = blockIdx.x * blockDim.x + threadIdx.x; - while(x < dim_x) { - z = blockIdx.z * blockDim.z + threadIdx.z; - while(z < dim_y) { - - lin_index = z * dim_x * dim_y + y * dim_x + x; - - xp = (x == 0) ? tex1Dfetch(texRefX, lin_index + dim_x - 1) : tex1Dfetch(texRefX, lin_index - 1); - yp = (y == 0) ? tex1Dfetch(texRefY, lin_index + (dim_y - 1) * dim_x) : tex1Dfetch(texRefY, lin_index - dim_x); - yp = (z == 0) ? tex1Dfetch(texRefZ, lin_index + (dim_z - 1) * dim_x * dim_y) : tex1Dfetch(texRefZ, lin_index - dim_x * dim_y); - - res[lin_index] = xp - tex1Dfetch(texRefX, lin_index) + yp - tex1Dfetch(texRefY, lin_index) + zp - - tex1Dfetch(texRefZ, lin_index); - - z += blockDim.z * gridDim.z; - } - x += blockDim.x * gridDim.x; - } - y += blockDim.y * gridDim.y; - } -} - void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat_device &Z, cudaStream_t stream = 0) { #ifdef PROFILING HANDLE_ERROR(cudaEventRecord(start_part)); #endif dim3 threads, blocks; - if(majorRevision >= 2) { threads.x = 32; threads.y = 4; threads.z = 1; @@ -373,24 +279,6 @@ void Dt_gpu(mat_device &res, const mat_device &X, const mat_device &Y, const mat blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1); blocks.z = max((int)round((double)X.dim_z / threads.z), 1); Dt_kernel_no_tex<<>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z); - } else { - threads.x = 32; - threads.y = 2; - threads.z = 8; - blocks.x = max((int)round((double)X.dim_x / threads.x), 1); - blocks.y = max((int)round((double)X.dim_y / threads.y * 0.6), 1); - blocks.z = 1; - - HANDLE_ERROR(cudaBindTexture(NULL, &texRefX, X.data_dev_ptr(), &channelDesc, X.len * sizeof(float))); - HANDLE_ERROR(cudaBindTexture(NULL, &texRefY, Y.data_dev_ptr(), &channelDesc, Y.len * sizeof(float))); - HANDLE_ERROR(cudaBindTexture(NULL, &texRefZ, Z.data_dev_ptr(), &channelDesc, Z.len * sizeof(float))); - - Dt_kernel_tex<<>>(Y.data_dev_ptr(), X.data_dev_ptr(), Z.data_dev_ptr(), res.data_dev_ptr(), X.dim_y, X.dim_x, X.dim_z); - - HANDLE_ERROR(cudaUnbindTexture(texRefX)); - HANDLE_ERROR(cudaUnbindTexture(texRefY)); - HANDLE_ERROR(cudaUnbindTexture(texRefZ)); - } #ifdef PROFILING HANDLE_ERROR(cudaEventRecord(stop_part)); float elapsedTime;