From 9d8c204ec4d99f1c0227188447bf86781ed01dd6 Mon Sep 17 00:00:00 2001 From: sunwen Date: Mon, 27 Nov 2023 13:51:35 +0800 Subject: [PATCH] Add log function and fix start kernel block thread size. --- src/Function1D.cu | 65 ++++++++++++++++++++++++++++++++++------------ src/Function1D.cuh | 2 ++ 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/src/Function1D.cu b/src/Function1D.cu index 7897138..d2339f8 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -36,7 +36,7 @@ CudaMatrix Aurora::complex(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - complexKernel<<>>(aMatrix.getData(), data, size); + complexKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex); } @@ -61,7 +61,7 @@ CudaMatrix Aurora::real(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - realKernel<<>>(aMatrix.getData(), data, size); + realKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } @@ -86,7 +86,7 @@ CudaMatrix Aurora::imag(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - imageKernel<<>>(aMatrix.getData(), data, size); + imageKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } @@ -106,7 +106,7 @@ CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - ceilKernel<<>>(aMatrix.getData(), data, size); + ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -117,7 +117,7 @@ CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - ceilKernel<<>>(aMatrix.getData(), data, size); + ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -137,7 +137,7 @@ CudaMatrix Aurora::round(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - roundKernel<<>>(aMatrix.getData(), data, size); + roundKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -148,7 +148,7 @@ CudaMatrix Aurora::round(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - roundKernel<<>>(aMatrix.getData(), data, size); + roundKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -168,7 +168,7 @@ CudaMatrix Aurora::floor(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - floorKernel<<>>(aMatrix.getData(), data, size); + floorKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -179,7 +179,7 @@ CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - floorKernel<<>>(aMatrix.getData(), data, size); + floorKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -204,7 +204,7 @@ CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - sqrtKernel<<>>(aMatrix.getData(), data, size); + sqrtKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -220,7 +220,7 @@ CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - sqrtKernel<<>>(aMatrix.getData(), data, size); + sqrtKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -247,7 +247,7 @@ CudaMatrix Aurora::abs(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); + absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } @@ -258,7 +258,7 @@ CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); + absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } @@ -297,7 +297,7 @@ CudaMatrix Aurora::sign(const CudaMatrix& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -308,7 +308,7 @@ CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix) float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -402,4 +402,37 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn repMat3DKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); -} \ No newline at end of file +} + +__global__ void logKernel(float* aInputData, float* aOutput, unsigned int aInputSize, int aBaseNum) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + if(aBaseNum == -1) + { + aOutput[idx] = logf(aInputData[idx]); + } + else + { + float value = logf(aBaseNum); + aOutput[idx] = logf(aInputData[idx]) / value; + } + } +} + +CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum) +{ + if(aMatrix.getValueType() == Aurora::Complex) + { + std::cerr<<"log not support complex"<>>(aMatrix.getData(), data, size, aBaseNum); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} diff --git a/src/Function1D.cuh b/src/Function1D.cuh index 800e600..0bedd57 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -45,6 +45,8 @@ namespace Aurora CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes); CudaMatrix repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes); + + CudaMatrix log(const CudaMatrix& aMatrix, int aBaseNum = -1); } #endif //AURORA_CUDA_FUNCTION1D_H \ No newline at end of file