From fc9b6be9e82c0b7a338d0c2a101c06bb663f1429 Mon Sep 17 00:00:00 2001 From: sunwen Date: Mon, 27 Nov 2023 09:47:59 +0800 Subject: [PATCH] Add repmat and repmat3d, Fix sqrt with complex. --- src/CudaMatrix.cpp | 4 ++ src/Function1D.cu | 101 ++++++++++++++++++++++++++++++++++ src/Function1D.cuh | 6 ++ test/Function1D_Cuda_Test.cpp | 52 ++++++++++++++++- 4 files changed, 161 insertions(+), 2 deletions(-) diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp index 8df61c0..2e97443 100644 --- a/src/CudaMatrix.cpp +++ b/src/CudaMatrix.cpp @@ -147,6 +147,10 @@ CudaMatrix CudaMatrix::deepCopy() const Matrix CudaMatrix::toHostMatrix() const { + if(!mData.get()) + { + return Matrix(); + } unsigned long long size = getDataSize() * getValueType(); float* data = new float[size]; cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost); diff --git a/src/Function1D.cu b/src/Function1D.cu index 776844a..7897138 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -195,6 +195,11 @@ __global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix) { + if(aMatrix.getValueType() == Aurora::Complex) + { + std::cerr<<"sqrt not support complex"< 2 || aMatrix.isNull()) + { + return CudaMatrix(); + } + + dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); + dim3 gridSize(aRowTimes, aColumnTimes, 1); + + size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes; + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + repMatKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType()); +} + +CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes) +{ + if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) + { + return CudaMatrix(); + } + + dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); + dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); + + size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + repMatKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); +} + +__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) +{ + unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; + unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; + if(aIsComplex) + { + unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX); + unsigned int inPutIndex = 2 * (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x); + aOutput[outPutIndex] = aInputData[inPutIndex]; + aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1]; + } + else + { + aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x]; + } + +} + +CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes) +{ + if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull()) + { + return CudaMatrix(); + } + + dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); + dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); + + size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + repMat3DKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); +} \ No newline at end of file diff --git a/src/Function1D.cuh b/src/Function1D.cuh index d7ba7dd..800e600 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -39,6 +39,12 @@ namespace Aurora CudaMatrix sign(const CudaMatrix& aMatrix); CudaMatrix sign(const CudaMatrix&& aMatrix); + + CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes); + + CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes); + + CudaMatrix repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes); } #endif //AURORA_CUDA_FUNCTION1D_H \ No newline at end of file diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp index bfdaf3f..78239cd 100644 --- a/test/Function1D_Cuda_Test.cpp +++ b/test/Function1D_Cuda_Test.cpp @@ -163,8 +163,8 @@ TEST_F(Function1D_Cuda_Test, sqrt) deviceMatrix = hostMatrix.toDeviceMatrix(); result1 = Aurora::sqrt(hostMatrix); result2 = Aurora::sqrt(deviceMatrix).toHostMatrix(); - EXPECT_EQ(result2.getDataSize(), 4); - EXPECT_EQ(result2.getValueType(), Aurora::Complex); + EXPECT_EQ(result2.getDataSize(), result1.getDataSize()); + EXPECT_EQ(result2.getValueType(), result1.getValueType()); for(size_t i=0; i