diff --git a/src/Function1D.cu b/src/Function1D.cu
new file mode 100644
index 0000000..776844a
--- /dev/null
+++ b/src/Function1D.cu
@@ -0,0 +1,304 @@
+#include "CudaMatrix.h"
+#include "Function1D.cuh"
+#include "Matrix.h"
+
+#include <cmath>
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+
+using namespace Aurora;
+
+namespace
+{
+    const int THREADS_PER_BLOCK = 256;
+}
+
+// Expands a real matrix into interleaved [re, im] pairs with zero imaginary parts.
+__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[2*idx] = aInputData[idx];
+        aOutput[2*idx + 1] = 0;
+    }
+}
+
+CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
+{
+    if(aMatrix.isComplex())
+    {
+        return CudaMatrix();
+    }
+
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    complexKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
+}
+
+// Extracts the real component from interleaved complex data.
+__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = aInputData[idx*2];
+    }
+}
+
+CudaMatrix Aurora::real(const CudaMatrix& aMatrix)
+{
+    if(!aMatrix.isComplex())
+    {
+        return CudaMatrix();
+    }
+
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    realKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
+}
+
+// Extracts the imaginary component from interleaved complex data.
+__global__ void imagKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = aInputData[idx*2 + 1];
+    }
+}
+
+CudaMatrix Aurora::imag(const CudaMatrix& aMatrix)
+{
+    if(!aMatrix.isComplex())
+    {
+        return CudaMatrix();
+    }
+
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    imagKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
+}
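The complex/real/imag wrappers above assume an interleaved device layout of [re0, im0, re1, im1, ...]. A minimal round-trip sketch, assuming only the Matrix/CudaMatrix conversion helpers (fromRawData, toDeviceMatrix, toHostMatrix) that the test file later in this patch also uses:

```cpp
// Sketch only: relies on the Matrix/CudaMatrix helpers exercised in Function1D_Cuda_Test.cpp below.
Aurora::Matrix host = Aurora::Matrix::fromRawData(new float[4]{1, 2, 3, 4}, 2, 2, 1);
Aurora::CudaMatrix device = host.toDeviceMatrix();

Aurora::CudaMatrix c  = Aurora::complex(device);         // device data becomes [1,0, 2,0, 3,0, 4,0]
Aurora::Matrix     re = Aurora::real(c).toHostMatrix();  // recovers {1, 2, 3, 4}
Aurora::Matrix     im = Aurora::imag(c).toHostMatrix();  // all zeros
```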
+
+// Element-wise ceil; for complex matrices both components are processed.
+__global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = ceilf(aInputData[idx]);
+    }
+}
+
+CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+// Element-wise round to nearest.
+__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = roundf(aInputData[idx]);
+    }
+}
+
+CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+// Element-wise floor.
+__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = floorf(aInputData[idx]);
+    }
+}
+
+CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+// Element-wise square root.
+__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        aOutput[idx] = sqrtf(aInputData[idx]);
+    }
+}
+
+CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
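Each const CudaMatrix&& overload in this file repeats its lvalue counterpart verbatim. Since a const rvalue cannot be moved from anyway, one possible simplification, shown here only as a sketch and not as part of the patch, is to delegate to the lvalue overload; the same pattern would apply to round, floor, sqrt, abs and sign.

```cpp
// Sketch: forward the const-rvalue overload instead of duplicating the body.
CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
{
    // Binding to a const lvalue reference selects Aurora::ceil(const CudaMatrix&).
    return Aurora::ceil(static_cast<const CudaMatrix&>(aMatrix));
}
```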
+
+CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+// Element-wise magnitude; for complex input this is the modulus of each [re, im] pair.
+__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        if(aIsComplex)
+        {
+            aOutput[idx] = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
+        }
+        else
+        {
+            aOutput[idx] = fabsf(aInputData[idx]);
+        }
+    }
+}
+
+CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
+}
+
+CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
+}
+
+// Element-wise sign; for complex input each element is normalised by its modulus.
+__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aSize)
+    {
+        if(aIsComplex)
+        {
+            float absValue = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]);
+            aOutput[2*idx] = aInputData[2*idx] / absValue;
+            aOutput[2*idx + 1] = aInputData[2*idx + 1] / absValue;
+            return;
+        }
+
+        if(aInputData[idx] < 0)
+        {
+            aOutput[idx] = -1;
+        }
+        else if(aInputData[idx] > 0)
+        {
+            aOutput[idx] = 1;
+        }
+        else
+        {
+            aOutput[idx] = 0;
+        }
+    }
+}
+
+CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
+
+CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
+{
+    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}
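Every wrapper in Function1D.cu repeats the allocate / launch / synchronize sequence and discards the CUDA return codes. A minimal error-checking helper, hypothetical and not part of this patch, could wrap those calls:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper: report failed CUDA API calls with file and line information.
#define AURORA_CUDA_CHECK(call)                                             \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",              \
                         cudaGetErrorString(err_), __FILE__, __LINE__);     \
        }                                                                   \
    } while (0)

// Usage inside the wrappers (sketch):
//   AURORA_CUDA_CHECK(cudaMalloc((void**)&data, sizeof(float) * size));
//   ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
//   AURORA_CUDA_CHECK(cudaGetLastError());
//   AURORA_CUDA_CHECK(cudaDeviceSynchronize());
```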
diff --git a/src/Function1D.cuh b/src/Function1D.cuh
new file mode 100644
index 0000000..d7ba7dd
--- /dev/null
+++ b/src/Function1D.cuh
@@ -0,0 +1,44 @@
+#ifndef AURORA_CUDA_FUNCTION1D_H
+#define AURORA_CUDA_FUNCTION1D_H
+
+#include "CudaMatrix.h"
+
+namespace Aurora
+{
+    CudaMatrix complex(const CudaMatrix& aMatrix);
+
+    CudaMatrix real(const CudaMatrix& aMatrix);
+
+    CudaMatrix imag(const CudaMatrix& aMatrix);
+
+    CudaMatrix ceil(const CudaMatrix& aMatrix);
+
+    CudaMatrix ceil(const CudaMatrix&& aMatrix);
+
+    CudaMatrix round(const CudaMatrix& aMatrix);
+
+    CudaMatrix round(const CudaMatrix&& aMatrix);
+
+    CudaMatrix floor(const CudaMatrix& aMatrix);
+
+    CudaMatrix floor(const CudaMatrix&& aMatrix);
+
+    /**
+     * Square root; for now only positive integers are supported.
+     * @param aMatrix
+     * @return
+     */
+    CudaMatrix sqrt(const CudaMatrix& aMatrix);
+
+    CudaMatrix sqrt(const CudaMatrix&& aMatrix);
+
+    CudaMatrix abs(const CudaMatrix& aMatrix);
+
+    CudaMatrix abs(const CudaMatrix&& aMatrix);
+
+    CudaMatrix sign(const CudaMatrix& aMatrix);
+
+    CudaMatrix sign(const CudaMatrix&& aMatrix);
+}
+
+#endif //AURORA_CUDA_FUNCTION1D_H
\ No newline at end of file
diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp
new file mode 100644
index 0000000..bfdaf3f
--- /dev/null
+++ b/test/Function1D_Cuda_Test.cpp
@@ -0,0 +1,224 @@
+#include <gtest/gtest.h>
+
+#include "CudaMatrix.h"
+#include "Matrix.h"
+#include "TestUtility.h"
+
+#include "Function1D.h"
+#include "Function1D.cuh"
+
+class Function1D_Cuda_Test : public ::testing::Test
+{
+protected:
+    static void SetUpFunction1DCudaTester(){
+    }
+    static void TearDownTestCase(){
+    }
+    void SetUp(){
+    }
+    void TearDown(){
+    }
+};
+
+TEST_F(Function1D_Cuda_Test, complex)
+{
+    Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1,2,3,4,5,6,7,8}, 2,2,2);
+    Aurora::CudaMatrix deviceMatrix = hostMatrix.toDeviceMatrix();
+
+    auto result1 = Aurora::complex(hostMatrix);
+    auto result2 = Aurora::complex(deviceMatrix).toHostMatrix();
+    EXPECT_EQ(result2.getDataSize(), 8);
+    EXPECT_EQ(result2.getValueType(), Aurora::Complex);
+    for(size_t i=0; i