#include "CudaMatrix.h" #include "Function1D.cuh" #include "Matrix.h" #include #include #include #include #include using namespace Aurora; namespace { const int THREADS_PER_BLOCK = 256; } __global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[2*idx] = aInputData[idx]; aOutput[2*idx + 1] = 0; } } CudaMatrix Aurora::complex(const CudaMatrix& aMatrix) { if(aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; complexKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex); } __global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = aInputData[idx*2]; } } CudaMatrix Aurora::real(const CudaMatrix& aMatrix) { if(!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; realKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } __global__ void imageKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = aInputData[idx*2 + 1]; } } CudaMatrix Aurora::imag(const CudaMatrix& aMatrix) { if(!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; imageKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } __global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = std::ceil(aInputData[idx]); } } CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } __global__ void roundKernel(float* 
__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = roundf(aInputData[idx]);
    }
}

CudaMatrix Aurora::round(const CudaMatrix& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::round(const CudaMatrix&& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = floorf(aInputData[idx]);
    }
}

CudaMatrix Aurora::floor(const CudaMatrix& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = sqrtf(aInputData[idx]);
    }
}

CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}
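// The ceil/round/floor/sqrt wrappers above all repeat the same
// allocate/launch/synchronize/wrap sequence. A sketch of one way the pattern
// could be factored; unaryOpKernel, applyUnaryOp and SqrtOp are illustrative
// names, not part of the existing Aurora API.
template <typename Op>
__global__ void unaryOpKernel(const float* aInputData, float* aOutput, unsigned int aSize, Op aOp) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = aOp(aInputData[idx]);
    }
}

struct SqrtOp {
    __device__ float operator()(float aValue) const { return sqrtf(aValue); }
};

template <typename Op>
static CudaMatrix applyUnaryOp(const CudaMatrix& aMatrix, Op aOp) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    unaryOpKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aOp);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}
// With such a helper, Aurora::sqrt(aMatrix) would reduce to
// applyUnaryOp(aMatrix, SqrtOp{}).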
__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        if (aIsComplex) {
            // Magnitude of the interleaved complex value (real, imag).
            aOutput[idx] = sqrtf(aInputData[2 * idx] * aInputData[2 * idx] +
                                 aInputData[2 * idx + 1] * aInputData[2 * idx + 1]);
        } else {
            aOutput[idx] = fabsf(aInputData[idx]);
        }
    }
}

CudaMatrix Aurora::abs(const CudaMatrix& aMatrix) {
    // The result is always real-valued, even for complex input.
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2));
}

CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix) {
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2));
}

__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        if (aIsComplex) {
            // Complex sign: scale the value to unit magnitude.
            float absValue = sqrtf(aInputData[2 * idx] * aInputData[2 * idx] +
                                   aInputData[2 * idx + 1] * aInputData[2 * idx + 1]);
            aOutput[2 * idx] = aInputData[2 * idx] / absValue;
            aOutput[2 * idx + 1] = aInputData[2 * idx + 1] / absValue;
            return;
        }
        if (aInputData[idx] < 0) {
            aOutput[idx] = -1;
        } else if (aInputData[idx] > 0) {
            aOutput[idx] = 1;
        } else {
            aOutput[idx] = 0;
        }
    }
}

CudaMatrix Aurora::sign(const CudaMatrix& aMatrix) {
    // The kernel is indexed per logical element (getDataSize()); for complex
    // input each thread writes both components of its element.
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                           aMatrix.getDimSize(2), aMatrix.getValueType());
}
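// Illustrative usage sketch (not part of the original file): wrap a small host
// buffer in a CudaMatrix via fromRawData and chain the element-wise helpers
// defined above. The (rows, columns, slices) dimension order and the ownership
// semantics of fromRawData are assumptions here.
#include <vector>

static CudaMatrix exampleAbsSqrt() {
    std::vector<float> host(16, -2.5f);

    float* device = nullptr;
    cudaMalloc((void**)&device, sizeof(float) * host.size());
    cudaMemcpy(device, host.data(), sizeof(float) * host.size(), cudaMemcpyHostToDevice);

    // Treat the buffer as a real-valued 4x4x1 matrix.
    CudaMatrix m = Aurora::CudaMatrix::fromRawData(device, 4, 4, 1, Aurora::Normal);

    // |x| element-wise, then sqrt(|x|) element-wise.
    return Aurora::sqrt(Aurora::abs(m));
}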