2023-11-21 10:16:51 +08:00
|
|
|
#include "CudaMatrix.h"
|
|
|
|
|
#include "Function1D.cuh"
|
|
|
|
|
#include "Matrix.h"
|
|
|
|
|
|
|
|
|
|
#include <cmath>
|
|
|
|
|
#include <thrust/device_vector.h>
|
|
|
|
|
#include <thrust/transform.h>
|
|
|
|
|
#include <thrust/iterator/constant_iterator.h>
|
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
|
|
|
|
|
|
using namespace Aurora;
|
|
|
|
|
|
|
|
|
|
namespace
|
|
|
|
|
{
|
|
|
|
|
const int THREADS_PER_BLOCK = 256;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[2*idx] = aInputData[idx];
|
|
|
|
|
aOutput[2*idx + 1] = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
if(aMatrix.isComplex())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
complexKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = aInputData[idx*2];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::real(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
if(!aMatrix.isComplex())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
realKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void imageKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = aInputData[idx*2 + 1];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::imag(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
if(!aMatrix.isComplex())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
imageKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = std::ceil(aInputData[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
ceilKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
ceilKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = std::round(aInputData[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
roundKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
roundKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = std::floor(aInputData[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
floorKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
floorKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = std::sqrt(aInputData[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
2023-11-27 09:47:59 +08:00
|
|
|
if(aMatrix.getValueType() == Aurora::Complex)
|
|
|
|
|
{
|
|
|
|
|
std::cerr<<"sqrt not support complex"<<std::endl;
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
2023-11-21 10:16:51 +08:00
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
sqrtKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
2023-11-27 09:47:59 +08:00
|
|
|
if(aMatrix.getValueType() == Aurora::Complex)
|
|
|
|
|
{
|
|
|
|
|
std::cerr<<"sqrt not support complex"<<std::endl;
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
2023-11-21 10:16:51 +08:00
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
sqrtKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
if(aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = abs(aInputData[idx]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
absKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
absKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
if (idx < aSize)
|
|
|
|
|
{
|
|
|
|
|
if(aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
float absValue = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]);
|
|
|
|
|
aOutput[2*idx] = aInputData[2*idx] / absValue;
|
|
|
|
|
aOutput[2*idx + 1] = aInputData[2*idx + 1] / absValue;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if(aInputData[idx] < 0)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = -1;
|
|
|
|
|
}
|
|
|
|
|
else if(aInputData[idx] > 0)
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = 1;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
aOutput[idx] = 0;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
signKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
|
|
|
|
|
{
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
|
|
|
signKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
2023-11-27 09:47:59 +08:00
|
|
|
|
|
|
|
|
__global__ void repMatKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
|
|
|
|
if(aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX);
|
|
|
|
|
unsigned int inPutIndex = 2 * (threadIdx.y * blockDim.x + threadIdx.x);
|
|
|
|
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
|
|
|
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.y * blockDim.x + threadIdx.x];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes)
|
|
|
|
|
{
|
|
|
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
|
|
|
|
|
dim3 gridSize(aRowTimes, aColumnTimes, 1);
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
|
|
|
|
|
{
|
|
|
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
|
|
|
|
|
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
|
|
|
|
if(aIsComplex)
|
|
|
|
|
{
|
|
|
|
|
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX);
|
|
|
|
|
unsigned int inPutIndex = 2 * (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);
|
|
|
|
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
|
|
|
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
|
|
|
|
|
{
|
|
|
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull())
|
|
|
|
|
{
|
|
|
|
|
return CudaMatrix();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
|
|
|
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
|
|
|
|
|
|
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
|
|
|
|
float* data = nullptr;
|
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
|
|
|
repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
|
|
|
cudaDeviceSynchronize();
|
|
|
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
|
|
|
|
}
|