Merge branch 'dtof' of http://192.168.1.9:3000/Bug/Aurora into dtof

kradchen committed 2023-11-27 09:35:42 +08:00
3 changed files with 572 additions and 0 deletions

src/Function1D.cu (normal file, 304 additions)

@@ -0,0 +1,304 @@
#include "CudaMatrix.h"
#include "Function1D.cuh"
#include "Matrix.h"
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h>
#include <cuda_runtime.h>
using namespace Aurora;
namespace
{
const int THREADS_PER_BLOCK = 256;
}
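// Complex data is stored interleaved: the real part of element i is at index 2*i
// and the imaginary part at 2*i + 1. Each wrapper below launches a 1D grid of
// ceil(size / THREADS_PER_BLOCK) blocks with THREADS_PER_BLOCK threads per block.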
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[2*idx] = aInputData[idx];
aOutput[2*idx + 1] = 0;
}
}
CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
{
if(aMatrix.isComplex())
{
return CudaMatrix();
}
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
complexKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
}
__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = aInputData[idx*2];
}
}
CudaMatrix Aurora::real(const CudaMatrix& aMatrix)
{
if(!aMatrix.isComplex())
{
return CudaMatrix();
}
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
realKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
}
__global__ void imageKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = aInputData[idx*2 + 1];
}
}
CudaMatrix Aurora::imag(const CudaMatrix& aMatrix)
{
if(!aMatrix.isComplex())
{
return CudaMatrix();
}
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
imageKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
}
__global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = ceilf(aInputData[idx]);
}
}
CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = roundf(aInputData[idx]);
}
}
CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = floorf(aInputData[idx]);
}
}
CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
aOutput[idx] = sqrtf(aInputData[idx]);
}
}
CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
if(aIsComplex)
{
aOutput[idx] = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
}
else
{
aOutput[idx] = fabsf(aInputData[idx]);
}
}
}
CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}
CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}
__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aSize)
{
if(aIsComplex)
{
float absValue = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]);
// Avoid dividing by zero: the sign of a zero complex value is zero.
aOutput[2*idx] = absValue > 0 ? aInputData[2*idx] / absValue : 0;
aOutput[2*idx + 1] = absValue > 0 ? aInputData[2*idx + 1] / absValue : 0;
return;
}
if(aInputData[idx] < 0)
{
aOutput[idx] = -1;
}
else if(aInputData[idx] > 0)
{
aOutput[idx] = 1;
}
else
{
aOutput[idx] = 0;
}
}
}
CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

src/Function1D.cuh (normal file, 44 additions)

@@ -0,0 +1,44 @@
#ifndef AURORA_CUDA_FUNCTION1D_H
#define AURORA_CUDA_FUNCTION1D_H
#include "CudaMatrix.h"
namespace Aurora
{
CudaMatrix complex(const CudaMatrix& aMatrix);
CudaMatrix real(const CudaMatrix& aMatrix);
CudaMatrix imag(const CudaMatrix& aMatrix);
CudaMatrix ceil(const CudaMatrix& aMatrix);
CudaMatrix ceil(const CudaMatrix&& aMatrix);
CudaMatrix round(const CudaMatrix& aMatrix);
CudaMatrix round(const CudaMatrix&& aMatrix);
CudaMatrix floor(const CudaMatrix& aMatrix);
CudaMatrix floor(const CudaMatrix&& aMatrix);
/**
* Square root; for now only positive integers are supported.
* @param aMatrix
* @return element-wise square root
*/
CudaMatrix sqrt(const CudaMatrix& aMatrix);
CudaMatrix sqrt(const CudaMatrix&& aMatrix);
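/**
* Absolute value. For complex input the result is a real matrix of magnitudes
* sqrt(re*re + im*im); for real input it is the element-wise absolute value.
*/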
CudaMatrix abs(const CudaMatrix& aMatrix);
CudaMatrix abs(const CudaMatrix&& aMatrix);
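/**
* Sign function: -1, 0 or 1 element-wise for real input; x / |x| element-wise for complex input.
*/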
CudaMatrix sign(const CudaMatrix& aMatrix);
CudaMatrix sign(const CudaMatrix&& aMatrix);
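// Minimal usage sketch (illustrative only; `input` is assumed to be an existing real CudaMatrix):
//   CudaMatrix magnitudes = Aurora::abs(input);   // element-wise |x|
//   CudaMatrix rounded = Aurora::round(magnitudes);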
}
#endif //AURORA_CUDA_FUNCTION1D_H