#include "CudaMatrix.h"
|
|
#include "AuroraDefs.h"
|
|
#include "Function1D.cuh"
|
|
#include "Function1D.h"
|
|
#include "Matrix.h"
|
|
|
|
#include <cmath>
|
|
#include <cstddef>
|
|
#include <cstdlib>
|
|
#include <thrust/device_vector.h>
|
|
#include <thrust/transform.h>
|
|
#include <thrust/iterator/constant_iterator.h>
|
|
#include <thrust/iterator/counting_iterator.h>
|
|
#include <thrust/complex.h>
|
|
#include <cuda_runtime.h>
|
|
#include <cusolverDn.h>
|
|
|
|
using namespace Aurora;
|
|
using namespace thrust::placeholders;
|
|
|
|
namespace
|
|
{
|
|
const int THREADS_PER_BLOCK = 256;
|
|
const int THREADS_PER_BLOCK_DIM2_X = 16;
|
|
const int THREADS_PER_BLOCK_DIM2_Y = 16;
|
|
const int THREADS_PER_BLOCK_DIM3_X = 8;
|
|
const int THREADS_PER_BLOCK_DIM3_Y = 8;
|
|
const int THREADS_PER_BLOCK_DIM3_Z = 8;
|
|
}
|
|
|
|
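// Complex data is stored interleaved: element i occupies floats 2*i (real part) and 2*i+1 (imaginary part).
// complexKernel widens a real buffer into that layout with the imaginary parts set to zero.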
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[2*idx] = aInputData[idx];
|
|
aOutput[2*idx + 1] = 0;
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
|
|
{
|
|
if(aMatrix.isComplex())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
complexKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
|
|
}
|
|
|
|
__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = aInputData[idx*2];
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::real(const CudaMatrix& aMatrix)
|
|
{
|
|
if(!aMatrix.isComplex())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
realKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
|
|
}
|
|
|
|
__global__ void imagKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = aInputData[idx*2 + 1];
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::imag(const CudaMatrix& aMatrix)
|
|
{
|
|
if(!aMatrix.isComplex())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
imagKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
|
|
}
|
|
|
|
__global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = std::ceil(aInputData[idx]);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = std::round(aInputData[idx]);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = std::floor(aInputData[idx]);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
aOutput[idx] = std::sqrt(aInputData[idx]);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
|
|
{
|
|
if(aMatrix.getValueType() == Aurora::Complex)
|
|
{
|
|
std::cerr<<"sqrt not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
|
|
{
|
|
if(aMatrix.getValueType() == Aurora::Complex)
|
|
{
|
|
std::cerr<<"sqrt not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
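// absKernel: magnitude sqrt(re^2 + im^2) for interleaved complex input, plain absolute value otherwise.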
__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
if(aIsComplex)
|
|
{
|
|
aOutput[idx] = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
|
|
}
|
|
else
|
|
{
|
|
aOutput[idx] = abs(aInputData[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
absKernel<<<blocksPerGrid,THREADS_PER_BLOCK >>>(aMatrix.getData(), data, size, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
}
|
|
|
|
CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
}
|
|
|
|
__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aSize)
|
|
{
|
|
if(aIsComplex)
{
float absValue = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]);
// sign(0 + 0i) is defined as 0, so guard against dividing by a zero magnitude
aOutput[2*idx] = absValue > 0 ? aInputData[2*idx] / absValue : 0;
aOutput[2*idx + 1] = absValue > 0 ? aInputData[2*idx + 1] / absValue : 0;
return;
}
|
|
|
|
if(aInputData[idx] < 0)
|
|
{
|
|
aOutput[idx] = -1;
|
|
}
|
|
else if(aInputData[idx] > 0)
|
|
{
|
|
aOutput[idx] = 1;
|
|
}
|
|
else
|
|
{
|
|
aOutput[idx] = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
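// nantoval replaces every NaN element in place with the supplied value using a single thrust::transform.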
void Aurora::nantoval(CudaMatrix& aMatrix,float val){
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return ::isnan(x)?val:x;
|
|
};
|
|
thrust::transform(thrust::device,aMatrix.getData(),
|
|
aMatrix.getData()+aMatrix.getDataSize(),aMatrix.getData(),lambda);
|
|
}
|
|
|
|
CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return ::isnan(x)?1.0:0;
|
|
};
|
|
thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),
|
|
data,lambda);
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
|
|
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return ::isfinite(x)?1.0:0;
|
|
};
|
|
thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),
|
|
data,lambda);
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
|
|
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
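// padding makes sure index aIndex exists and holds aValue; when the vector is too short,
// it is reallocated and the new tail is filled with aValue.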
void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){
if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex())
{
std::cerr<<"padding only supports real vectors"<<std::endl;
return;
}
if (aMatrix.getDataSize()>aIndex){
aMatrix.setValue(aIndex, aValue);
return;
}
//the vector is too short and has to be padded out to the requested index
size_t size = (aIndex+1);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
cudaMemcpy(data, aMatrix.getData(), sizeof(float) * aMatrix.getDataSize(), cudaMemcpyDeviceToDevice);
thrust::fill_n(thrust::device, data+aMatrix.getDataSize(),size-aMatrix.getDataSize(),aValue);
aMatrix=CudaMatrix::fromRawData(data,size,1,1,aMatrix.getValueType());
}
|
|
|
|
|
|
CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix){
return auroraNot(aMatrix.deepCopy());
}
|
|
|
|
CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x<=0?1.0:0;
|
|
};
|
|
thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),
|
|
data,lambda);
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
|
|
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
|
|
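// compareSet overloads: element-wise conditional assignment on the device.
// CompareOp follows the lambdas below: GT (>), NG (<=), EQ (==), NE (!=), NL (>=), LT (<).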
void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op)
|
|
{
|
|
switch (op)
|
|
{
|
|
case GT:
|
|
{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x>compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
case NG:{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x<=compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
case EQ:{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x==compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
case NE:{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x!=compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
case NL:{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x>=compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
case LT:{
|
|
auto lambda = [=] __host__ __device__ (const float& x){
|
|
return x<compareValue?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
}
|
|
|
|
void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,float compareValue, float newValue,CompareOp op){
|
|
switch (op)
|
|
{
|
|
case GT:
|
|
{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NG:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<=compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case EQ:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x==compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NE:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x!=compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NL:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>=compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case LT:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<compareValue?newValue:y;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aValueMatrix.getDataSize(),
|
|
aValueMatrix.getData(), aValueMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op){
|
|
switch (op)
|
|
{
|
|
case GT:
|
|
{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NG:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<=y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case EQ:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x==y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NE:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x!=y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NL:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>=y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case LT:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<y?newValue:x;
|
|
};
|
|
thrust::transform(thrust::device,aDesAndCompareMatrix.getData(),
|
|
aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(),
|
|
aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op)
|
|
{
|
|
switch (op)
|
|
{
|
|
case GT:
|
|
{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NG:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<=compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case EQ:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x==compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NE:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x!=compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case NL:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x>=compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
case LT:{
|
|
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
|
|
return x<compareValue?y:x;
|
|
};
|
|
thrust::transform(thrust::device,aCompareMatrix.getData(),
|
|
aCompareMatrix.getData()+aCompareMatrix.getDataSize(),
|
|
aNewValueMatrix.getData(), aCompareMatrix.getData(),
|
|
lambda);
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
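// repMatKernel tiles a 2-D input across a larger output by taking row/column indices modulo the input size.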
__global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns,
|
|
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
|
|
{
|
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
|
|
|
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(aIsComplex)
|
|
{
|
|
unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
|
|
unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows);
|
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
|
}
|
|
else
|
|
{
|
|
aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idY % aInputColumns * aInputRows + idX % aInputRows];
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes)
|
|
{
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
|
dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
|
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X,
|
|
(columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1);
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
|
|
{
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
|
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
|
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
|
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
|
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices,
|
|
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
|
|
{
|
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
|
|
|
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(aIsComplex)
|
|
{
|
|
unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
|
|
unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows);
|
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
|
}
|
|
else
|
|
{
|
|
aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
|
|
}
|
|
|
|
}
|
|
|
|
CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
|
|
{
|
|
if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
|
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
|
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
|
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
|
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
|
|
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void logKernel(float* aInputData, float* aOutput, unsigned int aInputSize, int aBaseNum)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
if(aBaseNum == -1)
|
|
{
|
|
aOutput[idx] = logf(aInputData[idx]);
|
|
}
|
|
else
|
|
{
|
|
float value = logf(aBaseNum);
|
|
aOutput[idx] = logf(aInputData[idx]) / value;
|
|
}
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum)
|
|
{
|
|
if(aMatrix.getValueType() == Aurora::Complex)
|
|
{
|
|
std::cerr<<"log not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
logKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aBaseNum);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void expKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
if(aIsComplex)
|
|
{
|
|
unsigned int index = 2 * idx;
|
|
float expReal = expf(aInputData[index]);
|
|
aOutput[index] = expReal * cosf(aInputData[index + 1]);
|
|
aOutput[index + 1] = expReal * sinf(aInputData[index + 1]);
|
|
}
|
|
else
|
|
{
|
|
aOutput[idx] = expf(aInputData[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::exp(const CudaMatrix& aMatrix)
|
|
{
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
expKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
__global__ void modKernel(float* aInputData, float* aOutput, unsigned int aInputSize, float aModValue)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
aOutput[idx] = fmodf(aInputData[idx], aModValue);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::mod(const CudaMatrix& aMatrix, float aValue)
|
|
{
|
|
if(aMatrix.isComplex())
|
|
{
|
|
std::cerr<<"mod not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
modKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aValue);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
}
|
|
|
|
__global__ void acosKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
aOutput[idx] = acosf(aInputData[idx]);
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::acos(const CudaMatrix& aMatrix)
|
|
{
|
|
if(aMatrix.isComplex())
|
|
{
|
|
std::cerr<<"acos not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
acosKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
}
|
|
|
|
__global__ void acosdKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
aOutput[idx] = acosf(aInputData[idx]) * 180 / PI;
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::acosd(const CudaMatrix& aMatrix)
|
|
{
|
|
if(aMatrix.isComplex())
|
|
{
|
|
std::cerr<<"acos not support complex"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
acosdKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
|
}
|
|
|
|
__global__ void conjKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aInputSize)
|
|
{
|
|
unsigned int index = idx * 2;
|
|
aOutput[index] = aInputData[index];
|
|
aOutput[index + 1] = -aInputData[index + 1];
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::conj(const CudaMatrix& aMatrix)
|
|
{
|
|
if(!aMatrix.isComplex())
|
|
{
|
|
return CudaMatrix::copyFromRawData(aMatrix.getData(),aMatrix.getDimSize(0),aMatrix.getDimSize(1),aMatrix.getDimSize(2));
|
|
}
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
conjKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
|
|
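// norm: vectors use the 1-norm (sum of magnitudes) or 2-norm (sqrt of the sum of squares);
// 2-D matrices use the Frobenius norm, the 1-norm (largest column sum), or the 2-norm
// (largest singular value, computed below with cuSOLVER's gesvd).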
float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
|
|
{
|
|
float resultValue = 0;
|
|
if(aMatrix.getDims() > 2)
|
|
{
|
|
std::cerr<<"norm not support 3d matrix"<<std::endl;
|
|
return 0;
|
|
}
|
|
//for 1 dims
|
|
if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1 )
|
|
{
|
|
if(aNormMethod == Aurora::Norm1)
|
|
{
|
|
CudaMatrix result = abs(aMatrix);
|
|
resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize(), 0.0, thrust::plus<float>());
|
|
return resultValue;
|
|
}
|
|
else
|
|
{
|
|
CudaMatrix result = aMatrix.deepCopy();
|
|
thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square<float>());
|
|
resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus<float>());
|
|
return std::sqrt(resultValue);
|
|
}
|
|
}
|
|
//for 2 dims
|
|
if(aNormMethod == Aurora::NormF)
|
|
{
|
|
CudaMatrix result = aMatrix.deepCopy();
|
|
thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square<float>());
|
|
resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus<float>());
|
|
return std::sqrt(resultValue);
|
|
}
|
|
else if(aNormMethod == Aurora::Norm1)
|
|
{
|
|
for(int i=0; i<aMatrix.getDimSize(1); ++i)
|
|
{
|
|
CudaMatrix result = abs(aMatrix.block(1, i, i));
|
|
float tempValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize(), 0.0, thrust::plus<float>());
|
|
if(resultValue < tempValue)
|
|
{
|
|
resultValue = tempValue;
|
|
}
|
|
}
|
|
return resultValue;
|
|
}
|
|
else
|
|
{
|
|
if(aMatrix.isComplex())
|
|
{
|
|
std::cerr<<"norm2 not support 2d complex matrix"<<std::endl;
|
|
return 0;
|
|
}
|
|
cusolverDnHandle_t cusolverH = NULL;
|
|
cudaStream_t stream = NULL;
|
|
float* d_S = NULL;
|
|
float* d_U = NULL;
|
|
float* d_VT = NULL;
|
|
int* devInfo = NULL;
|
|
float* d_work = NULL;
|
|
int lwork = 0;
|
|
const int m = aMatrix.getDimSize(0);
|
|
const int n = aMatrix.getDimSize(1);
|
|
const int lda = m;
|
|
const int ldu = m;
|
|
const int ldvt = n;
|
|
|
|
cusolverDnCreate(&cusolverH);
|
|
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
|
|
cusolverDnSetStream(cusolverH, stream);
|
|
cudaMalloc((void**)&d_S, sizeof(float)*n);
|
|
cudaMalloc((void**)&d_U, sizeof(float)*ldu*m);
|
|
cudaMalloc((void**)&d_VT, sizeof(float)*ldvt*n);
|
|
cudaMalloc((void**)&devInfo, sizeof(int));
|
|
|
|
cusolverDnSgesvd_bufferSize(cusolverH, m, n, &lwork);
|
|
|
|
cudaMalloc((void**)&d_work, sizeof(float)*lwork);
|
|
auto matrix = aMatrix.deepCopy();
|
|
cusolverDnSgesvd(cusolverH, 'A', 'A', m, n, matrix.getData(), lda, d_S,d_U, ldu, d_VT, ldvt, d_work, lwork, NULL, devInfo);
cudaStreamSynchronize(stream);

int devInfo_h = 0;
cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost);

if (devInfo_h != 0)
{
printf("Unsuccessful SVD execution\n");
printf("Error code: %d\n", devInfo_h);
}

std::vector<float> S(n, 0.0f);
cudaMemcpy(S.data(), d_S, sizeof(float)*n, cudaMemcpyDeviceToHost);

float resultValue = S[0];
|
|
for (int i = 1; i < n; i++)
|
|
{
|
|
if (S[i] > resultValue)
|
|
{
|
|
resultValue = S[i];
|
|
}
|
|
}
|
|
|
|
if (d_S ) cudaFree(d_S);
|
|
if (d_U ) cudaFree(d_U);
|
|
if (d_VT) cudaFree(d_VT);
|
|
if (devInfo) cudaFree(devInfo);
|
|
if (d_work) cudaFree(d_work);
|
|
if (cusolverH) cusolverDnDestroy(cusolverH);
|
|
if (stream) cudaStreamDestroy(stream);
|
|
return resultValue;
|
|
}
|
|
}
|
|
|
|
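// transposeKernel swaps row/column indices element by element; a tiled shared-memory transpose
// would coalesce the strided accesses, but this straightforward form matches the rest of the file.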
__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
|
|
if (idx < aRowSize && idy < aColumnSize)
|
|
{
|
|
unsigned int inputindex = idy * aRowSize + idx;
|
|
unsigned int outputIndex = idx * aColumnSize + idy;
|
|
if(aIsComplex)
|
|
{
|
|
aOutput[2*outputIndex] = aInputData[2*inputindex];
|
|
aOutput[2*outputIndex + 1] = aInputData[2*inputindex + 1];
|
|
}
|
|
else
|
|
{
|
|
aOutput[outputIndex] = aInputData[inputindex];
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
|
|
{
|
|
//not supported for 3-D matrices.
|
|
if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1)
|
|
{
|
|
std::cerr<<"transpose not support 3d complex matrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
|
|
dim3 blocksPerGrid((aMatrix.getDimSize(0) + THREADS_PER_BLOCK_DIM2_X - 1) / THREADS_PER_BLOCK_DIM2_X, (aMatrix.getDimSize(1) + THREADS_PER_BLOCK_DIM2_Y - 1) / THREADS_PER_BLOCK_DIM2_Y, 1);
|
|
dim3 threadPerBlock(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
|
|
transposeKernel<<<blocksPerGrid, threadPerBlock>>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType());
|
|
}
|
|
|
|
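// horzcat concatenates along columns with per-slice device-to-device copies; vertcat (below)
// concatenates along rows with a dedicated kernel because rows are interleaved in memory.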
CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
|
{
|
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
|
|
aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType())
|
|
{
|
|
std::cerr<<"horzcat must have same rows and slices"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
int column1 = aMatrix1.getDimSize(1);
|
|
int column2 = aMatrix2.getDimSize(1);
|
|
int slice = aMatrix1.getDimSize(2);
|
|
int row = aMatrix1.getDimSize(0);
|
|
size_t size1= row*column1*aMatrix1.getValueType();
|
|
size_t size2= row*column2*aMatrix2.getValueType();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * (aMatrix1.getDataSize() + aMatrix2.getDataSize()) * aMatrix1.getValueType());
|
|
size_t sliceStride = row*(column1+column2) * aMatrix1.getValueType();
|
|
for (size_t i = 0; i < slice; i++)
|
|
{
|
|
cudaMemcpy(data + i*sliceStride, aMatrix1.getData() + i*size1, sizeof(float) * size1, cudaMemcpyDeviceToDevice);
|
|
cudaMemcpy(data + i*sliceStride + size1, aMatrix2.getData() + i*size2, sizeof(float) * size2, cudaMemcpyDeviceToDevice);
|
|
}
|
|
return CudaMatrix::fromRawData(data, row, column1+column2, slice, aMatrix1.getValueType());
|
|
}
|
|
|
|
__global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSize, float* aInputData2, unsigned int aInputData2RowSize,
|
|
float* aOutput, unsigned int aOutputRowSize, unsigned int aOutputColumnSize, unsigned int aOutputSliceSize, bool aIsComplex)
|
|
{
|
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
|
|
|
if(idX >= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(aIsComplex)
|
|
{
|
|
if(idX < aInputData1RowSize)
|
|
{
|
|
unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX;
|
|
unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
|
|
aOutput[2*outputIndex] = aInputData1[2*inputIndex];
|
|
aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1];
|
|
}
|
|
else
|
|
{
|
|
unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize;
|
|
unsigned int outputIndex =idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
|
|
aOutput[2*outputIndex] = aInputData2[2*inputIndex];
|
|
aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1];
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if(idX < aInputData1RowSize)
|
|
{
|
|
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX];
|
|
}
|
|
else
|
|
{
|
|
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
|
{
|
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
|
|
aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
size_t outputRows = aMatrix1.getDimSize(0) + aMatrix2.getDimSize(0);
|
|
size_t outputColumns = aMatrix1.getDimSize(1);
|
|
size_t outputSlices = aMatrix1.getDimSize(2);
|
|
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType());
|
|
|
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
|
dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
|
(outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
|
(outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
|
vertcatKernel<<<gridSize, blockSize>>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0),
|
|
data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex());
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
|
|
}
|
|
|
|
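// vecnorm kernels: one block per column; each block accumulates its column into shared memory
// and then does a tree reduction, which assumes THREADS_PER_BLOCK is a power of two.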
__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
|
|
{
|
|
__shared__ float sharedValue[THREADS_PER_BLOCK];
|
|
sharedValue[threadIdx.x] = 0;
|
|
if(aIsComplex)
|
|
{
|
|
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
|
|
{
|
|
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
|
|
if(indexByRows < aInputRowSize)
|
|
{
|
|
unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
|
|
sharedValue[threadIdx.x] += sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
|
|
{
|
|
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
|
|
if(indexByRows < aInputRowSize)
|
|
{
|
|
sharedValue[threadIdx.x] += abs(aInputData[blockIdx.x*aInputRowSize + indexByRows]);
|
|
}
|
|
}
|
|
}
|
|
__syncthreads();
|
|
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
|
|
{
|
|
if(threadIdx.x < i)
|
|
{
|
|
sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
aOutput[blockIdx.x] = sharedValue[0];
|
|
}
|
|
|
|
__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
|
|
{
|
|
__shared__ float sharedValue[THREADS_PER_BLOCK];
|
|
sharedValue[threadIdx.x] = 0;
|
|
|
|
for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i)
|
|
{
|
|
unsigned int indexByRows = i*blockDim.x + threadIdx.x;
|
|
if(indexByRows < aInputRowSize)
|
|
{
|
|
if(aIsComplex)
|
|
{
|
|
unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
|
|
sharedValue[threadIdx.x] += aInputData[2 * idx] * aInputData[2 * idx];
|
|
sharedValue[threadIdx.x] += aInputData[2 * idx + 1] * aInputData[2 * idx + 1];
|
|
}
|
|
else
|
|
{
|
|
sharedValue[threadIdx.x] += aInputData[blockIdx.x*aInputRowSize + indexByRows] * aInputData[blockIdx.x*aInputRowSize + indexByRows];
|
|
}
|
|
}
|
|
}
|
|
__syncthreads();
|
|
for(unsigned int i = blockDim.x/2; i>0; i >>= 1)
|
|
{
|
|
if(threadIdx.x < i)
|
|
{
|
|
sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
aOutput[blockIdx.x] = sqrt(sharedValue[0]);
|
|
}
|
|
|
|
CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim)
|
|
{
|
|
//only surpport aDim = 1 for now.
|
|
if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull())
|
|
{
|
|
return CudaMatrix();
|
|
}
|
|
unsigned int column = aMatrix.getDimSize(1);
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * column);
|
|
if(aNormMethod == Aurora::Norm1)
|
|
{
|
|
vecnorm1Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
|
|
}
|
|
else if(aNormMethod == Aurora::Norm2)
|
|
{
|
|
vecnorm2Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
|
|
}
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data,column);
|
|
}
|
|
|
|
__global__ void linspaceKernel(float* aOutput, unsigned int aOutputSize, float aStartNum, float aStepNum)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aOutputSize)
|
|
{
|
|
aOutput[idx] = aStartNum + idx * aStepNum;
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::linspaceCuda(float aStart, float aEnd, int aNum)
|
|
{
|
|
float step = aNum > 1 ? (aEnd - aStart) / (aNum - 1) : 0.0f;
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * aNum);
|
|
int blocksPerGrid = (aNum + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
linspaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(data, aNum, aStart, step);
|
|
cudaDeviceSynchronize();
|
|
return Aurora::CudaMatrix::fromRawData(data,aNum);
|
|
}
|
|
|
|
CudaMatrix Aurora::auroraUnion(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
|
{
|
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex())
|
|
{
|
|
std::cerr<<"auroraUnion not support complex cudamatrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size1= aMatrix1.getDataSize();
|
|
size_t size2= aMatrix2.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * (size1 + size2));
|
|
cudaMemcpy(data, aMatrix1.getData(), sizeof(float) * size1, cudaMemcpyDeviceToDevice);
|
|
cudaMemcpy(data + size1, aMatrix2.getData(), sizeof(float) * size2, cudaMemcpyDeviceToDevice);
|
|
thrust::sort(thrust::device, data, data+size1+size2);
|
|
float* endPointer = thrust::unique(thrust::device, data, data+size1+size2);
|
|
|
|
return CudaMatrix::fromRawData(data, endPointer - data);
|
|
}
|
|
|
|
CudaMatrix Aurora::intersect(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
|
{
|
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex())
|
|
{
|
|
std::cerr<<"intersect not support complex cudamatrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size1= aMatrix1.getDataSize();
|
|
size_t size2= aMatrix2.getDataSize();
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * (size1 + size2));
|
|
cudaMemcpy(data, aMatrix1.getData(), sizeof(float) * size1, cudaMemcpyDeviceToDevice);
|
|
cudaMemcpy(data + size1, aMatrix2.getData(), sizeof(float) * size2, cudaMemcpyDeviceToDevice);
|
|
thrust::sort(thrust::device, data, data+size1);
|
|
thrust::sort(thrust::device, data+size1, data+size1+size2);
|
|
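// both halves of the scratch buffer are sorted independently, then intersected into a separate
// output buffer so that thrust::set_intersection never writes into one of its own input ranges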
float* resultData = nullptr;
cudaMalloc((void**)&resultData, sizeof(float) * (size1 < size2 ? size1 : size2));
float* end = thrust::set_intersection(thrust::device, data, data+size1, data+size1, data+size1+size2, resultData);
size_t resultSize = end - resultData;
cudaFree(data);

return CudaMatrix::fromRawData(resultData, resultSize);
|
|
}
|
|
|
|
__global__ void intersectKernel(float* aMatrixData, float* aIntersectData, unsigned int aMatrixDataSize, float* aOutputData, unsigned int aOutputDataSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aOutputDataSize)
|
|
{
|
|
for(unsigned int i=0; i<aMatrixDataSize; ++i)
|
|
{
|
|
if(aMatrixData[i] == aIntersectData[idx])
|
|
{
|
|
aOutputData[idx] = i+1;
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::intersect(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2, CudaMatrix& aIa)
|
|
{
|
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex())
|
|
{
|
|
std::cerr<<"intersect not support complex cudamatrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
CudaMatrix result = intersect(aMatrix1,aMatrix2);
|
|
|
|
size_t size = result.getDataSize();
|
|
float* iaResult = nullptr;
|
|
cudaMalloc((void**)&iaResult, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
intersectKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix1.getData(), result.getData(), aMatrix1.getDataSize(), iaResult, size);
|
|
cudaDeviceSynchronize();
|
|
|
|
aIa = CudaMatrix::fromRawData(iaResult,size);
|
|
return result;
|
|
}
|
|
|
|
CudaMatrix Aurora::reshape(const CudaMatrix& aMatrix, int aRows, int aColumns, int aSlices)
|
|
{
|
|
if(aMatrix.isNull() || (aMatrix.getDataSize() != aRows * aColumns * aSlices))
|
|
{
|
|
std::cerr<<"reshape diffirent size with cudamatrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
return CudaMatrix::copyFromRawData(aMatrix.getData(),aRows,aColumns,aSlices);
|
|
}
|
|
|
|
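// xcorrKernel computes the full cross-correlation of two length-N vectors (output length 2*N-1):
// the first branch fills the lags up to zero, the second branch the remaining positive lags.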
__global__ void xcorrKernel(float* aInputData1, float* aInputData2,unsigned int aInputSize, float* aOutput, unsigned int aOutputSize)
|
|
{
|
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (idx < aOutputSize/2 + 1)
|
|
{
|
|
for(unsigned int i=0; i<=idx; ++i)
|
|
{
|
|
aOutput[idx] += aInputData1[i] * aInputData2[aInputSize - idx - 1 + i];
|
|
}
|
|
return;
|
|
}
|
|
|
|
if (idx < aOutputSize)
|
|
{
|
|
for(int i=0; i<idx-aOutputSize/2; ++i)
|
|
{
|
|
aOutput[aOutputSize - idx + aOutputSize/2] += aInputData1[aInputSize + i - idx + aOutputSize/2] * aInputData2[i];
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
CudaMatrix Aurora::xcorr(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
|
{
|
|
if (aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDataSize() != aMatrix2.getDataSize() || aMatrix1.isComplex() || aMatrix2.isComplex())
|
|
{
|
|
std::cerr<<"xcorr not surpport with diffirent input size or complex cudamatrix"<<std::endl;
|
|
return CudaMatrix();
|
|
}
|
|
|
|
size_t size = aMatrix1.getDataSize() * 2 - 1;
|
|
float* data = nullptr;
|
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
|
cudaMemset(data, 0, sizeof(float) * size);
|
|
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
|
|
xcorrKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix1.getData(), aMatrix2.getData(), aMatrix1.getDataSize(), data, size);
|
|
cudaDeviceSynchronize();
|
|
|
|
return CudaMatrix::fromRawData(data, size);
|
|
}
|
|
|
|
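// deleteColumn relies on the column-major layout: each column is a contiguous block of `rows` floats,
// so removing one column takes at most two device-to-device copies.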
CudaMatrix Aurora::deleteColumn(const CudaMatrix& aMatrix, int aColumnIndex)
|
|
{
|
|
int rows = aMatrix.getDimSize(0);
|
|
int columns = aMatrix.getDimSize(1);
|
|
if (aColumnIndex < 0 || aColumnIndex >= columns)
|
|
{
|
|
return aMatrix;
|
|
}
|
|
|
|
float* resultData = nullptr;
|
|
cudaMalloc((void**)&resultData, sizeof(float) * rows* (columns-1));
|
|
if(aColumnIndex == 0)
|
|
{
|
|
cudaMemcpy(resultData, aMatrix.getData() + rows, sizeof(float) * rows* (columns-1), cudaMemcpyDeviceToDevice);
|
|
}
|
|
else if(aColumnIndex == (columns - 1))
|
|
{
|
|
cudaMemcpy(resultData, aMatrix.getData(), sizeof(float) * rows* (columns-1), cudaMemcpyDeviceToDevice);
|
|
}
|
|
else
|
|
{
|
|
cudaMemcpy(resultData, aMatrix.getData(), sizeof(float) * rows * aColumnIndex, cudaMemcpyDeviceToDevice);
|
|
cudaMemcpy(resultData + rows * aColumnIndex, aMatrix.getData() + rows * (aColumnIndex + 1), sizeof(float) * rows * (columns - aColumnIndex - 1), cudaMemcpyDeviceToDevice);
|
|
}
|
|
|
|
return CudaMatrix::fromRawData(resultData, rows, columns-1);
|
|
}
|
|
|
|
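// createCudaVectorMatrix builds a start:step:end row vector on the host (rounding through a fixed
// scale factor to sidestep float accumulation error) and then uploads it to the device.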
CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, float aEndValue)
|
|
{
|
|
std::vector<float> matrixData;
|
|
float tempValue = aStartValue;
|
|
matrixData.push_back(tempValue);
|
|
long long compare1 = std::round(aEndValue * 10e13);
|
|
|
|
while(std::round(tempValue* 10e13) <= compare1)
|
|
{
|
|
tempValue += aStepValue;
|
|
matrixData.push_back(tempValue);
|
|
|
|
}
|
|
matrixData.pop_back();
|
|
|
|
return Matrix::copyFromRawData(matrixData.data(), 1, matrixData.size()).toDeviceMatrix();
|
|
}
|