feat: Add some new cuda support to Aurora

@@ -8,6 +8,12 @@
 #include <cstddef>
 #include <cstdlib>
 #include <sys/types.h>
+#if THRUST_VERSION >= 200000
+#include <thrust/unique.h>
+#include <thrust/sort.h>
+#include <thrust/set_operations.h>
+#endif
+
 #include <thrust/device_vector.h>
 #include <thrust/transform.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -49,6 +55,23 @@ __global__ void complexKernel(float* aInputRealData, float* aInputImagData, floa
     }
 }
 
+#if THRUST_VERSION >= 200000
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+void callThrustTransform(InputIterator aInput, OutputIterator aOutput, UnaryFunction aFunc){
+    thrust::transform(aInput, aOutput, aFunc);
+}
+#else
+template<typename InputIterator,
+         typename OutputIterator,
+         typename UnaryFunction>
+void callThrustTransform(InputIterator aInput, OutputIterator aOutput, UnaryFunction aFunc){
+    thrust::transform(thrust::device, aInput, aOutput, aFunc);
+}
+#endif
+
+
 CudaMatrix Aurora::complex(const CudaMatrix &aMatrix)
 {
     if (aMatrix.isComplex())
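The added wrapper picks between an implicit and an explicit execution policy by Thrust major version. Note that both branches forward only three arguments, while thrust::transform takes a [first, last) range plus an output iterator and a functor; a minimal self-contained variant with an explicit range (names are illustrative, not the commit's code) would look like:

    #include <thrust/execution_policy.h>
    #include <thrust/transform.h>

    // Hypothetical sketch: pass the range explicitly and always supply an
    // execution policy, which compiles the same way on Thrust 1.x and 2.x.
    template <typename InputIterator, typename OutputIterator, typename UnaryFunction>
    void transformOnDevice(InputIterator aFirst, InputIterator aLast,
                           OutputIterator aOutput, UnaryFunction aFunc)
    {
        thrust::transform(thrust::device, aFirst, aLast, aOutput, aFunc);
    }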
@@ -353,7 +376,8 @@ CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 
-void Aurora::nantoval(CudaMatrix& aMatrix,float val){
+void Aurora::nantoval(CudaMatrix &aMatrix, float val)
+{
     auto lambda = [=] __host__ __device__(const float &x) {
         return ::isnan(x) ? val : x;
     };
@@ -361,7 +385,8 @@ void Aurora::nantoval(CudaMatrix& aMatrix,float val){
                       aMatrix.getData() + aMatrix.getDataSize(), aMatrix.getData(), lambda);
 }
 
-CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){
+CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix)
+{
     size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
     float *data = nullptr;
 
@@ -375,7 +400,8 @@ CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){
                                            aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 
-CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){
+CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix)
+{
     size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
     float *data = nullptr;
     cudaMalloc((void **)&data, sizeof(float) * size);
@@ -388,13 +414,15 @@ CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){
                                            aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 
-void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){
+void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue)
+{
     if (aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex())
     {
         std::cerr << "padding only support real vector" << std::endl;
         return;
     }
-    if (aMatrix.getDataSize()>aIndex){
+    if (aMatrix.getDataSize() > aIndex)
+    {
         aMatrix.setValue(aIndex, aValue);
         return;
     }
@@ -407,12 +435,13 @@ void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){
     aMatrix = CudaMatrix::fromRawData(data, size, 1, 1, aMatrix.getValueType());
 }
 
-
-CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix){
+CudaMatrix Aurora::auroraNot(const CudaMatrix &aMatrix)
+{
     return auroraNot(std::forward<CudaMatrix &&>(aMatrix.deepCopy()));
 }
 
-CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){
+CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix)
+{
     size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
     float *data = nullptr;
     cudaMalloc((void **)&data, sizeof(float) * size);
@@ -425,7 +454,6 @@ CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){
                                            aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 
-
 void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newValue, CompareOp op)
 {
     switch (op)
@@ -438,35 +466,40 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newVa
         thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
         break;
     }
-    case NG:{
+    case NG:
+    {
         auto lambda = [=] __host__ __device__(const float &x) {
             return x <= compareValue ? newValue : x;
         };
         thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
         break;
     }
-    case EQ:{
+    case EQ:
+    {
         auto lambda = [=] __host__ __device__(const float &x) {
             return x == compareValue ? newValue : x;
         };
         thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
         break;
     }
-    case NE:{
+    case NE:
+    {
         auto lambda = [=] __host__ __device__(const float &x) {
             return x != compareValue ? newValue : x;
         };
         thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
         break;
     }
-    case NL:{
+    case NL:
+    {
         auto lambda = [=] __host__ __device__(const float &x) {
             return x >= compareValue ? newValue : x;
         };
         thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
         break;
     }
-    case LT:{
+    case LT:
+    {
         auto lambda = [=] __host__ __device__(const float &x) {
             return x < compareValue ? newValue : x;
         };
@@ -476,10 +509,10 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newVa
     default:
         break;
     }
-
 }
 
-void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,float compareValue, float newValue,CompareOp op){
+void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, float compareValue, float newValue, CompareOp op)
+{
     switch (op)
     {
     case GT:
@@ -493,7 +526,8 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,floa
                           lambda);
         break;
     }
-    case NG:{
+    case NG:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x <= compareValue ? newValue : y;
         };
@@ -503,7 +537,8 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,floa
                           lambda);
         break;
     }
-    case EQ:{
+    case EQ:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x == compareValue ? newValue : y;
         };
@@ -513,7 +548,8 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,floa
                           lambda);
         break;
     }
-    case NE:{
+    case NE:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x != compareValue ? newValue : y;
         };
@@ -523,17 +559,19 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,floa
                           lambda);
         break;
     }
-    case NL:{
+    case NL:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x >= compareValue ? newValue : y;
         };
-        thrust::transform(thrust::device,aCompareMatrix.getData(),
+        thrust::transform(aCompareMatrix.getData(),
                           aCompareMatrix.getData() + aValueMatrix.getDataSize(),
                           aValueMatrix.getData(), aValueMatrix.getData(),
                           lambda);
         break;
     }
-    case LT:{
+    case LT:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x < compareValue ? newValue : y;
         };
@@ -548,7 +586,8 @@ void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,floa
     }
 }
 
-void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op){
+void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherCompareMatrix, float newValue, CompareOp op)
+{
     switch (op)
     {
     case GT:
@@ -562,7 +601,8 @@ void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompa
                           lambda);
         break;
     }
-    case NG:{
+    case NG:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x <= y ? newValue : x;
         };
@@ -572,7 +612,8 @@ void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompa
                           lambda);
         break;
     }
-    case EQ:{
+    case EQ:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x == y ? newValue : x;
         };
@@ -582,7 +623,8 @@ void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompa
                           lambda);
         break;
     }
-    case NE:{
+    case NE:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x != y ? newValue : x;
         };
@@ -592,7 +634,8 @@ void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompa
                           lambda);
         break;
     }
-    case NL:{
+    case NL:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x >= y ? newValue : x;
         };
@@ -602,7 +645,8 @@ void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompa
                           lambda);
         break;
     }
-    case LT:{
+    case LT:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x < y ? newValue : x;
         };
@@ -632,7 +676,8 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
                           lambda);
         break;
     }
-    case NG:{
+    case NG:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x <= compareValue ? y : x;
         };
@@ -642,7 +687,8 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
                           lambda);
         break;
     }
-    case EQ:{
+    case EQ:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x == compareValue ? y : x;
         };
@@ -652,7 +698,8 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
                           lambda);
         break;
     }
-    case NE:{
+    case NE:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x != compareValue ? y : x;
         };
@@ -662,7 +709,8 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
                           lambda);
         break;
     }
-    case NL:{
+    case NL:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x >= compareValue ? y : x;
         };
@@ -672,7 +720,8 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
                           lambda);
         break;
     }
-    case LT:{
+    case LT:
+    {
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return x < compareValue ? y : x;
         };
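The compareSet overloads repeat one switch case per comparison, differing only in the operator. A sketch of how the repetition could be factored out (hypothetical helper; the CompareOp enumerators mirror the ones used above, and this is not part of the commit):

    #include <cstddef>
    #include <thrust/execution_policy.h>
    #include <thrust/transform.h>

    // Mirrors Aurora's CompareOp enumerators as used above (assumption).
    enum CompareOp { GT, NG, EQ, NE, NL, LT };

    // Hypothetical device-side dispatch over the comparison.
    __host__ __device__ inline bool applyOp(CompareOp op, float x, float v)
    {
        switch (op)
        {
        case GT: return x >  v;
        case NG: return x <= v;
        case EQ: return x == v;
        case NE: return x != v;
        case NL: return x >= v;
        case LT: return x <  v;
        }
        return false;
    }

    // One transform replaces the six per-case bodies of the first overload.
    void compareSetSketch(float *aData, size_t aSize, float compareValue, float newValue, CompareOp op)
    {
        auto lambda = [=] __host__ __device__(const float &x) {
            return applyOp(op, x, compareValue) ? newValue : x;
        };
        thrust::transform(thrust::device, aData, aData + aSize, aData, lambda);
    }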
@@ -778,7 +827,6 @@ __global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsig
     {
         aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
     }
-
 }
 
 CudaMatrix Aurora::repmat3d(const CudaMatrix &aMatrix, int aRowTimes, int aColumnTimes, int aSliceTimes)
@@ -971,7 +1019,6 @@ CudaMatrix Aurora::conj(const CudaMatrix& aMatrix)
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
 
-
 float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
 {
     float resultValue = 0;
@@ -1074,13 +1121,20 @@ float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
         }
     }
 
-    if (d_S ) cudaFree(d_S);
-    if (d_U ) cudaFree(d_U);
-    if (d_VT) cudaFree(d_VT);
-    if (devInfo) cudaFree(devInfo);
-    if (d_work) cudaFree(d_work);
-    if (cusolverH) cusolverDnDestroy(cusolverH);
-    if (stream) cudaStreamDestroy(stream);
+    if (d_S)
+        cudaFree(d_S);
+    if (d_U)
+        cudaFree(d_U);
+    if (d_VT)
+        cudaFree(d_VT);
+    if (devInfo)
+        cudaFree(devInfo);
+    if (d_work)
+        cudaFree(d_work);
+    if (cusolverH)
+        cusolverDnDestroy(cusolverH);
+    if (stream)
+        cudaStreamDestroy(stream);
     return resultValue;
 }
 }
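The reformatted cleanup ladder above frees each CUDA/cuSOLVER resource behind a null check. A hedged sketch of an RAII owner that would make the buffer frees automatic (illustrative only, not part of the commit):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Illustrative RAII owner for a cudaMalloc'd float buffer; with such a
    // type the conditional cudaFree ladder collapses into scope exits.
    struct DeviceBuffer
    {
        float *ptr = nullptr;
        explicit DeviceBuffer(size_t count)
        {
            cudaMalloc(reinterpret_cast<void **>(&ptr), count * sizeof(float));
        }
        ~DeviceBuffer()
        {
            if (ptr)
                cudaFree(ptr);
        }
        DeviceBuffer(const DeviceBuffer &) = delete;
        DeviceBuffer &operator=(const DeviceBuffer &) = delete;
    };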
@@ -1105,7 +1159,6 @@ __global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int
     }
 }
 
-
 CudaMatrix Aurora::transpose(const CudaMatrix &aMatrix)
 {
     // not surpport for 3 dims.
@@ -1191,7 +1244,6 @@ __global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSiz
         aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
     }
 }
-
 }
 
 CudaMatrix Aurora::vertcat(const CudaMatrix &aMatrix1, const CudaMatrix &aMatrix2)
@@ -1511,12 +1563,10 @@ CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, f
 struct compareMatrixByRows
 {
     compareMatrixByRows(unsigned int aSize)
-        : mSize(aSize)
-    {
+        : mSize(aSize) {
     };
     unsigned int mSize;
-    __host__ __device__
-    bool operator()(const float* aVector1, const float* aVector2) const
+    __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const
     {
         for (unsigned int i = 0; i < mSize; ++i)
         {
@@ -1536,12 +1586,10 @@ struct compareMatrixByRows
 struct uniqueMatrixByRows
 {
     uniqueMatrixByRows(unsigned int aSize)
-        : mSize(aSize)
-    {
+        : mSize(aSize) {
     };
     unsigned int mSize;
-    __host__ __device__
-    bool operator()(const float* aVector1, const float* aVector2) const
+    __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const
     {
         for (unsigned int i = 0; i < mSize; ++i)
         {
@@ -1557,14 +1605,11 @@ struct uniqueMatrixByRows
 struct equalMatrixByRows
 {
     equalMatrixByRows(unsigned int aSize, float *aValue)
-        : mSize(aSize)
-        , mValue(aValue)
-    {
+        : mSize(aSize), mValue(aValue) {
     };
     unsigned int mSize;
     float *mValue;
-    __host__ __device__
-    bool operator()(const float* aVector) const
+    __host__ __device__ bool operator()(const float *aVector) const
     {
         for (unsigned int i = 0; i < mSize; ++i)
         {
@@ -1577,8 +1622,6 @@ struct equalMatrixByRows
     }
 };
 
-
-
 CudaMatrix Aurora::uniqueByRows(const CudaMatrix &aMatrix, CudaMatrix &aIndexResult)
 {
     CudaMatrix transposeMatrix = transpose(aMatrix);
@@ -1617,12 +1660,14 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexRes
     return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
 }
 
-__global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
+__global__ void convertValueKernel(float *aSrc, float *aDes, unsigned int size)
+{
     __shared__ ushort CONVERT_AND_VALUE;
     __shared__ ushort CONVERT_AND_VALUE_2;
     __shared__ ushort CONVERT_MUL_VALUE;
     __shared__ unsigned int CONVERT_ADD_VALUE;
-    if (threadIdx.x == 0){
+    if (threadIdx.x == 0)
+    {
         CONVERT_AND_VALUE = 15u;
         CONVERT_AND_VALUE_2 = 2047u;
         CONVERT_MUL_VALUE = 2048u;
@@ -1630,7 +1675,8 @@ __global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
     }
     __syncthreads();
     unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx >= size) return;
+    if (idx >= size)
+        return;
     short value = aSrc[idx];
     ushort exponent = (ushort)value;
     exponent = (exponent >> 11) & CONVERT_AND_VALUE;

@@ -11,6 +11,10 @@
 #include <iostream>
 
 #include <cmath>
+#if THRUST_VERSION >= 200000
+#include <thrust/sort.h>
+#include <thrust/set_operations.h>
+#endif
 #include <thrust/device_vector.h>
 #include <thrust/transform.h>
 #include <thrust/reduce.h>
@@ -36,7 +40,6 @@ namespace
 const int THREADS_PER_BLOCK = 256;
 }
 
-
 __global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize)
 {
     // determine the index of each thread
@@ -46,23 +49,28 @@ __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aCo
     shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : -FLT_MAX;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset<aColSize; offset+=blockDim.x) {
-        if(threadIdx.x + offset<aColSize){
+    for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aColSize)
+        {
             shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset]);
         }
         __syncthreads();
    }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
 
-        if (threadIdx.x < i) {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's maximum to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0];
     }
 }
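These reduction kernels are launched with one block per column and a fixed 256-thread block, matching the launches visible later in this diff (e.g. sumColKernel<<<aMatrix.getDimSize(1), 256>>>). A minimal usage sketch under that assumption (names are illustrative):

    #include <cuda_runtime.h>

    // One block per column; the kernel's __shared__ buffer holds one partial
    // value per thread, so 256 threads matches the static buffer size.
    void maxPerColumn(float *dInput, float *dOutput, unsigned int rows, unsigned int cols)
    {
        maxColKernel<<<cols, 256>>>(dInput, dOutput, rows);
        cudaDeviceSynchronize();
    }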
@@ -76,46 +84,55 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol
     shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : -FLT_MAX;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) {
-        if(threadIdx.x+offset < aRowSize){
+    for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aRowSize)
+        {
             shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]);
         }
         __syncthreads();
     }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
 
-        if (threadIdx.x < i) {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's maximum to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0];
     }
 }
 
-CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) {
+CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction)
+{
     long a, b;
     return max(aMatrix, direction, a, b);
 }
 
 CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx)
 {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
                   << std::endl;
         return CudaMatrix();
     }
     // for a vector, row and column directions coincide
-    if (aMatrix.isVector()){
+    if (aMatrix.isVector())
+    {
         direction = All;
     }
     switch (direction)
     {
-    case All: {
+    case All:
+    {
         thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
         auto max_iter = thrust::max_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize());
         int index = max_iter - d_ptr;
@@ -153,17 +170,19 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, l
     }
 }
 
-
-CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) {
+CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat)
+{
     // col-vec x mat
-    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
+    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0))
+    {
         size_t size = aMat.getDataSize();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return fmaxf(x, y);
         };
-        for (int i = 0; i < aMat.getDimSize(1); ++i) {
+        for (int i = 0; i < aMat.getDimSize(1); ++i)
+        {
             thrust::transform(thrust::device, aVec.getData(),
                               aVec.getData() + aVec.getDataSize(),
                               aMat.getData() + aMat.getDimSize(0) * i,
@@ -178,7 +197,8 @@ CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) {
         size_t size = aMat.getDataSize();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
-        for (int i = 0; i < aMat.getDimSize(1); ++i) {
+        for (int i = 0; i < aMat.getDimSize(1); ++i)
+        {
             float v = aVec.getValue(i);
             auto lambda = [=] __host__ __device__(const float &x) {
                 return fmaxf(x, v);
@@ -195,24 +215,27 @@ CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) {
         << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
         << std::endl;
     return CudaMatrix();
-
 }
 
-CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther)
+{
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr
             << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
             << std::endl;
         return CudaMatrix();
     }
-    if (aOther.getDimSize(2)>1 || aOther.isComplex()) {
+    if (aOther.getDimSize(2) > 1 || aOther.isComplex())
+    {
         std::cerr
             << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
             << std::endl;
         return CudaMatrix();
     }
     // same shape
-    if (aMatrix.compareShape(aOther)){
+    if (aMatrix.compareShape(aOther))
+    {
         size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
@@ -226,12 +249,14 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
                                            aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
     }
     // one is scalar
-    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){
+    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1)
+    {
         float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0);
         auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix;
         return max(matrix, scalar);
     }
-    else if (aMatrix.isVector()) {
+    else if (aMatrix.isVector())
+    {
         return ::vxmMax(aMatrix, aOther);
     }
     else if (aOther.isVector())
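The same-shape branch above is a plain elementwise maximum; in isolation it amounts to a binary thrust::transform over the raw device pointers. A self-contained sketch (assumed helper, not the commit's code):

    #include <cstddef>
    #include <math.h>
    #include <thrust/execution_policy.h>
    #include <thrust/transform.h>

    // Hypothetical helper showing just the same-shape branch.
    void elementwiseMax(const float *aInA, const float *aInB, float *aOut, size_t aSize)
    {
        auto lambda = [=] __host__ __device__(const float &x, const float &y) {
            return fmaxf(x, y);
        };
        thrust::transform(thrust::device, aInA, aInA + aSize, aInB, aOut, lambda);
    }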
@@ -244,7 +269,8 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
     return CudaMatrix();
 }
 
-CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue){
+CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue)
+{
     size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
     float *data = nullptr;
     cudaMalloc((void **)&data, sizeof(float) * size);
@@ -266,23 +292,28 @@ __global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aCo
     shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : FLT_MAX;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset<aColSize; offset+=blockDim.x) {
-        if(threadIdx.x + offset<aColSize){
+    for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aColSize)
+        {
             shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset]);
         }
         __syncthreads();
     }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
 
-        if (threadIdx.x < i) {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0];
     }
 }
@@ -296,45 +327,54 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol
     shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : FLT_MAX;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) {
-        if(threadIdx.x+offset < aRowSize){
+    for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aRowSize)
+        {
             shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]);
         }
         __syncthreads();
     }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0];
     }
 }
 
-CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) {
+CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction)
+{
     long a, b;
     return min(aMatrix, direction, a, b);
 }
 
 CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx)
 {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
                   << std::endl;
         return CudaMatrix();
     }
     // for a vector, row and column directions coincide
-    if (aMatrix.isVector()){
+    if (aMatrix.isVector())
+    {
         direction = All;
     }
     switch (direction)
     {
-    case All: {
+    case All:
+    {
         thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
         auto max_iter = thrust::min_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize());
         int index = max_iter - d_ptr;
@@ -372,16 +412,19 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, l
     }
 }
 
-CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
+CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat)
+{
     // col-vec x mat
-    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
+    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0))
+    {
         size_t size = aMat.getDataSize();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
         auto lambda = [=] __host__ __device__(const float &x, const float &y) {
             return fminf(x, y);
         };
-        for (int i = 0; i < aMat.getDimSize(1); ++i) {
+        for (int i = 0; i < aMat.getDimSize(1); ++i)
+        {
             thrust::transform(thrust::device, aVec.getData(),
                               aVec.getData() + aVec.getDataSize(),
                               aMat.getData() + aMat.getDimSize(0) * i,
@@ -396,7 +439,8 @@ CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
         size_t size = aMat.getDataSize();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
-        for (int i = 0; i < aMat.getDimSize(1); ++i) {
+        for (int i = 0; i < aMat.getDimSize(1); ++i)
+        {
             float v = aVec.getValue(i);
             auto lambda = [=] __host__ __device__(const float &x) {
                 return fminf(x, v);
@@ -413,24 +457,27 @@ CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
         << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
         << std::endl;
     return CudaMatrix();
-
 }
 
-CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther)
+{
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr
             << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
             << std::endl;
         return CudaMatrix();
     }
-    if (aOther.getDimSize(2)>1 || aOther.isComplex()) {
+    if (aOther.getDimSize(2) > 1 || aOther.isComplex())
+    {
         std::cerr
             << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
            << std::endl;
         return CudaMatrix();
     }
     // same shape
-    if (aMatrix.compareShape(aOther)){
+    if (aMatrix.compareShape(aOther))
+    {
         size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
         float *data = nullptr;
         cudaMalloc((void **)&data, sizeof(float) * size);
@@ -444,12 +491,14 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
                                            aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
     }
     // one is scalar
-    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){
+    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1)
+    {
         float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0);
         auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix;
         return min(matrix, scalar);
     }
-    else if (aMatrix.isVector()) {
+    else if (aMatrix.isVector())
+    {
         return ::vxmMin(aMatrix, aOther);
     }
     else if (aOther.isVector())
@@ -462,8 +511,8 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
     return CudaMatrix();
 }
 
-
-CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){
+CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue)
+{
     size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
     float *data = nullptr;
     cudaMalloc((void **)&data, sizeof(float) * size);
@@ -485,23 +534,28 @@ __global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount
     shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? aInputData[idx] : 0.0;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset<aColEleCount; offset+=blockDim.x) {
-        if(threadIdx.x + offset<aColEleCount){
+    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aColEleCount)
+        {
             shared_data[threadIdx.x] += (double)aInputData[idx + offset];
         }
         __syncthreads();
     }
 
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] += (double)shared_data[i + threadIdx.x];
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = (float)shared_data[0];
     }
 }
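sumColKernel evidently accumulates in double precision (note the (double) casts and the final (float) narrowing), which keeps long column sums from drifting in float. A self-contained sketch of that pattern; the real shared-buffer declaration sits outside this hunk, so its type here is an assumption:

    #include <cuda_runtime.h>

    // Sketch only: double-precision column sum, one 256-thread block per column.
    __global__ void sumColKernelSketch(const float *aInput, float *aOutput, int aColEleCount)
    {
        __shared__ double shared_data[256];
        unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
        shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? (double)aInput[idx] : 0.0;
        __syncthreads();
        // fold the rest of the column into the first blockDim.x partial sums
        for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x)
        {
            if (threadIdx.x + offset < aColEleCount)
                shared_data[threadIdx.x] += (double)aInput[idx + offset];
            __syncthreads();
        }
        // tree-reduce the partial sums
        for (int i = blockDim.x / 2; i > 0; i >>= 1)
        {
            if (threadIdx.x < i)
                shared_data[threadIdx.x] += shared_data[i + threadIdx.x];
            __syncthreads();
        }
        if (threadIdx.x == 0)
            aOutput[blockIdx.x] = (float)shared_data[0];
    }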
@@ -515,22 +569,27 @@ __global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aCol
     shared_data[threadIdx.x] = (threadIdx.x < aRowEleCount) ? aInputData[idx] : 0.0;
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) {
-        if(threadIdx.x+offset < aRowEleCount){
+    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aRowEleCount)
+        {
             shared_data[threadIdx.x] += aInputData[idx + offset * aColEleCount];
         }
         __syncthreads();
     }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x] += shared_data[threadIdx.x + i];
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0];
     }
 }
@@ -547,8 +606,10 @@ __global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSi
 
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset<4096; offset+=blockDim.x) {
-        if(threadIdx.x + offset<4096 && idx + offset<aTotalSize){
+    for (int offset = blockDim.x; offset < 4096; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < 4096 && idx + offset < aTotalSize)
+        {
             shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
             shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
         }
@@ -556,8 +617,10 @@ __global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSi
     }
 
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
             shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
         }
@@ -565,7 +628,8 @@ __global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSi
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x] = shared_data[0][0];
         aOutput[blockIdx.x + gridDim.x] = shared_data[0][1];
     }
@@ -582,8 +646,10 @@ __global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCoun
 
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset<aColEleCount; offset+=blockDim.x) {
-        if(threadIdx.x + offset<aColEleCount){
+    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aColEleCount)
+        {
             shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
             shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
         }
@@ -591,8 +657,10 @@ __global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCoun
     }
 
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
             shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
         }
@@ -600,7 +668,8 @@ __global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCoun
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x * 2] = shared_data[0][0];
         aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
     }
@@ -617,39 +686,45 @@ __global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aC
 
     __syncthreads();
     // each thread reduces its own segment, folding values blockDim.x apart into the front of the array
-    for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) {
-        if(threadIdx.x+offset < aRowEleCount){
+    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x)
+    {
+        if (threadIdx.x + offset < aRowEleCount)
+        {
             shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * aColEleCount * 2];
             shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * aColEleCount * 2 + 1];
         }
         __syncthreads();
     }
     // reduce the front segment
-    for (int i = blockDim.x/2; i >0; i>>=1) {
-        if (threadIdx.x < i) {
+    for (int i = blockDim.x / 2; i > 0; i >>= 1)
+    {
+        if (threadIdx.x < i)
+        {
             shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
             shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
         }
         __syncthreads();
     }
 
     // the first thread stores each segment's result to global memory
-    if (threadIdx.x == 0) {
+    if (threadIdx.x == 0)
+    {
         aOutput[blockIdx.x * 2] = shared_data[0][0];
         aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
     }
 }
 
-CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
-    if (aMatrix.getDimSize(2)>1 ) {
+CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction)
+{
+    if (aMatrix.getDimSize(2) > 1)
+    {
         std::cerr << "sum() not support 3D data!"
                   << std::endl;
         return CudaMatrix();
     }
     // for a vector, row and column directions coincide
-    if (direction == Column && aMatrix.getDimSize(0)==1){
+    if (direction == Column && aMatrix.getDimSize(0) == 1)
+    {
         direction = Row;
     }
     if (!aMatrix.isComplex())
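The sumZ* kernels above sum interleaved complex data: the real part of element i sits at index 2*i and the imaginary part at 2*i + 1. A host-side reference for one column (sketch, names assumed):

    // Host-side reference for the interleaved re/im layout.
    void sumComplexColumnHost(const float *aColumn, int aColEleCount, float &re, float &im)
    {
        re = 0.0f;
        im = 0.0f;
        for (int i = 0; i < aColEleCount; ++i)
        {
            re += aColumn[2 * i];     // real part of element i
            im += aColumn[2 * i + 1]; // imaginary part of element i
        }
    }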
@@ -682,7 +757,8 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
             float *matData = aMatrix.getData();
             float *retData = nullptr;
             int colElementCount = aMatrix.getDimSize(0);
-            if (colElementCount == 1) return aMatrix;
+            if (colElementCount == 1)
+                return aMatrix;
             cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1));
             sumColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colElementCount);
             cudaDeviceSynchronize();
@@ -691,7 +767,8 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
         }
         }
     }
-    else{
+    else
+    {
         switch (direction)
         {
         case All:
@@ -730,7 +807,8 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
             float *matData = aMatrix.getData();
             float *retData = nullptr;
             int colElementCount = aMatrix.getDimSize(0);
-            if (colElementCount == 1) return aMatrix;
+            if (colElementCount == 1)
+                return aMatrix;
             cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1) * 2);
             sumZColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colElementCount);
             cudaDeviceSynchronize();
@@ -741,14 +819,17 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
     }
 }
 
-CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){
-    if (aMatrix.getDimSize(2)>1 ) {
+CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction)
+{
+    if (aMatrix.getDimSize(2) > 1)
+    {
         std::cerr << "sum() not support 3D data!"
                   << std::endl;
         return CudaMatrix();
     }
     // for a vector, row and column directions coincide
-    if (direction == Column && aMatrix.getDimSize(0)==1){
+    if (direction == Column && aMatrix.getDimSize(0) == 1)
+    {
         direction = Row;
     }
     if (!aMatrix.isComplex())
@@ -765,7 +846,8 @@ CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction )
         {
             auto ret = sum(aMatrix, Row);
             float count = (float)aMatrix.getDimSize(1);
-            auto lambda = [=] __device__ (const float& v){
+            auto lambda = [=] __device__(const float &v)
+            {
                 return v / count;
             };
             thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda);
@@ -776,7 +858,8 @@ CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction )
         {
             auto ret = sum(aMatrix, Column);
             float count = (float)aMatrix.getDimSize(0);
-            auto lambda = [=] __device__ (const float& v){
+            auto lambda = [=] __device__(const float &v)
+            {
                 return v / count;
             };
             thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda);
@@ -784,15 +867,18 @@ CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction )
         }
         }
     }
-    else{
+    else
+    {
         std::cerr << "mean() not support complex data!"
                   << std::endl;
         return CudaMatrix();
     }
 }
 
-CudaMatrix Aurora::std(const CudaMatrix &aMatrix){
-    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
+CudaMatrix Aurora::std(const CudaMatrix &aMatrix)
+{
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr
             << (aMatrix.getDimSize(2) > 1 ? "std() not support 3D data!" : "std() not support complex value type!")
             << std::endl;
@@ -811,44 +897,47 @@ class RowElementIterator:public thrust::iterator_facade<
                                    ValueType,
                                    thrust::device_system_tag,
                                    thrust::random_access_traversal_tag,
-                                   ValueType& >{
+                                   ValueType &>
+{
 public:
     // constructor
     __host__ __device__
     RowElementIterator(ValueType *ptr, int aColElementCount = 1) : ptr_(ptr), col_elements_(aColElementCount) {}
 
-    __host__ __device__
-    ValueType& dereference() const{
+    __host__ __device__ ValueType &dereference() const
+    {
         return *ptr_;
     }
 
     // implements the increment operator
-    __host__ __device__
-    void increment() {
+    __host__ __device__ void increment()
+    {
         ptr_ = ptr_ + col_elements_;
     }
 
     // implements the decrement operator
-    __host__ __device__
-    void decrement() {
+    __host__ __device__ void decrement()
+    {
         ptr_ = ptr_ - col_elements_;
    }
 
     // implements the addition operator
-    __host__ __device__
-    void advance(typename RowElementIterator::difference_type n) {
+    __host__ __device__ void advance(typename RowElementIterator::difference_type n)
+    {
         ptr_ += col_elements_ * n;
     }
 
     // implements the subtraction operator
-    __host__ __device__
-    typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const {
+    typename RowElementIterator::difference_type
+    distance_to(const RowElementIterator &other) const
+    {
         return (other.ptr_ - ptr_) / col_elements_;
    }
 
     // implements the comparison operator
-    __host__ __device__
-    bool equal(const RowElementIterator& other) const {
+    __host__ __device__ bool equal(const RowElementIterator &other) const
+    {
         return ptr_ == other.ptr_;
     }
 
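RowElementIterator adapts a raw device pointer so that advancing by one hops a whole column length, letting Thrust algorithms traverse a single row of a column-major matrix. A usage sketch, assuming the class is templated on the element type as the iterator_facade arguments suggest (not the commit's code):

    #include <thrust/execution_policy.h>
    #include <thrust/sort.h>

    // Hypothetical helper: sort the elements of row `rowIndex` in place.
    // colElements = elements per column, numColumns = number of columns.
    void sortRowInPlace(float *devData, int rowIndex, int colElements, int numColumns)
    {
        RowElementIterator<float> first(devData + rowIndex, colElements);
        RowElementIterator<float> last(devData + rowIndex + colElements * numColumns, colElements);
        thrust::sort(thrust::device, first, last);
    }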
@@ -859,7 +948,8 @@ class RowElementIterator:public thrust::iterator_facade<
 
 CudaMatrix Aurora::sort(const CudaMatrix &aMatrix, FunctionDirection direction)
 {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr
             << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
             << std::endl;
@@ -870,7 +960,8 @@ CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction)
 
 CudaMatrix Aurora::sort(CudaMatrix &&aMatrix, FunctionDirection direction)
 {
-    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
+    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex())
+    {
         std::cerr
             << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
             << std::endl;
@@ -907,9 +998,7 @@ CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
             << std::endl;
         return CudaMatrix();
     }
-
     }
-
 }
 
 __global__ void immseKernel(float *aInputData1, float *aInputData2, float *aOutputData, unsigned int aInputSize)
@@ -955,12 +1044,10 @@ float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB)
 struct compareMatrixByRows
 {
     compareMatrixByRows(unsigned int aSize)
-        : mSize(aSize)
-    {
+        : mSize(aSize) {
     };
     unsigned int mSize;
-    __host__ __device__
-    bool operator()(const float* aVector1, const float* aVector2) const
+    __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const
     {
         for (unsigned int i = 0; i < mSize; ++i)
         {
@@ -1239,7 +1326,6 @@ __global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigne
     aOutputData[2 * blockIdx.x] = sharedValue[0];
     aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
 }
-
 }
 
 CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
@@ -1280,7 +1366,8 @@ CudaMatrix Aurora::prod(const CudaMatrix &aMatrix)
  * @param aMatrix
  * @param aDirection 0 FORWARD, 1 BACKWARD
  */
-void ExecFFT(CudaMatrix& aMatrix,int aDirection){
+void ExecFFT(CudaMatrix &aMatrix, int aDirection)
+{
     cufftHandle plan;
     cudaStream_t stream = NULL;
     int batch_size = aMatrix.getDimSize(1);
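ExecFFT's locals (a cufftHandle plus one batch per matrix column) imply a batched 1-D complex-to-complex transform. The body lies outside this hunk, so the following is a hedged sketch of such a setup, not the commit's code; all names are illustrative:

    #include <cuda_runtime.h>
    #include <cufft.h>

    // Batched 1-D C2C transform, one batch per column, executed in place.
    void execFFTSketch(float *aDeviceComplexData, int aColEleCount, int aBatchCount, int aDirection)
    {
        cufftHandle plan;
        cufftPlan1d(&plan, aColEleCount, CUFFT_C2C, aBatchCount);
        cufftComplex *data = reinterpret_cast<cufftComplex *>(aDeviceComplexData);
        // per the doc comment above: 0 means FORWARD, 1 means BACKWARD
        cufftExecC2C(plan, data, data, aDirection == 0 ? CUFFT_FORWARD : CUFFT_INVERSE);
        cudaDeviceSynchronize();
        cufftDestroy(plan);
    }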
@@ -1305,16 +1392,18 @@ __global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int
 
     for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x)
     {
-        if(threadIdx.x + offset< aCopySize){
+        if (threadIdx.x + offset < aCopySize)
+        {
             aOutput[2 * idx_d + offset * 2] = aInputData[idx_s + offset];
             aOutput[2 * idx_d + offset * 2 + 1] = 0;
 
         }
-        else if(threadIdx.x + offset< aDesColEleCount){
+        else if (threadIdx.x + offset < aDesColEleCount)
+        {
             aOutput[2 * idx_d + offset * 2] = 0;
             aOutput[2 * idx_d + offset * 2 + 1] = 0;
         }
-        else{
+        else
+        {
             return;
         }
     }
@@ -1328,21 +1417,25 @@ __global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int
 
     for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x)
     {
-        if(threadIdx.x + offset< aCopySize){
+        if (threadIdx.x + offset < aCopySize)
+        {
             aOutput[2 * idx_d + offset * 2] = aInputData[idx_s * 2 + offset * 2];
             aOutput[2 * idx_d + offset * 2 + 1] = aInputData[idx_s * 2 + offset * 2 + 1];
         }
-        else if(threadIdx.x + offset< aDesColEleCount){
+        else if (threadIdx.x + offset < aDesColEleCount)
+        {
             aOutput[2 * idx_d + offset * 2] = 0;
             aOutput[2 * idx_d + offset * 2 + 1] = 0;
         }
-        else{
+        else
+        {
             return;
         }
     }
 }
 
-CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
+CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize)
+{
     size_t ColEleCount = (aFFTSize > 0) ? aFFTSize : aMatrix.getDimSize(0);
     // the non-zero values that actually need to be copied
     size_t needCopySize = (ColEleCount < aMatrix.getDimSize(0)) ? ColEleCount : aMatrix.getDimSize(0);
@@ -1350,10 +1443,12 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
     float *data = nullptr;
 
     cudaMalloc((void **)&data, sizeof(float) * 2 * bufferSize);
-    if (aMatrix.isComplex()){
+    if (aMatrix.isComplex())
+    {
         complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
     }
-    else{
+    else
+    {
         complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount);
     }
     cudaDeviceSynchronize();
@@ -1362,8 +1457,10 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
     return ret;
 }
 
-CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
-    if (!aMatrix.isComplex()){
+CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize)
+{
+    if (!aMatrix.isComplex())
+    {
         std::cerr << "ifft input must be complex value" << std::endl;
         return CudaMatrix();
     }
@@ -1379,14 +1476,16 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
     auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
     ExecFFT(ret, 1);
     float colEleCountf = 1.f / ColEleCount;
-    auto lambda = [=]__device__(const float& v){
+    auto lambda = [=] __device__(const float &v)
+    {
         return v * colEleCountf;
     };
     thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
     return ret;
 }
 
-__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){
+__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount)
+{
     unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
     unsigned int stride = aColEleCount / 2;
     for (int i = 0; i < stride; i += blockDim.x)
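cuFFT inverse transforms are unnormalized, which is why ifft scales every element by 1/N after ExecFFT; the range runs to getDataSize() * 2 because real and imaginary parts are interleaved. The same scaling on the host, for reference:

    #include <cstddef>
    #include <vector>

    // Host-side sketch of the 1/N post-scaling an unnormalized inverse FFT needs.
    void scaleInverseFFT(std::vector<float> &interleaved, size_t n)
    {
        const float inv = 1.0f / static_cast<float>(n);
        for (float &v : interleaved) // interleaved re/im pairs, 2*n floats
            v *= inv;
    }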
@@ -1401,7 +1500,8 @@ __global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){
 }
 
 __global__ void memcpyColKernel(float *aInputData, float *aOutputData, unsigned int aCopySize,
-                                unsigned int aSrcColEleCount,unsigned int aDesColEleCount){
+                                unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
+{
     unsigned int idx = blockIdx.x * aSrcColEleCount + threadIdx.x;
     unsigned int idx2 = blockIdx.x * aDesColEleCount + threadIdx.x;
     for (int i = 0; i < aCopySize; i += blockDim.x)
@@ -1413,12 +1513,16 @@ __global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned i
         }
     }
 }
 
-void Aurora::fftshift(CudaMatrix &aMatrix){
-    if (aMatrix.getDimSize(0) % 2 == 0) {
+void Aurora::fftshift(CudaMatrix &aMatrix)
+{
+    if (aMatrix.getDimSize(0) % 2 == 0)
+    {
         fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
             aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
         cudaDeviceSynchronize();
-    } else {
+    }
+    else
+    {
         int copySize = aMatrix.getDimSize(0) / 2 + 1;
         float *data = nullptr;
         cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType());
@@ -1442,11 +1546,15 @@ void Aurora::fftshift(CudaMatrix &aMatrix){
     }
 }
 
-void Aurora::ifftshift(CudaMatrix &aMatrix){
-    if (aMatrix.getDimSize(0) % 2 == 0) {
+void Aurora::ifftshift(CudaMatrix &aMatrix)
+{
+    if (aMatrix.getDimSize(0) % 2 == 0)
+    {
         fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
             aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
-    } else {
+    }
+    else
+    {
         int copySize = aMatrix.getDimSize(0) / 2 + 1;
         float *data = nullptr;
         cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType());
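For an even column length, fftshift (and its inverse) reduces to swapping the two halves of each column in place, which is the even branch above; the odd branch needs the extra copy buffer because the halves differ in length. A host-side reference for the even case (sketch):

    #include <algorithm>
    #include <vector>

    // For even N, fftshift is an in-place swap of the two halves.
    void fftshiftHostEven(std::vector<float> &column)
    {
        const size_t half = column.size() / 2;
        std::swap_ranges(column.begin(), column.begin() + half, column.begin() + half);
    }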
@@ -1556,7 +1664,8 @@ CudaMatrix Aurora::ifft_symmetric(const CudaMatrix &aMatrix, long aLength)
     return real(ifft(CudaMatrix::fromRawData(data, aLength, 1, 1, Complex)));
 }
 
-CudaMatrix Aurora::hilbert(const CudaMatrix &aMatrix){
+CudaMatrix Aurora::hilbert(const CudaMatrix &aMatrix)
+{
     auto x = fft(aMatrix);
     auto h = Aurora::zerosCuda(aMatrix.getDimSize(0), 1, 1);
     thrust::fill_n(thrust::device, h.getData(), aMatrix.getDimSize(0) / 2, 2.0f);
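hilbert builds the analytic signal by weighting the FFT bins: here the first N/2 bins get weight 2 and the rest stay 0. For comparison, the textbook (MATLAB-style) weighting keeps bin 0 and, for even N, the Nyquist bin at weight 1; a host-side sketch of those reference weights (not the commit's code):

    #include <cstddef>
    #include <vector>

    // Reference hilbert() weights; the device code above approximates this by
    // filling the first N/2 entries with 2.0f.
    std::vector<float> hilbertWeights(size_t n)
    {
        std::vector<float> h(n, 0.0f);
        h[0] = 1.0f;
        for (size_t k = 1; k < (n + 1) / 2; ++k)
            h[k] = 2.0f; // positive frequencies are doubled
        if (n % 2 == 0)
            h[n / 2] = 1.0f; // Nyquist bin kept at unit weight for even N
        return h;
    }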
Block a user