diff --git a/src/Function1D.cu b/src/Function1D.cu index 84c30c6..57d3065 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -8,6 +8,12 @@ #include #include #include +#if THRUST_VERSION >= 200000 +#include +#include +#include +#endif + #include #include #include @@ -29,109 +35,126 @@ namespace const int THREADS_PER_BLOCK_DIM3_Z = 8; } -__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void complexKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - aOutput[2*idx] = aInputData[idx]; - aOutput[2*idx + 1] = 0; + aOutput[2 * idx] = aInputData[idx]; + aOutput[2 * idx + 1] = 0; } } -__global__ void complexKernel(float* aInputRealData, float* aInputImagData, float* aOutput, unsigned int aSize) +__global__ void complexKernel(float *aInputRealData, float *aInputImagData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - aOutput[2*idx] = aInputRealData[idx]; - aOutput[2*idx + 1] = aInputImagData[idx]; + aOutput[2 * idx] = aInputRealData[idx]; + aOutput[2 * idx + 1] = aInputImagData[idx]; } } -CudaMatrix Aurora::complex(const CudaMatrix& aMatrix) +#if THRUST_VERSION >= 200000 +template +void callThrustTransform(InputIterator aInput, OutputIterator aOutput, UnaryFunction aFunc){ + thrust::transform(aInput, aOutput, aFunc); +} +#else +template +void callThrustTransform(InputIterator aInput, OutputIterator aOutput, UnaryFunction aFunc){ + thrust::transform(thrust::device, aInput, aOutput, aFunc); +} +#endif + + +CudaMatrix Aurora::complex(const CudaMatrix &aMatrix) { - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; complexKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex); } -CudaMatrix Aurora::complex(const CudaMatrix& aReal, const CudaMatrix& aImag) +CudaMatrix Aurora::complex(const CudaMatrix &aReal, const CudaMatrix &aImag) { - if(aReal.isComplex() || aImag.isComplex() || aReal.getDataSize() != aImag.getDataSize()) + if (aReal.isComplex() || aImag.isComplex() || aReal.getDataSize() != aImag.getDataSize()) { return CudaMatrix(); } size_t size = aReal.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size * Aurora::Complex); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size * Aurora::Complex); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; complexKernel<<>>(aReal.getData(), aImag.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aReal.getDimSize(0), aReal.getDimSize(1), aReal.getDimSize(2), Aurora::Complex); } -__global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void realKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - aOutput[idx] = aInputData[idx*2]; + aOutput[idx] = aInputData[idx * 2]; } } -CudaMatrix Aurora::real(const CudaMatrix& aMatrix) +CudaMatrix Aurora::real(const CudaMatrix &aMatrix) { - if(!aMatrix.isComplex()) + if (!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; realKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } -__global__ void imageKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void imageKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - aOutput[idx] = aInputData[idx*2 + 1]; + aOutput[idx] = aInputData[idx * 2 + 1]; } } -CudaMatrix Aurora::imag(const CudaMatrix& aMatrix) +CudaMatrix Aurora::imag(const CudaMatrix &aMatrix) { - if(!aMatrix.isComplex()) + if (!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; imageKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } -__global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void ceilKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) @@ -140,29 +163,29 @@ __global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize } } -CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix) +CudaMatrix Aurora::ceil(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix) +CudaMatrix Aurora::ceil(const CudaMatrix &&aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void roundKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) @@ -171,29 +194,29 @@ __global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSiz } } -CudaMatrix Aurora::round(const CudaMatrix& aMatrix) +CudaMatrix Aurora::round(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; roundKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::round(const CudaMatrix&& aMatrix) +CudaMatrix Aurora::round(const CudaMatrix &&aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; roundKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void floorKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) @@ -202,29 +225,29 @@ __global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSiz } } -CudaMatrix Aurora::floor(const CudaMatrix& aMatrix) +CudaMatrix Aurora::floor(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; floorKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix) +CudaMatrix Aurora::floor(const CudaMatrix &&aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; floorKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize) +__global__ void sqrtKernel(float *aInputData, float *aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) @@ -233,94 +256,94 @@ __global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize } } -CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix) +CudaMatrix Aurora::sqrt(const CudaMatrix &aMatrix) { - if(aMatrix.getValueType() == Aurora::Complex) + if (aMatrix.getValueType() == Aurora::Complex) { - std::cerr<<"sqrt not support complex"<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix) +CudaMatrix Aurora::sqrt(const CudaMatrix &&aMatrix) { - if(aMatrix.getValueType() == Aurora::Complex) + if (aMatrix.getValueType() == Aurora::Complex) { - std::cerr<<"sqrt not support complex"<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex) +__global__ void absKernel(float *aInputData, float *aOutput, unsigned int aSize, bool aIsComplex) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - if(aIsComplex) + if (aIsComplex) { - aOutput[idx] = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]); + aOutput[idx] = sqrt(aInputData[2 * idx] * aInputData[2 * idx] + aInputData[2 * idx + 1] * aInputData[2 * idx + 1]); } else { aOutput[idx] = abs(aInputData[idx]); - } + } } } -CudaMatrix Aurora::abs(const CudaMatrix& aMatrix) +CudaMatrix Aurora::abs(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); - cudaDeviceSynchronize(); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); -} - -CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix) -{ - size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } -__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex) +CudaMatrix Aurora::abs(const CudaMatrix &&aMatrix) +{ + size_t size = aMatrix.getDataSize(); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + absKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); +} + +__global__ void signKernel(float *aInputData, float *aOutput, unsigned int aSize, bool aIsComplex) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { - if(aIsComplex) + if (aIsComplex) { - float absValue = sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]); - aOutput[2*idx] = aInputData[2*idx] / absValue; - aOutput[2*idx + 1] = aInputData[2*idx + 1] / absValue; + float absValue = sqrt(aInputData[2 * idx] * aInputData[2 * idx] + aInputData[2 * idx + 1] * aInputData[2 * idx + 1]); + aOutput[2 * idx] = aInputData[2 * idx] / absValue; + aOutput[2 * idx + 1] = aInputData[2 * idx + 1] / absValue; return; } - if(aInputData[idx] < 0) + if (aInputData[idx] < 0) { aOutput[idx] = -1; } - else if(aInputData[idx] > 0) + else if (aInputData[idx] > 0) { aOutput[idx] = 1; } @@ -331,216 +354,156 @@ __global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize } } -CudaMatrix Aurora::sign(const CudaMatrix& aMatrix) +CudaMatrix Aurora::sign(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix) +CudaMatrix Aurora::sign(const CudaMatrix &&aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; signKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -void Aurora::nantoval(CudaMatrix& aMatrix,float val){ - auto lambda = [=] __host__ __device__ (const float& x){ - return ::isnan(x)?val:x; +void Aurora::nantoval(CudaMatrix &aMatrix, float val) +{ + auto lambda = [=] __host__ __device__(const float &x) { + return ::isnan(x) ? val : x; }; - thrust::transform(thrust::device,aMatrix.getData(), - aMatrix.getData()+aMatrix.getDataSize(),aMatrix.getData(),lambda); + thrust::transform(thrust::device, aMatrix.getData(), + aMatrix.getData() + aMatrix.getDataSize(), aMatrix.getData(), lambda); } -CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){ +CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix) +{ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x){ - return ::isnan(x)?1.0:0; + float *data = nullptr; + + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x) { + return ::isnan(x) ? 1.0 : 0; }; - thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){ +CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix) +{ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x){ - return ::isfinite(x)?1.0:0; + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x) { + return ::isfinite(x) ? 1.0 : 0; }; - thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){ - if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex()) +void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue) +{ + if (aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex()) { - std::cerr<<"padding only support real vector"<aIndex){ - aMatrix.setValue(aIndex, aValue); + if (aMatrix.getDataSize() > aIndex) + { + aMatrix.setValue(aIndex, aValue); return; } - //长度不足需补齐 - size_t size = (aIndex+1) ; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + // 长度不足需补齐 + size_t size = (aIndex + 1); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); cudaMemcpy(data, aMatrix.getData(), aMatrix.getDataSize(), cudaMemcpyDeviceToDevice); - thrust::fill_n(thrust::device, data+aMatrix.getDataSize(),size-aMatrix.getDataSize(),aValue); - aMatrix=CudaMatrix::fromRawData(data,size,1,1,aMatrix.getValueType()); + thrust::fill_n(thrust::device, data + aMatrix.getDataSize(), size - aMatrix.getDataSize(), aValue); + aMatrix = CudaMatrix::fromRawData(data, size, 1, 1, aMatrix.getValueType()); } - -CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix){ - return auroraNot(std::forward(aMatrix.deepCopy())); +CudaMatrix Aurora::auroraNot(const CudaMatrix &aMatrix) +{ + return auroraNot(std::forward(aMatrix.deepCopy())); } -CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){ +CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix) +{ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x){ - return x<=0?1.0:0; + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x) { + return x <= 0 ? 1.0 : 0; }; - thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } - -void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op) +void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newValue, CompareOp op) { switch (op) { case GT: { - auto lambda = [=] __host__ __device__ (const float& x){ - return x>compareValue?newValue:x; + auto lambda = [=] __host__ __device__(const float &x) { + return x > compareValue ? newValue : x; }; - thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); + thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda); break; } - case NG:{ - auto lambda = [=] __host__ __device__ (const float& x){ - return x<=compareValue?newValue:x; - }; - thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); - break; - } - case EQ:{ - auto lambda = [=] __host__ __device__ (const float& x){ - return x==compareValue?newValue:x; - }; - thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); - break; - } - case NE:{ - auto lambda = [=] __host__ __device__ (const float& x){ - return x!=compareValue?newValue:x; - }; - thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); - break; - } - case NL:{ - auto lambda = [=] __host__ __device__ (const float& x){ - return x>=compareValue?newValue:x; - }; - thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); - break; - } - case LT:{ - auto lambda = [=] __host__ __device__ (const float& x){ - return xcompareValue?newValue:y; + auto lambda = [=] __host__ __device__(const float &x) { + return x == compareValue ? newValue : x; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aValueMatrix.getDataSize(), - aValueMatrix.getData(), aValueMatrix.getData(), - lambda); + thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda); break; } - case NG:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x<=compareValue?newValue:y; + case NE: + { + auto lambda = [=] __host__ __device__(const float &x) { + return x != compareValue ? newValue : x; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aValueMatrix.getDataSize(), - aValueMatrix.getData(), aValueMatrix.getData(), - lambda); + thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda); break; } - case EQ:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x==compareValue?newValue:y; + case NL: + { + auto lambda = [=] __host__ __device__(const float &x) { + return x >= compareValue ? newValue : x; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aValueMatrix.getDataSize(), - aValueMatrix.getData(), aValueMatrix.getData(), - lambda); + thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda); break; } - case NE:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x!=compareValue?newValue:y; + case LT: + { + auto lambda = [=] __host__ __device__(const float &x) { + return x < compareValue ? newValue : x; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aValueMatrix.getDataSize(), - aValueMatrix.getData(), aValueMatrix.getData(), - lambda); - break; - } - case NL:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x>=compareValue?newValue:y; - }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aValueMatrix.getDataSize(), - aValueMatrix.getData(), aValueMatrix.getData(), - lambda); - break; - } - case LT:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return xy?newValue:x; - }; - thrust::transform(thrust::device,aDesAndCompareMatrix.getData(), - aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(), - aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), - lambda); - break; - } - case NG:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x<=y?newValue:x; - }; - thrust::transform(thrust::device,aDesAndCompareMatrix.getData(), - aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(), - aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), - lambda); - break; - } - case EQ:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x==y?newValue:x; - }; - thrust::transform(thrust::device,aDesAndCompareMatrix.getData(), - aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(), - aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), - lambda); - break; - } - case NE:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x!=y?newValue:x; - }; - thrust::transform(thrust::device,aDesAndCompareMatrix.getData(), - aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(), - aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), - lambda); - break; - } - case NL:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x>=y?newValue:x; - }; - thrust::transform(thrust::device,aDesAndCompareMatrix.getData(), - aDesAndCompareMatrix.getData()+aDesAndCompareMatrix.getDataSize(), - aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), - lambda); - break; - } - case LT:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return xcompareValue?y:x; + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x > compareValue ? newValue : y; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aCompareMatrix.getDataSize(), - aNewValueMatrix.getData(), aCompareMatrix.getData(), - lambda); + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aValueMatrix.getDataSize(), + aValueMatrix.getData(), aValueMatrix.getData(), + lambda); break; } - case NG:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x<=compareValue?y:x; + case NG: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x <= compareValue ? newValue : y; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aCompareMatrix.getDataSize(), - aNewValueMatrix.getData(), aCompareMatrix.getData(), - lambda); + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aValueMatrix.getDataSize(), + aValueMatrix.getData(), aValueMatrix.getData(), + lambda); break; } - case EQ:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x==compareValue?y:x; + case EQ: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x == compareValue ? newValue : y; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aCompareMatrix.getDataSize(), - aNewValueMatrix.getData(), aCompareMatrix.getData(), - lambda); + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aValueMatrix.getDataSize(), + aValueMatrix.getData(), aValueMatrix.getData(), + lambda); break; } - case NE:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x!=compareValue?y:x; + case NE: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x != compareValue ? newValue : y; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aCompareMatrix.getDataSize(), - aNewValueMatrix.getData(), aCompareMatrix.getData(), - lambda); + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aValueMatrix.getDataSize(), + aValueMatrix.getData(), aValueMatrix.getData(), + lambda); break; } - case NL:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x>=compareValue?y:x; + case NL: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x >= compareValue ? newValue : y; }; - thrust::transform(thrust::device,aCompareMatrix.getData(), - aCompareMatrix.getData()+aCompareMatrix.getDataSize(), - aNewValueMatrix.getData(), aCompareMatrix.getData(), - lambda); + thrust::transform(aCompareMatrix.getData(), + aCompareMatrix.getData() + aValueMatrix.getDataSize(), + aValueMatrix.getData(), aValueMatrix.getData(), + lambda); break; } - case LT:{ - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return x y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + case NG: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x <= y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + case EQ: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x == y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + case NE: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x != y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + case NL: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x >= y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + case LT: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x < y ? newValue : x; + }; + thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), + aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), + aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), + lambda); + break; + } + default: + break; + } +} + +void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatrix &aNewValueMatrix, CompareOp op) +{ + switch (op) + { + case GT: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x > compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + case NG: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x <= compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + case EQ: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x == compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + case NE: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x != compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + case NL: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x >= compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + case LT: + { + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return x < compareValue ? y : x; + }; + thrust::transform(thrust::device, aCompareMatrix.getData(), + aCompareMatrix.getData() + aCompareMatrix.getDataSize(), + aNewValueMatrix.getData(), aCompareMatrix.getData(), + lambda); + break; + } + default: + break; + } +} + +__global__ void repMatKernel(float *aInputData, unsigned int aInputRows, unsigned int aInputColumns, + float *aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex) { unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; - if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) + if (idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) { return; } - if(aIsComplex) + if (aIsComplex) { unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX); unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows); @@ -712,29 +761,29 @@ __global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigne } } -CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes) +CudaMatrix Aurora::repmat(const CudaMatrix &aMatrix, int aRowTimes, int aColumnTimes) { - if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) + if (aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) { return CudaMatrix(); } size_t rowSize = aMatrix.getDimSize(0) * aRowTimes; size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1); - dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X, - (columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1) / THREADS_PER_BLOCK_DIM2_X, + (columnSize + THREADS_PER_BLOCK_DIM2_Y - 1) / THREADS_PER_BLOCK_DIM2_Y, 1); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); repMatKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes) +CudaMatrix Aurora::repmat(const CudaMatrix &aMatrix, int aRowTimes, int aColumnTimes, int aSliceTimes) { - if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) + if (aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) { return CudaMatrix(); } @@ -743,31 +792,31 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes; dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); - dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, - (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, - (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1) / THREADS_PER_BLOCK_DIM3_X, + (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1) / THREADS_PER_BLOCK_DIM3_Y, + (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1) / THREADS_PER_BLOCK_DIM3_Z); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); repMatKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); } -__global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices, - float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex) +__global__ void repMat3DKernel(float *aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices, + float *aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex) { unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; - if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) + if (idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) { return; } - if(aIsComplex) + if (aIsComplex) { unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX); unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows); @@ -778,12 +827,11 @@ __global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsig { aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows]; } - } -CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes) +CudaMatrix Aurora::repmat3d(const CudaMatrix &aMatrix, int aRowTimes, int aColumnTimes, int aSliceTimes) { - if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull()) + if (aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull()) { return CudaMatrix(); } @@ -792,57 +840,57 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes; dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); - dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, - (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, - (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1) / THREADS_PER_BLOCK_DIM3_X, + (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1) / THREADS_PER_BLOCK_DIM3_Y, + (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1) / THREADS_PER_BLOCK_DIM3_Z); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - repMat3DKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + repMat3DKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); } -__global__ void logKernel(float* aInputData, float* aOutput, unsigned int aInputSize, int aBaseNum) +__global__ void logKernel(float *aInputData, float *aOutput, unsigned int aInputSize, int aBaseNum) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { - if(aBaseNum == -1) + if (aBaseNum == -1) { - aOutput[idx] = logf(aInputData[idx]); + aOutput[idx] = logf(aInputData[idx]); } else { float value = logf(aBaseNum); - aOutput[idx] = logf(aInputData[idx]) / value; + aOutput[idx] = logf(aInputData[idx]) / value; } } } -CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum) +CudaMatrix Aurora::log(const CudaMatrix &aMatrix, int aBaseNum) { - if(aMatrix.getValueType() == Aurora::Complex) + if (aMatrix.getValueType() == Aurora::Complex) { - std::cerr<<"log not support complex"<>>(aMatrix.getData(), data, size, aBaseNum); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void expKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) +__global__ void expKernel(float *aInputData, float *aOutput, unsigned int aInputSize, bool aIsComplex) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { - if(aIsComplex) + if (aIsComplex) { unsigned int index = 2 * idx; float expReal = expf(aInputData[index]); @@ -851,167 +899,166 @@ __global__ void expKernel(float* aInputData, float* aOutput, unsigned int aInput } else { - aOutput[idx] = expf(aInputData[idx]); + aOutput[idx] = expf(aInputData[idx]); } } } -CudaMatrix Aurora::exp(const CudaMatrix& aMatrix) +CudaMatrix Aurora::exp(const CudaMatrix &aMatrix) { size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size * aMatrix.getValueType()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; expKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void modKernel(float* aInputData, float* aOutput, unsigned int aInputSize, float aModValue) +__global__ void modKernel(float *aInputData, float *aOutput, unsigned int aInputSize, float aModValue) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { - aOutput[idx] = fmodf(aInputData[idx], aModValue); + aOutput[idx] = fmodf(aInputData[idx], aModValue); } } -CudaMatrix Aurora::mod(const CudaMatrix& aMatrix, float aValue) +CudaMatrix Aurora::mod(const CudaMatrix &aMatrix, float aValue) { - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { - std::cerr<<"mod not support complex"<>>(aMatrix.getData(), data, size, aValue); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } -__global__ void acosKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +__global__ void acosKernel(float *aInputData, float *aOutput, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { - aOutput[idx] = acosf(aInputData[idx]); + aOutput[idx] = acosf(aInputData[idx]); } } -CudaMatrix Aurora::acos(const CudaMatrix& aMatrix) +CudaMatrix Aurora::acos(const CudaMatrix &aMatrix) { - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { - std::cerr<<"acos not support complex"<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } -__global__ void acosdKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +__global__ void acosdKernel(float *aInputData, float *aOutput, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { - aOutput[idx] = acosf(aInputData[idx]) * 180 / PI; + aOutput[idx] = acosf(aInputData[idx]) * 180 / PI; } } -CudaMatrix Aurora::acosd(const CudaMatrix& aMatrix) +CudaMatrix Aurora::acosd(const CudaMatrix &aMatrix) { - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { - std::cerr<<"acos not support complex"<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } -__global__ void conjKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +__global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { unsigned int index = idx * 2; - aOutput[index] = aInputData[index]; + aOutput[index] = aInputData[index]; aOutput[index + 1] = -aInputData[index + 1]; } } -CudaMatrix Aurora::conj(const CudaMatrix& aMatrix) +CudaMatrix Aurora::conj(const CudaMatrix &aMatrix) { - if(!aMatrix.isComplex()) + if (!aMatrix.isComplex()) { - return CudaMatrix::copyFromRawData(aMatrix.getData(),aMatrix.getDimSize(0),aMatrix.getDimSize(1),aMatrix.getDimSize(2)); + return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); } size_t size = aMatrix.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size * aMatrix.getValueType()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; conjKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } - -float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod) +float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod) { float resultValue = 0; - if(aMatrix.getDims() > 2) + if (aMatrix.getDims() > 2) { - std::cerr<<"norm not support 3d matrix"<()); + resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize(), 0.0, thrust::plus()); return resultValue; } else { CudaMatrix result = aMatrix.deepCopy(); - thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); - resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); + thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); + resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); return std::sqrt(resultValue); } } - //for 2 dims - if(aNormMethod == Aurora::NormF) + // for 2 dims + if (aNormMethod == Aurora::NormF) { CudaMatrix result = aMatrix.deepCopy(); - thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); - resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); + thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); + resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); return std::sqrt(resultValue); } - else if(aNormMethod == Aurora::Norm1) + else if (aNormMethod == Aurora::Norm1) { - for(int i=0; i()); - if(resultValue < tempValue) + float tempValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize(), 0.0, thrust::plus()); + if (resultValue < tempValue) { resultValue = tempValue; } @@ -1020,18 +1067,18 @@ float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod) } else { - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { - std::cerr<<"norm2 not support 2d complex matrix"< resultValue) { resultValue = S[i]; } } - if (d_S ) cudaFree(d_S); - if (d_U ) cudaFree(d_U); - if (d_VT) cudaFree(d_VT); - if (devInfo) cudaFree(devInfo); - if (d_work) cudaFree(d_work); - if (cusolverH) cusolverDnDestroy(cusolverH); - if (stream) cudaStreamDestroy(stream); + if (d_S) + cudaFree(d_S); + if (d_U) + cudaFree(d_U); + if (d_VT) + cudaFree(d_VT); + if (devInfo) + cudaFree(devInfo); + if (d_work) + cudaFree(d_work); + if (cusolverH) + cusolverDnDestroy(cusolverH); + if (stream) + cudaStreamDestroy(stream); return resultValue; } } -__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex) +__global__ void transposeKernel(float *aInputData, float *aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; @@ -1093,31 +1147,30 @@ __global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int { unsigned int inputindex = idy * aRowSize + idx; unsigned int outputIndex = idx * aColumnSize + idy; - if(aIsComplex) + if (aIsComplex) { - aOutput[2*outputIndex] = aInputData[2*inputindex]; - aOutput[2*outputIndex + 1] = aInputData[2*inputindex + 1]; + aOutput[2 * outputIndex] = aInputData[2 * inputindex]; + aOutput[2 * outputIndex + 1] = aInputData[2 * inputindex + 1]; } else { - aOutput[outputIndex] = aInputData[inputindex]; + aOutput[outputIndex] = aInputData[inputindex]; } } } - -CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix) +CudaMatrix Aurora::transpose(const CudaMatrix &aMatrix) { - //not surpport for 3 dims. - if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1) + // not surpport for 3 dims. + if (aMatrix.isNull() || aMatrix.getDimSize(2) > 1) { - std::cerr<<"transpose not support 3d complex matrix"<>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex()); @@ -1125,12 +1178,12 @@ CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix) return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType()); } -CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) +CudaMatrix Aurora::horzcat(const CudaMatrix &aMatrix1, const CudaMatrix &aMatrix2) { - if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || - aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType()) + if (aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || + aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType()) { - std::cerr<<"horzcat must have same rows and slices"<= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize) + if (idX >= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize) { return; } - if(aIsComplex) + if (aIsComplex) { - if(idX < aInputData1RowSize) + if (idX < aInputData1RowSize) { unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX; unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX; - aOutput[2*outputIndex] = aInputData1[2*inputIndex]; - aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1]; + aOutput[2 * outputIndex] = aInputData1[2 * inputIndex]; + aOutput[2 * outputIndex + 1] = aInputData1[2 * inputIndex + 1]; } else { unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize; - unsigned int outputIndex =idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX; - aOutput[2*outputIndex] = aInputData2[2*inputIndex]; - aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1]; + unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX; + aOutput[2 * outputIndex] = aInputData2[2 * inputIndex]; + aOutput[2 * outputIndex + 1] = aInputData2[2 * inputIndex + 1]; } } else { - if(idX < aInputData1RowSize) + if (idX < aInputData1RowSize) { aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX]; } @@ -1191,13 +1244,12 @@ __global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSiz aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize]; } } - } -CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) +CudaMatrix Aurora::vertcat(const CudaMatrix &aMatrix1, const CudaMatrix &aMatrix2) { - if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || - aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType()) + if (aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || + aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType()) { return CudaMatrix(); } @@ -1205,115 +1257,115 @@ CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix size_t outputColumns = aMatrix1.getDimSize(1); size_t outputSlices = aMatrix1.getDimSize(2); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType()); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType()); dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); - dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, - (outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, - (outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); - vertcatKernel<<>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0), + dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1) / THREADS_PER_BLOCK_DIM3_X, + (outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1) / THREADS_PER_BLOCK_DIM3_Y, + (outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1) / THREADS_PER_BLOCK_DIM3_Z); + vertcatKernel<<>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0), data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType()); } -__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex) +__global__ void vecnorm1Kernel(float *aInputData, unsigned int aInputRowSize, float *aOutput, bool aIsComplex) { __shared__ float sharedValue[THREADS_PER_BLOCK]; sharedValue[threadIdx.x] = 0; - if(aIsComplex) + if (aIsComplex) { - for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - unsigned int idx = blockIdx.x*aInputRowSize + indexByRows; - sharedValue[threadIdx.x] += sqrt(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]); + unsigned int idx = blockIdx.x * aInputRowSize + indexByRows; + sharedValue[threadIdx.x] += sqrt(aInputData[2 * idx] * aInputData[2 * idx] + aInputData[2 * idx + 1] * aInputData[2 * idx + 1]); } } } else { - for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - sharedValue[threadIdx.x] += abs(aInputData[blockIdx.x*aInputRowSize + indexByRows]); + sharedValue[threadIdx.x] += abs(aInputData[blockIdx.x * aInputRowSize + indexByRows]); } } } __syncthreads(); - for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { - if(threadIdx.x < i) + if (threadIdx.x < i) { sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i]; } __syncthreads(); } - aOutput[blockIdx.x] = sharedValue[0]; + aOutput[blockIdx.x] = sharedValue[0]; } -__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex) +__global__ void vecnorm2Kernel(float *aInputData, unsigned int aInputRowSize, float *aOutput, bool aIsComplex) { __shared__ float sharedValue[THREADS_PER_BLOCK]; sharedValue[threadIdx.x] = 0; - for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - if(aIsComplex) + if (aIsComplex) { - unsigned int idx = blockIdx.x*aInputRowSize + indexByRows; + unsigned int idx = blockIdx.x * aInputRowSize + indexByRows; sharedValue[threadIdx.x] += aInputData[2 * idx] * aInputData[2 * idx]; sharedValue[threadIdx.x] += aInputData[2 * idx + 1] * aInputData[2 * idx + 1]; } else { - sharedValue[threadIdx.x] += aInputData[blockIdx.x*aInputRowSize + indexByRows] * aInputData[blockIdx.x*aInputRowSize + indexByRows]; + sharedValue[threadIdx.x] += aInputData[blockIdx.x * aInputRowSize + indexByRows] * aInputData[blockIdx.x * aInputRowSize + indexByRows]; } } - } + } __syncthreads(); - for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { - if(threadIdx.x < i) + if (threadIdx.x < i) { sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i]; } __syncthreads(); } - aOutput[blockIdx.x] = sqrt(sharedValue[0]); + aOutput[blockIdx.x] = sqrt(sharedValue[0]); } -CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim) +CudaMatrix Aurora::vecnorm(const CudaMatrix &aMatrix, NormMethod aNormMethod, int aDim) { - //only surpport aDim = 1 for now. - if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull()) + // only surpport aDim = 1 for now. + if (aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull()) { return CudaMatrix(); } unsigned int column = aMatrix.getDimSize(1); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * column); - if(aNormMethod == Aurora::Norm1) + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * column); + if (aNormMethod == Aurora::Norm1) { vecnorm1Kernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex()); } - else if(aNormMethod == Aurora::Norm2) + else if (aNormMethod == Aurora::Norm2) { vecnorm2Kernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex()); } cudaDeviceSynchronize(); - return Aurora::CudaMatrix::fromRawData(data,column); + return Aurora::CudaMatrix::fromRawData(data, column); } -__global__ void linspaceKernel(float* aOutput, unsigned int aOutputSize, float aStartNum, float aStepNum) +__global__ void linspaceKernel(float *aOutput, unsigned int aOutputSize, float aStartNum, float aStepNum) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aOutputSize) @@ -1325,107 +1377,107 @@ __global__ void linspaceKernel(float* aOutput, unsigned int aOutputSize, float a CudaMatrix Aurora::linspaceCuda(float aStart, float aEnd, int aNum) { float step = (aEnd - aStart) / (aNum - 1); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * aNum); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * aNum); int blocksPerGrid = (aNum + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; linspaceKernel<<>>(data, aNum, aStart, step); cudaDeviceSynchronize(); - return Aurora::CudaMatrix::fromRawData(data,aNum); + return Aurora::CudaMatrix::fromRawData(data, aNum); } -CudaMatrix Aurora::auroraUnion(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) +CudaMatrix Aurora::auroraUnion(const CudaMatrix &aMatrix1, const CudaMatrix &aMatrix2) { - if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex()) + if (aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex()) { - std::cerr<<"auroraUnion not support complex cudamatrix"<>>(aMatrix1.getData(), result.getData(), aMatrix1.getDataSize(), iaResult, size); cudaDeviceSynchronize(); - aIa = CudaMatrix::fromRawData(iaResult,size); + aIa = CudaMatrix::fromRawData(iaResult, size); return result; } -CudaMatrix Aurora::reshape(const CudaMatrix& aMatrix, int aRows, int aColumns, int aSlices) +CudaMatrix Aurora::reshape(const CudaMatrix &aMatrix, int aRows, int aColumns, int aSlices) { - if(aMatrix.isNull() || (aMatrix.getDataSize() != aRows * aColumns * aSlices)) + if (aMatrix.isNull() || (aMatrix.getDataSize() != aRows * aColumns * aSlices)) { - std::cerr<<"reshape diffirent size with cudamatrix"<>>(aMatrix1.getData(), aMatrix2.getData(), aMatrix1.getDataSize(), data, size); @@ -1461,7 +1513,7 @@ CudaMatrix Aurora::xcorr(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) return CudaMatrix::fromRawData(data, size); } -CudaMatrix Aurora::deleteColumn(const CudaMatrix& aMatrix, int aColumnIndex) +CudaMatrix Aurora::deleteColumn(const CudaMatrix &aMatrix, int aColumnIndex) { int rows = aMatrix.getDimSize(0); int columns = aMatrix.getDimSize(1); @@ -1470,16 +1522,16 @@ CudaMatrix Aurora::deleteColumn(const CudaMatrix& aMatrix, int aColumnIndex) return aMatrix; } - float* resultData = nullptr; - cudaMalloc((void**)&resultData, sizeof(float) * rows* (columns-1)); - if(aColumnIndex == 0) + float *resultData = nullptr; + cudaMalloc((void **)&resultData, sizeof(float) * rows * (columns - 1)); + if (aColumnIndex == 0) { - cudaMemcpy(resultData, aMatrix.getData() + rows, sizeof(float) * rows* (columns-1), cudaMemcpyDeviceToDevice); + cudaMemcpy(resultData, aMatrix.getData() + rows, sizeof(float) * rows * (columns - 1), cudaMemcpyDeviceToDevice); } - else if(aColumnIndex == (columns - 1)) + else if (aColumnIndex == (columns - 1)) { - cblas_scopy(rows* (columns-1), aMatrix.getData(), 1, resultData, 1); - cudaMemcpy(resultData, aMatrix.getData(), sizeof(float) * rows* (columns-1), cudaMemcpyDeviceToDevice); + cblas_scopy(rows * (columns - 1), aMatrix.getData(), 1, resultData, 1); + cudaMemcpy(resultData, aMatrix.getData(), sizeof(float) * rows * (columns - 1), cudaMemcpyDeviceToDevice); } else { @@ -1487,7 +1539,7 @@ CudaMatrix Aurora::deleteColumn(const CudaMatrix& aMatrix, int aColumnIndex) cudaMemcpy(resultData + rows * aColumnIndex, aMatrix.getData() + rows * (aColumnIndex + 1), sizeof(float) * rows * (columns - aColumnIndex - 1), cudaMemcpyDeviceToDevice); } - return CudaMatrix::fromRawData(resultData, rows, columns-1); + return CudaMatrix::fromRawData(resultData, rows, columns - 1); } CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, float aEndValue) @@ -1497,7 +1549,7 @@ CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, f matrixData.push_back(tempValue); long long compare1 = std::round(aEndValue * 10e13); long long compare2 = std::round(tempValue * 10e13); - while(std::round(tempValue* 10e13) <= compare1) + while (std::round(tempValue * 10e13) <= compare1) { tempValue += aStepValue; matrixData.push_back(tempValue); @@ -1511,20 +1563,18 @@ CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, f struct compareMatrixByRows { compareMatrixByRows(unsigned int aSize) - : mSize(aSize) - { - }; + : mSize(aSize) { + }; unsigned int mSize; - __host__ __device__ - bool operator()(const float* aVector1, const float* aVector2) const + __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const { - for(unsigned int i=0; i aVector2[i]) + else if (aVector1[i] > aVector2[i]) { return false; } @@ -1536,16 +1586,14 @@ struct compareMatrixByRows struct uniqueMatrixByRows { uniqueMatrixByRows(unsigned int aSize) - : mSize(aSize) - { - }; + : mSize(aSize) { + }; unsigned int mSize; - __host__ __device__ - bool operator()(const float* aVector1, const float* aVector2) const + __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const { - for(unsigned int i=0; i vector(columns); - float* indexResult = new float[columns]; - std::vector vectorCopy(columns); - for(unsigned int i=0; i vector(columns); + float *indexResult = new float[columns]; + std::vector vectorCopy(columns); + for (unsigned int i = 0; i < columns; ++i) { - vector[i] = transposeMatrix.getData() + i*rows; + vector[i] = transposeMatrix.getData() + i * rows; vectorCopy[i] = vector[i]; } thrust::sort(thrust::device, vector.begin(), vector.begin() + columns, compareMatrixByRows(rows)); auto end = thrust::unique(thrust::device, vector.begin(), vector.end(), uniqueMatrixByRows(rows)); - for(unsigned int i=0; i= size) return; + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= size) + return; short value = aSrc[idx]; - ushort exponent=(ushort)value; + ushort exponent = (ushort)value; exponent = (exponent >> 11) & CONVERT_AND_VALUE; short sign = value; unsigned int sign_bit = (unsigned int)(sign < 0 ? 1 : 0); - ushort fraction3= (ushort)value; + ushort fraction3 = (ushort)value; fraction3 &= CONVERT_AND_VALUE_2; unsigned int hidden_bit = sign_bit * (!exponent ? 1 : 0) * CONVERT_MUL_VALUE + - ((!sign_bit && exponent) ? 1 : 0) * CONVERT_MUL_VALUE; + ((!sign_bit && exponent) ? 1 : 0) * CONVERT_MUL_VALUE; unsigned int temp = fraction3 + hidden_bit + sign_bit * CONVERT_ADD_VALUE; - int ret = (int)(exponent> 1 ? (temp << (exponent - 1)): temp); + int ret = (int)(exponent > 1 ? (temp << (exponent - 1)) : temp); aDes[idx] = (float)ret; } -CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix& aData, int aRows, int aColumns) +CudaMatrix Aurora::convertfp16tofloatCuda(const CudaMatrix &aData, int aRows, int aColumns) { - unsigned int size = aRows*aColumns; - //uint16变换为float(32位)输出大小翻倍 - float* output = nullptr; - cudaMalloc((void**)&output,size*sizeof(float)); + unsigned int size = aRows * aColumns; + // uint16变换为float(32位)输出大小翻倍 + float *output = nullptr; + cudaMalloc((void **)&output, size * sizeof(float)); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; convertValueKernel<<>>(aData.getData(), output, size); cudaDeviceSynchronize(); diff --git a/src/Function2D.cu b/src/Function2D.cu index ddf01c6..3dcb6fa 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -11,6 +11,10 @@ #include #include +#if THRUST_VERSION >= 200000 +#include +#include +#endif #include #include #include @@ -36,528 +40,587 @@ namespace const int THREADS_PER_BLOCK = 256; } - -__global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize) +__global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize) { - //确定每个thread的index + // 确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : -FLT_MAX; + shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset0; i>>=1) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { - if (threadIdx.x < i) { + if (threadIdx.x < i) + { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0]; } } -__global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) +__global__ void maxRowKernel(float *aInputData, float *aOutput, unsigned int aColSize, unsigned int aRowSize) { - //确定每个thread的基础index - unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; + // 确定每个thread的基础index + unsigned int idx = threadIdx.x * aColSize + blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX; + shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { - if(threadIdx.x+offset < aRowSize){ - shared_data[threadIdx.x]= fmaxf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); + for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x) + { + if (threadIdx.x + offset < aRowSize) + { + shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]); } __syncthreads(); } // 规约最前面一段 - for (int i = blockDim.x/2; i >0; i>>=1) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { - if (threadIdx.x < i) { + if (threadIdx.x < i) + { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0]; } } -CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) { - long a,b; - return max(aMatrix,direction,a,b); +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) +{ + long a, b; + return max(aMatrix, direction, a, b); } -CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { - std::cerr<< (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") - << std::endl; + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { + std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; return CudaMatrix(); } - //针对向量行等于列 - if (aMatrix.isVector()){ + // 针对向量行等于列 + if (aMatrix.isVector()) + { direction = All; } switch (direction) { - case All: { - thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); - auto max_iter = thrust::max_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); - int index = max_iter-d_ptr; - rowIdx = index%aMatrix.getDimSize(0); - colIdx = index/aMatrix.getDimSize(0); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float)); - auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); - ret.setValue(0, *max_iter); - return ret; - } - case Row: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int rowCount = aMatrix.getDimSize(1); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); - maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); - return ret; - } - case Column: - default: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int colCount = aMatrix.getDimSize(0); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); - maxColKernel<<>>(matData,retData,colCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); - return ret; - } + case All: + { + thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); + auto max_iter = thrust::max_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize()); + int index = max_iter - d_ptr; + rowIdx = index % aMatrix.getDimSize(0); + colIdx = index / aMatrix.getDimSize(0); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float)); + auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1); + ret.setValue(0, *max_iter); + return ret; + } + case Row: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0)); + maxRowKernel<<>>(matData, retData, aMatrix.getDimSize(0), rowCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1); + return ret; + } + case Column: + default: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int colCount = aMatrix.getDimSize(0); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1)); + maxColKernel<<>>(matData, retData, colCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1)); + return ret; + } } } - -CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) { - //col-vec x mat - if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { +CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) +{ + // col-vec x mat + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) + { size_t size = aMat.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__(const float& x, const float& y) { + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fmaxf(x, y); }; - for (int i = 0; i < aMat.getDimSize(1); ++i) { + for (int i = 0; i < aMat.getDimSize(1); ++i) + { thrust::transform(thrust::device, aVec.getData(), - aVec.getData() + aVec.getDataSize(), - aMat.getData() + aMat.getDimSize(0) * i, - data + aMat.getDimSize(0) * i, lambda); + aVec.getData() + aVec.getDataSize(), + aMat.getData() + aMat.getDimSize(0) * i, + data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), - aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); + aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } // row-vec x mat else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1)) { - size_t size = aMat.getDataSize() ; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - for (int i = 0; i < aMat.getDimSize(1); ++i) { + size_t size = aMat.getDataSize(); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + for (int i = 0; i < aMat.getDimSize(1); ++i) + { float v = aVec.getValue(i); - auto lambda = [=] __host__ __device__(const float& x) { + auto lambda = [=] __host__ __device__(const float &x) { return fmaxf(x, v); }; thrust::transform(thrust::device, - aMat.getData() + aMat.getDimSize(0)*i, - aMat.getData() + aMat.getDimSize(0) * (i+1), - data + aMat.getDimSize(0) * i, lambda); + aMat.getData() + aMat.getDimSize(0) * i, + aMat.getData() + aMat.getDimSize(0) * (i + 1), + data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), - aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); + aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } std::cerr - << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" - << std::endl; + << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; return CudaMatrix(); - } -CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) +{ + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { std::cerr - << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") - << std::endl; + << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; return CudaMatrix(); } - if (aOther.getDimSize(2)>1 || aOther.isComplex()) { + if (aOther.getDimSize(2) > 1 || aOther.isComplex()) + { std::cerr - << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") - << std::endl; + << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; return CudaMatrix(); } - //same shape - if (aMatrix.compareShape(aOther)){ + // same shape + if (aMatrix.compareShape(aOther)) + { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return fmaxf(x,y); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return fmaxf(x, y); }; - thrust::transform(thrust::device,aMatrix.getData(), - aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), + aMatrix.getData() + aMatrix.getDataSize(), aOther.getData(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } // one is scalar - else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ - float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); - auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; + else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) + { + float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0); + auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix; return max(matrix, scalar); } - else if (aMatrix.isVector()) { - return ::vxmMax(aMatrix,aOther); -} + else if (aMatrix.isVector()) + { + return ::vxmMax(aMatrix, aOther); + } else if (aOther.isVector()) { - return ::vxmMax(aOther,aMatrix); + return ::vxmMax(aOther, aMatrix); } std::cerr - << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" - << std::endl; + << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; return CudaMatrix(); } -CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue){ +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue) +{ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x){ - return fmaxf(x,aValue); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x) { + return fmaxf(x, aValue); }; - thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aColSize) +__global__ void minColKernel(float *aInputData, float *aOutput, unsigned int aColSize) { - //确定每个thread的index + // 确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : FLT_MAX; + shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset0; i>>=1) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { - if (threadIdx.x < i) { + if (threadIdx.x < i) + { shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0]; } } -__global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) +__global__ void minRowKernel(float *aInputData, float *aOutput, unsigned int aColSize, unsigned int aRowSize) { - //确定每个thread的基础index - unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; + // 确定每个thread的基础index + unsigned int idx = threadIdx.x * aColSize + blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX; + shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { - if(threadIdx.x+offset < aRowSize){ - shared_data[threadIdx.x]= fminf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); + for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x) + { + if (threadIdx.x + offset < aRowSize) + { + shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]); } __syncthreads(); } // 规约最前面一段 - for (int i = blockDim.x/2; i >0; i>>=1) { - if (threadIdx.x < i) { - shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]); + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { + shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0]; } } -CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) { - long a,b; - return min(aMatrix,direction,a,b); +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) +{ + long a, b; + return min(aMatrix, direction, a, b); } -CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { - std::cerr<< (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") - << std::endl; + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { + std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; return CudaMatrix(); } - //针对向量行等于列 - if (aMatrix.isVector()){ + // 针对向量行等于列 + if (aMatrix.isVector()) + { direction = All; } switch (direction) { - case All: { - thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); - auto max_iter = thrust::min_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); - int index = max_iter-d_ptr; - rowIdx = index%aMatrix.getDimSize(0); - colIdx = index/aMatrix.getDimSize(0); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float)); - auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); - ret.setValue(0, *max_iter); - return ret; - } - case Row: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int rowCount = aMatrix.getDimSize(1); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); - minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); - return ret; - } - case Column: - default: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int colCount = aMatrix.getDimSize(0); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); - minColKernel<<>>(matData,retData,colCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); - return ret; - } + case All: + { + thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); + auto max_iter = thrust::min_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize()); + int index = max_iter - d_ptr; + rowIdx = index % aMatrix.getDimSize(0); + colIdx = index / aMatrix.getDimSize(0); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float)); + auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1); + ret.setValue(0, *max_iter); + return ret; + } + case Row: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0)); + minRowKernel<<>>(matData, retData, aMatrix.getDimSize(0), rowCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1); + return ret; + } + case Column: + default: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int colCount = aMatrix.getDimSize(0); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1)); + minColKernel<<>>(matData, retData, colCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1)); + return ret; + } } } -CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) { - //col-vec x mat - if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { +CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) +{ + // col-vec x mat + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) + { size_t size = aMat.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__(const float& x, const float& y) { + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fminf(x, y); }; - for (int i = 0; i < aMat.getDimSize(1); ++i) { + for (int i = 0; i < aMat.getDimSize(1); ++i) + { thrust::transform(thrust::device, aVec.getData(), - aVec.getData() + aVec.getDataSize(), - aMat.getData() + aMat.getDimSize(0) * i, - data + aMat.getDimSize(0) * i, lambda); + aVec.getData() + aVec.getDataSize(), + aMat.getData() + aMat.getDimSize(0) * i, + data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), - aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); + aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } // row-vec x mat else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1)) { - size_t size = aMat.getDataSize() ; - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - for (int i = 0; i < aMat.getDimSize(1); ++i) { + size_t size = aMat.getDataSize(); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + for (int i = 0; i < aMat.getDimSize(1); ++i) + { float v = aVec.getValue(i); - auto lambda = [=] __host__ __device__(const float& x) { + auto lambda = [=] __host__ __device__(const float &x) { return fminf(x, v); }; thrust::transform(thrust::device, - aMat.getData() + aMat.getDimSize(0)*i, - aMat.getData() + aMat.getDimSize(0) * (i+1), - data + aMat.getDimSize(0) * i, lambda); + aMat.getData() + aMat.getDimSize(0) * i, + aMat.getData() + aMat.getDimSize(0) * (i + 1), + data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), - aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); + aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } std::cerr - << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" - << std::endl; + << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; return CudaMatrix(); - } -CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) +{ + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { std::cerr - << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") - << std::endl; + << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; return CudaMatrix(); } - if (aOther.getDimSize(2)>1 || aOther.isComplex()) { + if (aOther.getDimSize(2) > 1 || aOther.isComplex()) + { std::cerr - << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") - << std::endl; + << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; return CudaMatrix(); } - //same shape - if (aMatrix.compareShape(aOther)){ + // same shape + if (aMatrix.compareShape(aOther)) + { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x, const float& y){ - return fminf(x,y); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x, const float &y) { + return fminf(x, y); }; - thrust::transform(thrust::device,aMatrix.getData(), - aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), + aMatrix.getData() + aMatrix.getDataSize(), aOther.getData(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } // one is scalar - else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ - float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); - auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; + else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) + { + float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0); + auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix; return min(matrix, scalar); } - else if (aMatrix.isVector()) { - return ::vxmMin(aMatrix,aOther); -} + else if (aMatrix.isVector()) + { + return ::vxmMin(aMatrix, aOther); + } else if (aOther.isVector()) { - return ::vxmMin(aOther,aMatrix); + return ::vxmMin(aOther, aMatrix); } std::cerr - << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" - << std::endl; + << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; return CudaMatrix(); } - -CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){ +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue) +{ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); - auto lambda = [=] __host__ __device__ (const float& x){ - return fminf(x,aValue); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__(const float &x) { + return fminf(x, aValue); }; - thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), - data,lambda); - return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), - aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), + data, lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } -__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount) +__global__ void sumColKernel(float *aInputData, float *aOutput, int aColEleCount) { - //确定每个thread的index + // 确定每个thread的index unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; __shared__ double shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x]= (threadIdx.x< aColEleCount) ? aInputData[idx] : 0.0; + shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? aInputData[idx] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset0; i>>=1) { - if (threadIdx.x < i) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { shared_data[threadIdx.x] += (double)shared_data[i + threadIdx.x]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = (float)shared_data[0]; } } -__global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount) +__global__ void sumRowKernel(float *aInputData, float *aOutput, unsigned int aColEleCount, unsigned int aRowEleCount) { - //确定每个thread的基础index - unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; + // 确定每个thread的基础index + unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x]= (threadIdx.x< aRowEleCount) ? aInputData[idx] : 0.0; + shared_data[threadIdx.x] = (threadIdx.x < aRowEleCount) ? aInputData[idx] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { - if(threadIdx.x+offset < aRowEleCount){ - shared_data[threadIdx.x]+= aInputData[idx + offset*aColEleCount]; + for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) + { + if (threadIdx.x + offset < aRowEleCount) + { + shared_data[threadIdx.x] += aInputData[idx + offset * aColEleCount]; } __syncthreads(); } // 规约最前面一段 - for (int i = blockDim.x/2; i >0; i>>=1) { - if (threadIdx.x < i) { - shared_data[threadIdx.x] += shared_data[threadIdx.x+i]; + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { + shared_data[threadIdx.x] += shared_data[threadIdx.x + i]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0]; } } -__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize) +__global__ void sumZAllColKernel(float *aInputData, float *aOutput, int aTotalSize) { - //确定每个thread的index + // 确定每个thread的index unsigned int idx = blockIdx.x * 4096 + threadIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 - bool flag = threadIdx.x< 4096 && idx0; i>>=1) { - if (threadIdx.x < i) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; } @@ -565,34 +628,39 @@ __global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSi } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { aOutput[blockIdx.x] = shared_data[0][0]; - aOutput[blockIdx.x+gridDim.x] = shared_data[0][1]; + aOutput[blockIdx.x + gridDim.x] = shared_data[0][1]; } } -__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount) +__global__ void sumZColKernel(float *aInputData, float *aOutput, int aColEleCount) { - //确定每个thread的index + // 确定每个thread的index unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x][0]= (threadIdx.x< aColEleCount) ? aInputData[idx*2] : 0.0; - shared_data[threadIdx.x][1]= (threadIdx.x< aColEleCount) ? aInputData[idx*2+1] : 0.0; + shared_data[threadIdx.x][0] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2] : 0.0; + shared_data[threadIdx.x][1] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2 + 1] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset0; i>>=1) { - if (threadIdx.x < i) { + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; } @@ -600,319 +668,340 @@ __global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCoun } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { - aOutput[blockIdx.x*2] = shared_data[0][0]; - aOutput[blockIdx.x*2+1] = shared_data[0][1]; + if (threadIdx.x == 0) + { + aOutput[blockIdx.x * 2] = shared_data[0][0]; + aOutput[blockIdx.x * 2 + 1] = shared_data[0][1]; } } -__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount) +__global__ void sumZRowKernel(float *aInputData, float *aOutput, unsigned int aColEleCount, unsigned int aRowEleCount) { - //确定每个thread的基础index - unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; + // 确定每个thread的基础index + unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 - shared_data[threadIdx.x][0]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2] : 0.0; - shared_data[threadIdx.x][1]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2+1] : 0.0; + shared_data[threadIdx.x][0] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2] : 0.0; + shared_data[threadIdx.x][1] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2 + 1] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 - for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { - if(threadIdx.x+offset < aRowEleCount){ - shared_data[threadIdx.x][0]+= aInputData[idx*2 + offset*aColEleCount*2]; - shared_data[threadIdx.x][1]+= aInputData[idx*2 + offset*aColEleCount*2 + 1]; + for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) + { + if (threadIdx.x + offset < aRowEleCount) + { + shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * aColEleCount * 2]; + shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * aColEleCount * 2 + 1]; } __syncthreads(); } // 规约最前面一段 - for (int i = blockDim.x/2; i >0; i>>=1) { - if (threadIdx.x < i) { - shared_data[threadIdx.x][0] += shared_data[threadIdx.x+i][0]; - shared_data[threadIdx.x][1] += shared_data[threadIdx.x+i][1]; - + for (int i = blockDim.x / 2; i > 0; i >>= 1) + { + if (threadIdx.x < i) + { + shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0]; + shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 - if (threadIdx.x == 0) { - aOutput[blockIdx.x*2] = shared_data[0][0]; - aOutput[blockIdx.x*2+1] = shared_data[0][1]; - + if (threadIdx.x == 0) + { + aOutput[blockIdx.x * 2] = shared_data[0][0]; + aOutput[blockIdx.x * 2 + 1] = shared_data[0][1]; } } -CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){ - if (aMatrix.getDimSize(2)>1 ) { - std::cerr<< "sum() not support 3D data!" - << std::endl; +CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction) +{ + if (aMatrix.getDimSize(2) > 1) + { + std::cerr << "sum() not support 3D data!" + << std::endl; return CudaMatrix(); } - //针对向量行等于列 - if (direction == Column && aMatrix.getDimSize(0)==1){ + // 针对向量行等于列 + if (direction == Column && aMatrix.getDimSize(0) == 1) + { direction = Row; } if (!aMatrix.isComplex()) { switch (direction) { - case All: - { - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float)); - auto ret = CudaMatrix::fromRawData(data,1,1,1); - float result = thrust::reduce(thrust::device, aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),0.0000000f,thrust::plus()); - ret.setValue(0,result); - return ret; - } - case Row: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int rowCount = aMatrix.getDimSize(1); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); - sumRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); - return ret; - } - case Column: - default: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int colElementCount = aMatrix.getDimSize(0); - if (colElementCount == 1) return aMatrix; - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); - sumColKernel<<>>(matData,retData,colElementCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); - return ret; - } + case All: + { + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float)); + auto ret = CudaMatrix::fromRawData(data, 1, 1, 1); + float result = thrust::reduce(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), 0.0000000f, thrust::plus()); + ret.setValue(0, result); + return ret; + } + case Row: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0)); + sumRowKernel<<>>(matData, retData, aMatrix.getDimSize(0), rowCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1); + return ret; + } + case Column: + default: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int colElementCount = aMatrix.getDimSize(0); + if (colElementCount == 1) + return aMatrix; + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1)); + sumColKernel<<>>(matData, retData, colElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1)); + return ret; + } } } - else{ + else + { switch (direction) { - case All: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - //divide the whole array to some 4096 blocks, then caculate as columns sum - int fakeCol = (int)ceilf((float)aMatrix.getDataSize()/4096.0f); - cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol); - auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex); - sumZAllColKernel<<>>(matData,retData, aMatrix.getDataSize()); - cudaDeviceSynchronize(); - float* result_data = nullptr; - cudaMalloc((void**)&result_data, sizeof(float)*2); - auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex); - float result = thrust::reduce(thrust::device, ret.getData(),ret.getData()+ ret.getDataSize(),0,thrust::plus()); - ret2.setValue(0,result); - result = thrust::reduce(thrust::device, ret.getData()+ ret.getDataSize(),ret.getData()+ ret.getDataSize()*2,0,thrust::plus()); - ret2.setValue(1,result); - return ret2; - } - case Row: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int rowElementCount = aMatrix.getDimSize(1); - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2); - sumZRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowElementCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); - return ret; - } - case Column: - default: - { - float* matData = aMatrix.getData(); - float* retData = nullptr; - int colElementCount = aMatrix.getDimSize(0); - if (colElementCount == 1) return aMatrix; - cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2); - sumZColKernel<<>>(matData,retData,colElementCount); - cudaDeviceSynchronize(); - CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex); - return ret; - } + case All: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + // divide the whole array to some 4096 blocks, then caculate as columns sum + int fakeCol = (int)ceilf((float)aMatrix.getDataSize() / 4096.0f); + cudaMalloc((void **)&retData, sizeof(float) * 2 * fakeCol); + auto ret = CudaMatrix::fromRawData(retData, 1, fakeCol, 1, Complex); + sumZAllColKernel<<>>(matData, retData, aMatrix.getDataSize()); + cudaDeviceSynchronize(); + float *result_data = nullptr; + cudaMalloc((void **)&result_data, sizeof(float) * 2); + auto ret2 = CudaMatrix::fromRawData(result_data, 1, 1, 1, Complex); + float result = thrust::reduce(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), 0, thrust::plus()); + ret2.setValue(0, result); + result = thrust::reduce(thrust::device, ret.getData() + ret.getDataSize(), ret.getData() + ret.getDataSize() * 2, 0, thrust::plus()); + ret2.setValue(1, result); + return ret2; + } + case Row: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int rowElementCount = aMatrix.getDimSize(1); + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0) * 2); + sumZRowKernel<<>>(matData, retData, aMatrix.getDimSize(0), rowElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1); + return ret; + } + case Column: + default: + { + float *matData = aMatrix.getData(); + float *retData = nullptr; + int colElementCount = aMatrix.getDimSize(0); + if (colElementCount == 1) + return aMatrix; + cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1) * 2); + sumZColKernel<<>>(matData, retData, colElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1), 1, Complex); + return ret; + } } } } -CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){ - if (aMatrix.getDimSize(2)>1 ) { - std::cerr<< "sum() not support 3D data!" - << std::endl; +CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction) +{ + if (aMatrix.getDimSize(2) > 1) + { + std::cerr << "sum() not support 3D data!" + << std::endl; return CudaMatrix(); } - //针对向量行等于列 - if (direction == Column && aMatrix.getDimSize(0)==1){ + // 针对向量行等于列 + if (direction == Column && aMatrix.getDimSize(0) == 1) + { direction = Row; } if (!aMatrix.isComplex()) { switch (direction) { - case All: + case All: + { + auto ret = sum(aMatrix, All); + ret.setValue(0, ret.getValue(0) / ((float)aMatrix.getDataSize())); + return ret; + } + case Row: + { + auto ret = sum(aMatrix, Row); + float count = (float)aMatrix.getDimSize(1); + auto lambda = [=] __device__(const float &v) { - auto ret = sum(aMatrix,All); - ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize())); - return ret; - } - case Row: + return v / count; + }; + thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda); + return ret; + } + case Column: + default: + { + auto ret = sum(aMatrix, Column); + float count = (float)aMatrix.getDimSize(0); + auto lambda = [=] __device__(const float &v) { - auto ret = sum(aMatrix, Row); - float count = (float)aMatrix.getDimSize(1); - auto lambda = [=] __device__ (const float& v){ - return v/count; - }; - thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); - return ret; - } - case Column: - default: - { - auto ret = sum(aMatrix, Column); - float count = (float)aMatrix.getDimSize(0); - auto lambda = [=] __device__ (const float& v){ - return v/count; - }; - thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); - return ret; - } + return v / count; + }; + thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda); + return ret; + } } } - else{ - std::cerr<< "mean() not support complex data!" - << std::endl; + else + { + std::cerr << "mean() not support complex data!" + << std::endl; return CudaMatrix(); } } -CudaMatrix Aurora::std(const CudaMatrix &aMatrix){ - if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) { +CudaMatrix Aurora::std(const CudaMatrix &aMatrix) +{ + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { std::cerr - << (aMatrix.getDimSize(2) > 1 ? "std() not support 3D data!" : "std() not support complex value type!") - << std::endl; + << (aMatrix.getDimSize(2) > 1 ? "std() not support 3D data!" : "std() not support complex value type!") + << std::endl; return CudaMatrix(); } auto src = aMatrix.isComplex() ? Aurora::abs(aMatrix) : aMatrix.deepCopy(); int calc_size = src.getDimSize(0) == 1 ? src.getDimSize(1) : src.getDimSize(0); auto meanM = Aurora::mean(src); - return sqrt(Aurora::sum((src-meanM)^2.0)/((float)calc_size-1.0f)); + return sqrt(Aurora::sum((src - meanM) ^ 2.0) / ((float)calc_size - 1.0f)); } template -class RowElementIterator:public thrust::iterator_facade< - RowElementIterator, - ValueType, - thrust::device_system_tag, - thrust::random_access_traversal_tag, - ValueType& >{ - public: +class RowElementIterator : public thrust::iterator_facade< + RowElementIterator, + ValueType, + thrust::device_system_tag, + thrust::random_access_traversal_tag, + ValueType &> +{ +public: // 构造函数 __host__ __device__ - RowElementIterator(ValueType* ptr, int aColElementCount=1) : ptr_(ptr),col_elements_(aColElementCount) {} + RowElementIterator(ValueType *ptr, int aColElementCount = 1) : ptr_(ptr), col_elements_(aColElementCount) {} - __host__ __device__ - ValueType& dereference() const{ + __host__ __device__ ValueType &dereference() const + { return *ptr_; } // 实现递增操作符 - __host__ __device__ - void increment() { - ptr_ = ptr_+col_elements_; + __host__ __device__ void increment() + { + ptr_ = ptr_ + col_elements_; } // 实现递减操作符 - __host__ __device__ - void decrement() { + __host__ __device__ void decrement() + { ptr_ = ptr_ - col_elements_; } // 实现加法操作符 - __host__ __device__ - void advance(typename RowElementIterator::difference_type n) { - ptr_ += col_elements_*n; + __host__ __device__ void advance(typename RowElementIterator::difference_type n) + { + ptr_ += col_elements_ * n; } // 实现减法操作符 __host__ __device__ - typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const { - return (other.ptr_ - ptr_)/col_elements_; + typename RowElementIterator::difference_type + distance_to(const RowElementIterator &other) const + { + return (other.ptr_ - ptr_) / col_elements_; } // 实现比较操作符 - __host__ __device__ - bool equal(const RowElementIterator& other) const { + __host__ __device__ bool equal(const RowElementIterator &other) const + { return ptr_ == other.ptr_; } - private: - ValueType* ptr_; - int col_elements_; +private: + ValueType *ptr_; + int col_elements_; }; -CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction) +CudaMatrix Aurora::sort(const CudaMatrix &aMatrix, FunctionDirection direction) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { std::cerr - << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") - << std::endl; + << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") + << std::endl; return CudaMatrix(); } return sort(std::forward(aMatrix.deepCopy()), direction); } -CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction) +CudaMatrix Aurora::sort(CudaMatrix &&aMatrix, FunctionDirection direction) { - if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) + { std::cerr - << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") - << std::endl; - return CudaMatrix(); + << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") + << std::endl; + return CudaMatrix(); } - float * data = aMatrix.getData(); + float *data = aMatrix.getData(); int colElementCount = aMatrix.getDimSize(0); switch (direction) { - case Row: + case Row: + { + for (size_t i = 0; i < colElementCount; i++) { - for (size_t i = 0; i < colElementCount; i++) - { - thrust::sort(thrust::device, RowElementIterator(data+i,colElementCount), - RowElementIterator(data+aMatrix.getDataSize()+i,colElementCount)); - } - return aMatrix; + thrust::sort(thrust::device, RowElementIterator(data + i, colElementCount), + RowElementIterator(data + aMatrix.getDataSize() + i, colElementCount)); } - case Column: - { - int rowElementCount = aMatrix.getDimSize(1); - for (size_t i = 0; i < rowElementCount; i++) - { - thrust::sort(thrust::device, data+i*colElementCount, - data+(i+1)*colElementCount); - } - - return aMatrix; - } - default: - { - std::cerr - << "Unsupported direction for sort!" - << std::endl; - return CudaMatrix(); - } - + return aMatrix; + } + case Column: + { + int rowElementCount = aMatrix.getDimSize(1); + for (size_t i = 0; i < rowElementCount; i++) + { + thrust::sort(thrust::device, data + i * colElementCount, + data + (i + 1) * colElementCount); + } + + return aMatrix; + } + default: + { + std::cerr + << "Unsupported direction for sort!" + << std::endl; + return CudaMatrix(); + } } - } -__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize) +__global__ void immseKernel(float *aInputData1, float *aInputData2, float *aOutputData, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) @@ -923,15 +1012,15 @@ __global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutp float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) { - if (aImageA.getDims()!=2|| aImageB.getDims()!=2) + if (aImageA.getDims() != 2 || aImageB.getDims() != 2) { - std::cerr<<"Fail! cuda immse args must all 2d matrix!"; + std::cerr << "Fail! cuda immse args must all 2d matrix!"; return 0.0; } if (!aImageB.compareShape(aImageA)) { - std::cerr<<"Fail! cuda immse args must be same shape!"; + std::cerr << "Fail! cuda immse args must be same shape!"; return 0.0; } @@ -942,12 +1031,12 @@ float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) } unsigned int size = aImageA.getDataSize(); - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; immseKernel<<>>(aImageA.getData(), aImageB.getData(), data, size); cudaDeviceSynchronize(); - float result = thrust::reduce(thrust::device, data, data+size, 0.0, thrust::plus()) / size; + float result = thrust::reduce(thrust::device, data, data + size, 0.0, thrust::plus()) / size; cudaFree(data); return result; } @@ -955,20 +1044,18 @@ float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) struct compareMatrixByRows { compareMatrixByRows(unsigned int aSize) - : mSize(aSize) - { - }; + : mSize(aSize) { + }; unsigned int mSize; - __host__ __device__ - bool operator()(const float* aVector1, const float* aVector2) const + __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const { - for(unsigned int i=0; i aVector2[i]) + else if (aVector1[i] > aVector2[i]) { return false; } @@ -977,28 +1064,28 @@ struct compareMatrixByRows } }; -CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix) +CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix &indexMatrix) { CudaMatrix transposeMatrix = transpose(aMatrix); size_t rows = transposeMatrix.getDimSize(0); size_t columns = transposeMatrix.getDimSize(1); - thrust::device_vector vector(columns); - for(unsigned int i=0; i vector(columns); + for (unsigned int i = 0; i < columns; ++i) { - vector[i] = transposeMatrix.getData() + i*rows; + vector[i] = transposeMatrix.getData() + i * rows; } - thrust::device_vector vectorBack = vector; + thrust::device_vector vectorBack = vector; thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows)); - float* data = nullptr; - float* indexResult = new float[columns]; - cudaMalloc((void**)&data, sizeof(float) * rows * columns); - for(unsigned int i=0; i>>(deviceInputPinter, aMatrix.getData(), deviceOutputPointer, data); cudaDeviceSynchronize(); - int* devicePivotArray; - cudaMalloc((void**)&devicePivotArray, n * sizeof(int)); - int* deviceInfoArray; - cudaMalloc((void**)&deviceInfoArray, sizeof(int)); + int *devicePivotArray; + cudaMalloc((void **)&devicePivotArray, n * sizeof(int)); + int *deviceInfoArray; + cudaMalloc((void **)&deviceInfoArray, sizeof(int)); cublasSgetrfBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceInfoArray, 1); cublasSgetriBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1); cudaFree(devicePivotArray); @@ -1087,21 +1174,21 @@ CudaMatrix Aurora::inv(CudaMatrix &&aMatrix) unsigned int size = aMatrix.getDataSize(); cublasHandle_t handle; cublasCreate(&handle); - float* data; - cudaMalloc((void**)&data, sizeof(float) * size); + float *data; + cudaMalloc((void **)&data, sizeof(float) * size); - float** deviceInputPinter; - cudaMalloc((void**)&deviceInputPinter, sizeof(float *)); + float **deviceInputPinter; + cudaMalloc((void **)&deviceInputPinter, sizeof(float *)); float **deviceOutputPointer; - cudaMalloc((void**)&deviceOutputPointer, sizeof(float *)); + cudaMalloc((void **)&deviceOutputPointer, sizeof(float *)); invKernel<<<1, 1>>>(deviceInputPinter, aMatrix.getData(), deviceOutputPointer, data); cudaDeviceSynchronize(); - int* devicePivotArray; - cudaMalloc((void**)&devicePivotArray, n * sizeof(int)); - int* deviceInfoArray; - cudaMalloc((void**)&deviceInfoArray, sizeof(int)); + int *devicePivotArray; + cudaMalloc((void **)&devicePivotArray, n * sizeof(int)); + int *deviceInfoArray; + cudaMalloc((void **)&deviceInfoArray, sizeof(int)); cublasSgetrfBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceInfoArray, 1); cublasSgetriBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1); cudaFree(devicePivotArray); @@ -1113,155 +1200,154 @@ CudaMatrix Aurora::inv(CudaMatrix &&aMatrix) return CudaMatrix::fromRawData(data, n, n); } -__global__ void dotColumnKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputRowSize) +__global__ void dotColumnKernel(float *aInputData1, float *aInputData2, float *aOutputData, unsigned int aInputRowSize) { __shared__ float sharedValue[THREADS_PER_BLOCK]; sharedValue[threadIdx.x] = 0; - for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - sharedValue[threadIdx.x] += aInputData1[blockIdx.x*aInputRowSize + indexByRows] * aInputData2[blockIdx.x*aInputRowSize + indexByRows]; + sharedValue[threadIdx.x] += aInputData1[blockIdx.x * aInputRowSize + indexByRows] * aInputData2[blockIdx.x * aInputRowSize + indexByRows]; } - } + } __syncthreads(); - for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { - if(threadIdx.x < i) + if (threadIdx.x < i) { sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i]; } __syncthreads(); } - aOutputData[blockIdx.x] = sharedValue[0]; + aOutputData[blockIdx.x] = sharedValue[0]; } CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction) { - if ( direction != Aurora::Column) + if (direction != Aurora::Column) { - std::cerr<< "cuda dot() only support column data!"<< std::endl; + std::cerr << "cuda dot() only support column data!" << std::endl; return CudaMatrix(); } if (!aMatrix.compareShape(aOther)) { - std::cerr<< "cuda dot() matrix must be same shape!"<< std::endl; + std::cerr << "cuda dot() matrix must be same shape!" << std::endl; return CudaMatrix(); } - if(aMatrix.getValueType() == Aurora::Complex || aOther.getValueType() == Aurora::Complex) + if (aMatrix.getValueType() == Aurora::Complex || aOther.getValueType() == Aurora::Complex) { - std::cerr<< "cuda dot() do not support complex data!"<< std::endl; + std::cerr << "cuda dot() do not support complex data!" << std::endl; return CudaMatrix(); } unsigned int column = aMatrix.getDimSize(1); unsigned int row = aMatrix.getDimSize(0); - if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) + if (aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) { column = 1; row = aMatrix.getDataSize(); } - float* data = nullptr; - cudaMalloc((void**)&data, sizeof(float) * column); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * column); dotColumnKernel<<>>(aMatrix.getData(), aOther.getData(), data, row); cudaDeviceSynchronize(); return CudaMatrix::fromRawData(data, 1, column); } - __global__ void prodKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize) +__global__ void prodKernel(float *aInputData, float *aOutputData, unsigned int aInputRowSize) { __shared__ float sharedValue[THREADS_PER_BLOCK]; sharedValue[threadIdx.x] = 1; - for(unsigned int i=0; i<=aInputRowSize/blockDim.x; ++i) + for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - sharedValue[threadIdx.x] *= aInputData[blockIdx.x*aInputRowSize + indexByRows]; + sharedValue[threadIdx.x] *= aInputData[blockIdx.x * aInputRowSize + indexByRows]; } - } + } __syncthreads(); - for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { - if(threadIdx.x < i) + if (threadIdx.x < i) { - sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i]; + sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i]; } __syncthreads(); } - if(threadIdx.x == 0) + if (threadIdx.x == 0) { aOutputData[blockIdx.x] = sharedValue[0]; - } + } } -__global__ void prodComplexKernel(float* aInputData, float* aOutputData, unsigned int aInputRowSize) +__global__ void prodComplexKernel(float *aInputData, float *aOutputData, unsigned int aInputRowSize) { __shared__ float sharedValue[THREADS_PER_BLOCK * 2]; unsigned int complexIdx = threadIdx.x * 2; - + sharedValue[complexIdx] = 1; sharedValue[complexIdx + 1] = 0; - for(unsigned int i=0; i<=(aInputRowSize/blockDim.x); ++i) + for (unsigned int i = 0; i <= (aInputRowSize / blockDim.x); ++i) { - unsigned int indexByRows = i*blockDim.x + threadIdx.x; - if(indexByRows < aInputRowSize) + unsigned int indexByRows = i * blockDim.x + threadIdx.x; + if (indexByRows < aInputRowSize) { - unsigned int index = 2 * (blockIdx.x*aInputRowSize + indexByRows); + unsigned int index = 2 * (blockIdx.x * aInputRowSize + indexByRows); float real = sharedValue[complexIdx] * aInputData[index] - sharedValue[complexIdx + 1] * aInputData[index + 1]; float imag = sharedValue[complexIdx] * aInputData[index + 1] + sharedValue[complexIdx + 1] * aInputData[index]; - sharedValue[complexIdx] = real; + sharedValue[complexIdx] = real; sharedValue[complexIdx + 1] = imag; } - } + } __syncthreads(); - for(unsigned int i = blockDim.x/2; i>0; i >>= 1) + for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { - if(threadIdx.x < i) + if (threadIdx.x < i) { unsigned int index = 2 * (threadIdx.x + i); float real = sharedValue[complexIdx] * sharedValue[index] - sharedValue[complexIdx + 1] * sharedValue[index + 1]; float imag = sharedValue[complexIdx] * sharedValue[index + 1] + sharedValue[complexIdx + 1] * sharedValue[index]; sharedValue[complexIdx] = real; - sharedValue[complexIdx + 1] = imag; + sharedValue[complexIdx + 1] = imag; } __syncthreads(); } - if(threadIdx.x == 0) + if (threadIdx.x == 0) { - aOutputData[2 * blockIdx.x] = sharedValue[0]; + aOutputData[2 * blockIdx.x] = sharedValue[0]; aOutputData[2 * blockIdx.x + 1] = sharedValue[1]; } - } CudaMatrix Aurora::prod(const CudaMatrix &aMatrix) { - if (aMatrix.getDimSize(2) > 1 ) + if (aMatrix.getDimSize(2) > 1) { - std::cerr<< "cuda prod() not support 3D data!"<< std::endl; + std::cerr << "cuda prod() not support 3D data!" << std::endl; return CudaMatrix(); } unsigned int row = aMatrix.getDimSize(0); unsigned int column = aMatrix.getDimSize(1); - if(aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) + if (aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) { column = 1; row = aMatrix.getDataSize(); } - float* data = nullptr; + float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType()); - if(aMatrix.isComplex()) + if (aMatrix.isComplex()) { prodComplexKernel<<>>(aMatrix.getData(), data, row); } @@ -1274,13 +1360,14 @@ CudaMatrix Aurora::prod(const CudaMatrix &aMatrix) } /** - * @brief - * + * @brief * - * @param aMatrix + * + * @param aMatrix * @param aDirection 0 FORWARD, 1 BACKWARD */ -void ExecFFT(CudaMatrix& aMatrix,int aDirection){ +void ExecFFT(CudaMatrix &aMatrix, int aDirection) +{ cufftHandle plan; cudaStream_t stream = NULL; int batch_size = aMatrix.getDimSize(1); @@ -1291,144 +1378,161 @@ void ExecFFT(CudaMatrix& aMatrix,int aDirection){ cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size); cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); cufftSetStream(plan, stream); - cufftExecC2C(plan, d_data, d_data, aDirection==0?CUFFT_FORWARD:CUFFT_INVERSE); + cufftExecC2C(plan, d_data, d_data, aDirection == 0 ? CUFFT_FORWARD : CUFFT_INVERSE); cudaStreamSynchronize(stream); cufftDestroy(plan); cudaStreamDestroy(stream); } -__global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int aCopySize, - unsigned int aSrcColEleCount, unsigned int aDesColEleCount) +__global__ void complexFillKernel(float *aInputData, float *aOutput, unsigned int aCopySize, + unsigned int aSrcColEleCount, unsigned int aDesColEleCount) { unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; - for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) + for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x) { - if(threadIdx.x + offset< aCopySize){ + if (threadIdx.x + offset < aCopySize) + { aOutput[2 * idx_d + offset * 2] = aInputData[idx_s + offset]; aOutput[2 * idx_d + offset * 2 + 1] = 0; - } - else if(threadIdx.x + offset< aDesColEleCount){ + else if (threadIdx.x + offset < aDesColEleCount) + { aOutput[2 * idx_d + offset * 2] = 0; aOutput[2 * idx_d + offset * 2 + 1] = 0; } - else{ + else + { return; } } } -__global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int aCopySize, - unsigned int aSrcColEleCount, unsigned int aDesColEleCount) +__global__ void complexCopyKernel(float *aInputData, float *aOutput, unsigned int aCopySize, + unsigned int aSrcColEleCount, unsigned int aDesColEleCount) { unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; - for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) + for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x) { - if(threadIdx.x + offset< aCopySize){ - aOutput[2*idx_d + offset * 2 ] = aInputData[idx_s*2 + offset*2]; - aOutput[2*idx_d + offset*2+ 1] = aInputData[idx_s*2+ offset*2+1]; + if (threadIdx.x + offset < aCopySize) + { + aOutput[2 * idx_d + offset * 2] = aInputData[idx_s * 2 + offset * 2]; + aOutput[2 * idx_d + offset * 2 + 1] = aInputData[idx_s * 2 + offset * 2 + 1]; } - else if(threadIdx.x + offset< aDesColEleCount){ - aOutput[2*idx_d + offset*2] = 0; - aOutput[2*idx_d + offset*2+ 1] = 0; + else if (threadIdx.x + offset < aDesColEleCount) + { + aOutput[2 * idx_d + offset * 2] = 0; + aOutput[2 * idx_d + offset * 2 + 1] = 0; } - else{ + else + { return; } } } -CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){ - size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0); - //实际需要copy赋值的非0值 - size_t needCopySize = (ColEleCount 0) ? aFFTSize : aMatrix.getDimSize(0); + // 实际需要copy赋值的非0值 + size_t needCopySize = (ColEleCount < aMatrix.getDimSize(0)) ? ColEleCount : aMatrix.getDimSize(0); + size_t bufferSize = ColEleCount * aMatrix.getDimSize(1); + float *data = nullptr; - cudaMalloc((void**)&data, sizeof(float)*2*bufferSize); - if (aMatrix.isComplex()){ - complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + cudaMalloc((void **)&data, sizeof(float) * 2 * bufferSize); + if (aMatrix.isComplex()) + { + complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount); } - else{ - complexFillKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + else + { + complexFillKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount); } cudaDeviceSynchronize(); - auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); - ExecFFT(ret,0); + auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex); + ExecFFT(ret, 0); return ret; } -CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){ - if (!aMatrix.isComplex()){ - std::cerr<<"ifft input must be complex value"<0)?aFFTSize:aMatrix.getDimSize(0); - //实际需要copy赋值的非0值 - size_t needCopySize = (ColEleCount 0) ? aFFTSize : aMatrix.getDimSize(0); + // 实际需要copy赋值的非0值 + size_t needCopySize = (ColEleCount < aMatrix.getDimSize(0)) ? ColEleCount : aMatrix.getDimSize(0); + size_t bufferSize = ColEleCount * aMatrix.getDimSize(1); + float *data = nullptr; + cudaMalloc((void **)&data, sizeof(float) * 2 * bufferSize); - complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), ColEleCount); cudaDeviceSynchronize(); - auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); - ExecFFT(ret,1); - float colEleCountf = 1.f/ColEleCount; - auto lambda = [=]__device__(const float& v){ - return v*colEleCountf; - }; - thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize()*2,ret.getData(),lambda); + auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex); + ExecFFT(ret, 1); + float colEleCountf = 1.f / ColEleCount; + auto lambda = [=] __device__(const float &v) + { + return v * colEleCountf; + }; + thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda); return ret; } -__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){ - unsigned int idx = blockIdx.x *aColEleCount + threadIdx.x; - unsigned int stride = aColEleCount/2; - for (int i = 0; i < stride; i+=blockDim.x) +__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount) +{ + unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; + unsigned int stride = aColEleCount / 2; + for (int i = 0; i < stride; i += blockDim.x) { if (threadIdx.x + i < stride) { float temp = aData[idx]; - aData[idx] = aData[idx+stride]; - aData[idx+stride] = temp; - } + aData[idx] = aData[idx + stride]; + aData[idx + stride] = temp; + } } } -__global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned int aCopySize, - unsigned int aSrcColEleCount,unsigned int aDesColEleCount){ - unsigned int idx = blockIdx.x *aSrcColEleCount + threadIdx.x; - unsigned int idx2 = blockIdx.x *aDesColEleCount + threadIdx.x; - for (int i = 0; i < aCopySize; i+=blockDim.x) +__global__ void memcpyColKernel(float *aInputData, float *aOutputData, unsigned int aCopySize, + unsigned int aSrcColEleCount, unsigned int aDesColEleCount) +{ + unsigned int idx = blockIdx.x * aSrcColEleCount + threadIdx.x; + unsigned int idx2 = blockIdx.x * aDesColEleCount + threadIdx.x; + for (int i = 0; i < aCopySize; i += blockDim.x) { if (threadIdx.x + i < aCopySize) { - aOutputData[idx2+i] = aInputData[idx+i]; - } + aOutputData[idx2 + i] = aInputData[idx + i]; + } } } -void Aurora::fftshift(CudaMatrix &aMatrix){ - if (aMatrix.getDimSize(0) % 2 == 0) { +void Aurora::fftshift(CudaMatrix &aMatrix) +{ + if (aMatrix.getDimSize(0) % 2 == 0) + { fftshiftSwapKernel<<>>( aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); - cudaDeviceSynchronize(); - } else { + cudaDeviceSynchronize(); + } + else + { int copySize = aMatrix.getDimSize(0) / 2 + 1; float *data = nullptr; - cudaMalloc((void **)&data,copySize*sizeof(float)*aMatrix.getValueType()); + cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType()); memcpyColKernel<<>>( aMatrix.getData(), data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); cudaDeviceSynchronize(); memcpyColKernel<<>>( - aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(), + aMatrix.getData() + copySize * aMatrix.getValueType(), aMatrix.getData(), (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); @@ -1442,23 +1546,27 @@ void Aurora::fftshift(CudaMatrix &aMatrix){ } } -void Aurora::ifftshift(CudaMatrix &aMatrix){ - if (aMatrix.getDimSize(0) % 2 == 0) { +void Aurora::ifftshift(CudaMatrix &aMatrix) +{ + if (aMatrix.getDimSize(0) % 2 == 0) + { fftshiftSwapKernel<<>>( aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); - } else { + } + else + { int copySize = aMatrix.getDimSize(0) / 2 + 1; float *data = nullptr; - cudaMalloc((void **)&data,copySize*sizeof(float)*aMatrix.getValueType()); + cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType()); memcpyColKernel<<>>( - aMatrix.getData()+(copySize-1)* aMatrix.getValueType(), + aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); cudaDeviceSynchronize(); memcpyColKernel<<>>( - aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(), - (copySize-1) * aMatrix.getValueType(), + aMatrix.getData(), aMatrix.getData() + (copySize)*aMatrix.getValueType(), + (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); cudaDeviceSynchronize(); @@ -1471,21 +1579,21 @@ void Aurora::ifftshift(CudaMatrix &aMatrix){ } } -__global__ void sub2indKernel(float* aVMatrixSize, float** aindexMatrix, float* aOutputData, unsigned int aRowSize, unsigned int aColumnSize) +__global__ void sub2indKernel(float *aVMatrixSize, float **aindexMatrix, float *aOutputData, unsigned int aRowSize, unsigned int aColumnSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; - if(idx < aRowSize) + if (idx < aRowSize) { aOutputData[idx] = 0; - for(unsigned int i=aColumnSize; i>0; --i) + for (unsigned int i = aColumnSize; i > 0; --i) { unsigned int subSize = 1; - for(unsigned int j=0; jgetDataSize(); unsigned int indexMatrixColumns = aSliceIdxs.size(); - float** indexMatrixData = nullptr; - float** tempPointer = new float*[indexMatrixColumns]; - cudaMalloc((void **)&indexMatrixData, sizeof(float*) * indexMatrixColumns); - for(unsigned int i=0; i>>(aVMatrixSize.getData(), indexMatrixData, data, indexMatrixRows, indexMatrixColumns); @@ -1525,27 +1633,27 @@ CudaMatrix Aurora::sub2ind(const CudaMatrix &aVMatrixSize, std::vector>>(data, aLength / 2 - 1); cudaDeviceSynchronize(); - return real(ifft(CudaMatrix::fromRawData(data,aLength,1,1,Complex))); + return real(ifft(CudaMatrix::fromRawData(data, aLength, 1, 1, Complex))); } -CudaMatrix Aurora::hilbert(const CudaMatrix &aMatrix){ +CudaMatrix Aurora::hilbert(const CudaMatrix &aMatrix) +{ auto x = fft(aMatrix); - auto h = Aurora::zerosCuda(aMatrix.getDimSize(0),1,1); + auto h = Aurora::zerosCuda(aMatrix.getDimSize(0), 1, 1); thrust::fill_n(thrust::device, h.getData(), aMatrix.getDimSize(0) / 2, 2.0f); h.setValue(aMatrix.getDimSize(0) / 2, ((aMatrix.getDimSize(0) << 31) >> 31) ? 2.0 : 1.0); h.setValue(0, 1.0f); - x = x*h; - auto result = ifft( x); + x = x * h; + auto result = ifft(x); return result; } \ No newline at end of file