diff --git a/src/AuroraDefs.h b/src/AuroraDefs.h index 53fa6a1..c0affee 100644 --- a/src/AuroraDefs.h +++ b/src/AuroraDefs.h @@ -8,5 +8,13 @@ #include "mkl.h" #define PI 3.141592653589793238462 - +namespace Aurora +{ + enum FunctionDirection + { + Column, + Row, + All + }; +} #endif //AURORA_AURORADEFS_H diff --git a/src/Function2D.cpp b/src/Function2D.cpp index 0f91796..8bbfd93 100644 --- a/src/Function2D.cpp +++ b/src/Function2D.cpp @@ -204,6 +204,34 @@ Matrix Aurora::min(const Matrix &aMatrix, FunctionDirection direction) { return min(aMatrix,direction,a,b); } +Matrix vxmMin(const Matrix &aVec, const Matrix &aMat) +{ + //只有一列 对Other逐列求最小值 + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { + float* output = Aurora::malloc(aMat.getDataSize()); + for (int i = 0; i < aMat.getDimSize(1); ++i) { + vsFminI(aVec.getDataSize(), aVec.getData(), 1, aMat.getData() + aMat.getDimSize(0) * i, 1, + output + aMat.getDimSize(0) * i, 1); + } + // std::cout<<"min col-vec aMatrix and mat other"<1 || aMatrix.isComplex()) { std::cerr @@ -231,30 +259,22 @@ Matrix Aurora::min(const Matrix &aMatrix, const Matrix &aOther) { vsFminI(matrix.getDataSize(),matrix.getData(),1,&scalar,0,output,1); return Matrix::New(output,matrix); } - else if (aMatrix.getDimSize(1) == 1 || aOther.getDimSize(0) == 1) { - if (aMatrix.getDimSize(1) == 1){ - float* output = malloc(aOther.getDataSize()); - for (int i = 0; i < aOther.getDimSize(1); ++i) { - vsFminI(aMatrix.getDataSize(), aMatrix.getData(), 1, aOther.getData() + aOther.getDimSize(0) * i, 1, - output + aOther.getDimSize(0) * i, 1); - } - return Matrix::New(output,aOther); - } - else{ - float* output = malloc(aMatrix.getDataSize()); - for (int i = 0; i < aMatrix.getDimSize(0); ++i) { - vsFminI(aOther.getDataSize(), aOther.getData(), 1, aMatrix.getData() + i, aMatrix.getDimSize(0), - output + i, aOther.getDimSize(0)); - } - return Matrix::New(output,aMatrix); - } + else if (aMatrix.isVector()) { + return ::vxmMin(aMatrix,aOther); } - else{ - std::cerr - << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" - << std::endl; - return Matrix(); + else if (aOther.isVector()) { + return ::vxmMin(aOther,aMatrix); } + std::cerr + << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; + return Matrix(); +} + +Matrix Aurora::min(const Matrix &aMatrix, const float aValue){ + float *output = malloc(1); + output[0] = aValue; + return min(aMatrix,Matrix::New(output, 1,1,1)); } Matrix Aurora::max(const Matrix &aMatrix, FunctionDirection direction) { @@ -314,6 +334,34 @@ Matrix Aurora::max(const Matrix &aMatrix, FunctionDirection direction, long& row } } +Matrix vxmMax(const Matrix &aVec, const Matrix &aMat) +{ + //只有一列 对Other逐列求最小值 + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { + float* output = Aurora::malloc(aMat.getDataSize()); + for (int i = 0; i < aMat.getDimSize(1); ++i) { + vsFmaxI(aVec.getDataSize(), aVec.getData(), 1, aMat.getData() + aMat.getDimSize(0) * i, 1, + output + aMat.getDimSize(0) * i, 1); + } + // std::cout<<"max col-vec aMatrix and mat other"<1 || aMatrix.isComplex()) { std::cerr @@ -341,30 +389,16 @@ Matrix Aurora::max(const Matrix &aMatrix, const Matrix &aOther) { vsFmaxI(matrix.getDataSize(),matrix.getData(),1,&scalar,0,output,1); return Matrix::New(output,matrix); } - else if (aMatrix.getDimSize(1) == 1 || aOther.getDimSize(0) == 1) { - if (aMatrix.getDimSize(1) == 1){ - float* output = malloc(aOther.getDataSize()); - for (int i = 0; i < aOther.getDimSize(1); ++i) { - vsFmaxI(aMatrix.getDataSize(), aMatrix.getData(), 1, aOther.getData() + aOther.getDimSize(0) * i, 1, - output + aOther.getDimSize(0) * i, 1); - } - return Matrix::New(output,aOther); - } - else{ - float* output = malloc(aMatrix.getDataSize()); - for (int i = 0; i < aMatrix.getDimSize(0); ++i) { - vsFmaxI(aOther.getDataSize(), aOther.getData(), 1, aMatrix.getData() + i, aMatrix.getDimSize(0), - output + i, aOther.getDimSize(0)); - } - return Matrix::New(output,aMatrix); - } + else if (aMatrix.isVector()) { + return ::vxmMax(aMatrix,aOther); } - else{ - std::cerr - << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + else if (aOther.isVector()) { + return ::vxmMax(aOther,aMatrix); + } + std::cerr + << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl; return Matrix(); - } } Matrix Aurora::max(const Matrix &aMatrix, const float aValue){ diff --git a/src/Function2D.cu b/src/Function2D.cu new file mode 100644 index 0000000..fde7a60 --- /dev/null +++ b/src/Function2D.cu @@ -0,0 +1,478 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace Aurora; + + +__global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize) +{ + //确定每个thread的index + unsigned int idx = blockIdx.x * aColSize + threadIdx.x; + __shared__ float shared_data[256]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : -FLT_MAX; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset0; offset>>=1) { + int idx2 = offset + threadIdx.x; + if (idx2 < blockDim.x) { + shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]); + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0]; + } +} + +__global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) +{ + //确定每个thread的基础index + unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; + __shared__ float shared_data[512]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { + if(threadIdx.x+offset < aRowSize){ + shared_data[threadIdx.x]= fmaxf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); + } + __syncthreads(); + } + // 规约最前面一段 + for (int offset = blockDim.x/2; offset >0; offset>>=1) { + int idx2 = offset + threadIdx.x; + if (idx2 < blockDim.x) { + shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]); + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0]; + } +} + +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) { + long a,b; + return max(aMatrix,direction,a,b); +} + +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) +{ + if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + std::cerr<< (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + //针对向量行等于列 + if (aMatrix.isVector()){ + direction = All; + } + switch (direction) + { + case All: { + thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); + auto max_iter = thrust::max_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); + int index = max_iter-d_ptr; + rowIdx = index%aMatrix.getDimSize(0); + colIdx = index/aMatrix.getDimSize(0); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float)); + auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); + ret.setValue(0, *max_iter); + return ret; + } + case Row: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); + if (rowCount<512){ + maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){ + maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + else{ + maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); + return ret; + } + case Column: + default: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int colCount = aMatrix.getDimSize(0); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); + maxColKernel<<>>(matData,retData,colCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); + return ret; + } + } +} + + +CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) { + //col-vec x mat + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { + std::cout<<"max mat and col-vec "<1 || aMatrix.isComplex()) { + std::cerr + << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + if (aOther.getDimSize(2)>1 || aOther.isComplex()) { + std::cerr + << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + //same shape + if (aMatrix.compareShape(aOther)){ + size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__ (const float& x, const float& y){ + return fmaxf(x,y); + }; + thrust::transform(thrust::device,aMatrix.getData(), + aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), + data,lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + } + // one is scalar + else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ + float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); + auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; + return max(matrix, scalar); + } + else if (aMatrix.isVector()) { + return ::vxmMax(aMatrix,aOther); +} + else if (aOther.isVector()) + { + return ::vxmMax(aOther,aMatrix); + } + std::cerr + << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; + return CudaMatrix(); +} + +CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue){ + size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__ (const float& x){ + return fmaxf(x,aValue); + }; + thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), + data,lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} + +__global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aColSize) +{ + //确定每个thread的index + unsigned int idx = blockIdx.x * aColSize + threadIdx.x; + __shared__ float shared_data[256]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : FLT_MAX; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset0; offset>>=1) { + int idx2 = offset + threadIdx.x; + if (idx2 < blockDim.x) { + shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]); + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0]; + } +} + +__global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) +{ + //确定每个thread的基础index + unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; + __shared__ float shared_data[512]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { + if(threadIdx.x+offset < aRowSize){ + shared_data[threadIdx.x]= fminf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); + } + __syncthreads(); + } + // 规约最前面一段 + for (int offset = blockDim.x/2; offset >0; offset>>=1) { + int idx2 = offset + threadIdx.x; + if (idx2 < blockDim.x) { + shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]); + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0]; + } +} + +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) { + long a,b; + return min(aMatrix,direction,a,b); +} + +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) +{ + if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + std::cerr<< (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + //针对向量行等于列 + if (aMatrix.isVector()){ + direction = All; + } + switch (direction) + { + case All: { + thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); + auto max_iter = thrust::min_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); + int index = max_iter-d_ptr; + rowIdx = index%aMatrix.getDimSize(0); + colIdx = index/aMatrix.getDimSize(0); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float)); + auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); + ret.setValue(0, *max_iter); + return ret; + } + case Row: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); + if (rowCount<512){ + minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){ + minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + else{ + minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + } + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); + return ret; + } + case Column: + default: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int colCount = aMatrix.getDimSize(0); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); + minColKernel<<>>(matData,retData,colCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); + return ret; + } + } +} + +CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) { + //col-vec x mat + if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { + std::cout<<"min mat and col-vec "<1 || aMatrix.isComplex()) { + std::cerr + << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + if (aOther.getDimSize(2)>1 || aOther.isComplex()) { + std::cerr + << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + //same shape + if (aMatrix.compareShape(aOther)){ + size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__ (const float& x, const float& y){ + return fminf(x,y); + }; + thrust::transform(thrust::device,aMatrix.getData(), + aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), + data,lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); + } + // one is scalar + else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ + float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); + auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; + return min(matrix, scalar); + } + else if (aMatrix.isVector()) { + return ::vxmMin(aMatrix,aOther); +} + else if (aOther.isVector()) + { + return ::vxmMin(aOther,aMatrix); + } + std::cerr + << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" + << std::endl; + return CudaMatrix(); +} + + +CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){ + size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size); + auto lambda = [=] __host__ __device__ (const float& x){ + return fminf(x,aValue); + }; + thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), + data,lambda); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), + aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} \ No newline at end of file diff --git a/src/Function2D.cuh b/src/Function2D.cuh new file mode 100644 index 0000000..55befa9 --- /dev/null +++ b/src/Function2D.cuh @@ -0,0 +1,19 @@ +#ifndef __FUNCTION2D_CUDA__ +#define __FUNCTION2D_CUDA__ +#include "CudaMatrix.h" +#include "AuroraDefs.h" + +namespace Aurora +{ + CudaMatrix max(const CudaMatrix &aMatrix, FunctionDirection direction = Column); + CudaMatrix max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx); + CudaMatrix max(const CudaMatrix &aMatrix, const float aValue); + CudaMatrix max(const CudaMatrix &aMatrix, const CudaMatrix &aOther); + + CudaMatrix min(const CudaMatrix &aMatrix, FunctionDirection direction = Column); + CudaMatrix min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx); + CudaMatrix min(const CudaMatrix &aMatrix, const float aValue); + CudaMatrix min(const CudaMatrix &aMatrix, const CudaMatrix &aOther); +} + +#endif // __FUNCTION2D_CUDA_H__ \ No newline at end of file diff --git a/src/Function2D.h b/src/Function2D.h index f056a3b..39a4a3d 100644 --- a/src/Function2D.h +++ b/src/Function2D.h @@ -6,15 +6,11 @@ #include "Matrix.h" #include "Function1D.h" +#include "AuroraDefs.h" namespace Aurora { - enum FunctionDirection - { - Column, - Row, - All - }; + float immse(const Matrix &aImageA, const Matrix &aImageB); Matrix inv(const Matrix &aMatrix); Matrix inv(Matrix &&aMatrix); @@ -51,6 +47,8 @@ namespace Aurora */ Matrix min(const Matrix &aMatrix, const Matrix &aOther); + Matrix min(const Matrix &aMatrix, const float aValue); + /** * 比较两个矩阵,求对应位置的最大值,不支持三维 * @attention 矩阵形状不一样时,如A为[MxN],则B应为标量或[1xN]的行向量 diff --git a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp new file mode 100644 index 0000000..2750128 --- /dev/null +++ b/test/Function2D_Cuda_Test.cpp @@ -0,0 +1,461 @@ +#include +#include +#include "CudaMatrix.h" +#include "Function.h" +#include "Matrix.h" +#include "TestUtility.h" + +#include "Function2D.h" +#include "Function2D.cuh" + +class Function2D_Cuda_Test:public ::testing::Test +{ +protected: + static void SetUpFunction2DCudaTester(){ + + } + static void TearDownTestCase(){ + } + public: + Aurora::Matrix B; + Aurora::CudaMatrix dB; + + void SetUp(){ + + } + void TearDown(){ + } + +}; + +TEST_F(Function2D_Cuda_Test, min) +{ + { + float *dataB = Aurora::random(4096*41472); + B = Aurora::Matrix::fromRawData(dataB, 4096, 41472); + dB = B.toDeviceMatrix(); + long r,c; + auto start_time_ = std::chrono::high_resolution_clock::now(); + auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c); + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::min(dB, Aurora::FunctionDirection::Row,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, 500.5f); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::max(dB, Aurora::FunctionDirection::Row,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, 500.5f); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); + std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; + start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, dA); + end_time = std::chrono::high_resolution_clock::now(); + duration = std::chrono::duration_cast(end_time - start_time_); + std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<