diff --git a/CMakeLists.txt b/CMakeLists.txt index 93cdf13..f23d332 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,7 +46,7 @@ if (Aurora_USE_CUDA) target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include) set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_options(Aurora PRIVATE $<$: - -arch=sm_75 --expt-extended-lambda + -arch=sm_75 --expt-extended-lambda >) target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY}) @@ -74,7 +74,7 @@ if (Aurora_USE_CUDA) target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include) set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON) target_compile_options(Aurora_Test PRIVATE $<$: - -arch=sm_75 --expt-extended-lambda + -arch=sm_75 --expt-extended-lambda -Icub/ >) target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) target_link_libraries(Aurora_Test PRIVATE ${CUDA_cublas_LIBRARY}) diff --git a/src/Function2D.cu b/src/Function2D.cu index fde7a60..1ce8904 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -1,3 +1,7 @@ +#include "AuroraDefs.h" +#include "CudaMatrix.h" +#include "Function1D.h" +#include "Matrix.h" #include #include #include @@ -10,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -33,10 +38,10 @@ __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aCo __syncthreads(); } // 规约最前面一段 - for (int offset = blockDim.x/2; offset >0; offset>>=1) { - int idx2 = offset + threadIdx.x; - if (idx2 < blockDim.x) { - shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]); + for (int i = blockDim.x/2; i >0; i>>=1) { + + if (threadIdx.x < i) { + shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } @@ -51,7 +56,7 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; - __shared__ float shared_data[512]; + __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); @@ -63,10 +68,10 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol __syncthreads(); } // 规约最前面一段 - for (int offset = blockDim.x/2; offset >0; offset>>=1) { - int idx2 = offset + threadIdx.x; - if (idx2 < blockDim.x) { - shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]); + for (int i = blockDim.x/2; i >0; i>>=1) { + + if (threadIdx.x < i) { + shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]); } __syncthreads(); } @@ -113,15 +118,7 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, l float* retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); - if (rowCount<512){ - maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } - else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){ - maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } - else{ - maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } + maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; @@ -263,10 +260,9 @@ __global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aCo __syncthreads(); } // 规约最前面一段 - for (int offset = blockDim.x/2; offset >0; offset>>=1) { - int idx2 = offset + threadIdx.x; - if (idx2 < blockDim.x) { - shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]); + for (int i = blockDim.x/2; i >0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x] += fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]); } __syncthreads(); } @@ -281,7 +277,7 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; - __shared__ float shared_data[512]; + __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); @@ -293,10 +289,9 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol __syncthreads(); } // 规约最前面一段 - for (int offset = blockDim.x/2; offset >0; offset>>=1) { - int idx2 = offset + threadIdx.x; - if (idx2 < blockDim.x) { - shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]); + for (int i = blockDim.x/2; i >0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x] += fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]); } __syncthreads(); } @@ -343,15 +338,7 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, l float* retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); - if (rowCount<512){ - minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } - else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){ - minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } - else{ - minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); - } + minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; @@ -475,4 +462,420 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){ data,lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} + +__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount) +{ + //确定每个thread的index + unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; + __shared__ double shared_data[256]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x]= (threadIdx.x< aColEleCount) ? aInputData[idx] : 0.0; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x] += (double)shared_data[i + threadIdx.x]; + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = (float)shared_data[0]; + } +} + +__global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount) +{ + //确定每个thread的基础index + unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; + __shared__ float shared_data[256]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x]= (threadIdx.x< aRowEleCount) ? aInputData[idx] : 0.0; + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { + if(threadIdx.x+offset < aRowEleCount){ + shared_data[threadIdx.x]+= aInputData[idx + offset*aColEleCount]; + } + __syncthreads(); + } + // 规约最前面一段 + for (int i = blockDim.x/2; i >0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x] += shared_data[threadIdx.x+i]; + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0]; + } +} + +__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize) +{ + //确定每个thread的index + unsigned int idx = blockIdx.x * 4096 + threadIdx.x; + __shared__ float shared_data[256][2]; + // 每个线程加载一个元素到共享内存 + bool flag = threadIdx.x< 4096 && idx0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; + shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x] = shared_data[0][0]; + aOutput[blockIdx.x+gridDim.x] = shared_data[0][1]; + } +} + +__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount) +{ + //确定每个thread的index + unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; + __shared__ float shared_data[256][2]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x][0]= (threadIdx.x< aColEleCount) ? aInputData[idx*2] : 0.0; + shared_data[threadIdx.x][1]= (threadIdx.x< aColEleCount) ? aInputData[idx*2+1] : 0.0; + + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; + shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x*2] = shared_data[0][0]; + aOutput[blockIdx.x*2+1] = shared_data[0][1]; + } +} + +__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount) +{ + //确定每个thread的基础index + unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; + __shared__ float shared_data[256][2]; + // 每个线程加载一个元素到共享内存 + shared_data[threadIdx.x][0]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2] : 0.0; + shared_data[threadIdx.x][1]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2+1] : 0.0; + + __syncthreads(); + // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 + for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { + if(threadIdx.x+offset < aRowEleCount){ + shared_data[threadIdx.x][0]+= aInputData[idx*2 + offset*aColEleCount*2]; + shared_data[threadIdx.x][1]+= aInputData[idx*2 + offset*aColEleCount*2 + 1]; + } + __syncthreads(); + } + // 规约最前面一段 + for (int i = blockDim.x/2; i >0; i>>=1) { + if (threadIdx.x < i) { + shared_data[threadIdx.x][0] += shared_data[threadIdx.x+i][0]; + shared_data[threadIdx.x][1] += shared_data[threadIdx.x+i][1]; + + } + __syncthreads(); + } + + // 第一个线程存储每个分段的最大值到全局内存 + if (threadIdx.x == 0) { + aOutput[blockIdx.x*2] = shared_data[0][0]; + aOutput[blockIdx.x*2+1] = shared_data[0][1]; + + } +} + +CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){ + if (aMatrix.getDimSize(2)>1 ) { + std::cerr<< "sum() not support 3D data!" + << std::endl; + return CudaMatrix(); + } + //针对向量行等于列 + if (direction == Column && aMatrix.getDimSize(0)==1){ + direction = Row; + } + if (!aMatrix.isComplex()) + { + switch (direction) + { + case All: + { + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float)); + auto ret = CudaMatrix::fromRawData(data,1,1,1); + float result = thrust::reduce(thrust::device, aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),0.0000000f,thrust::plus()); + ret.setValue(0,result); + return ret; + } + case Row: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int rowCount = aMatrix.getDimSize(1); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); + sumRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); + return ret; + } + case Column: + default: + { + std::cout<<"Column sum"<>>(matData,retData,colElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); + return ret; + } + } + } + else{ + switch (direction) + { + case All: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + //divide the whole array to some 4096 blocks, then caculate as columns sum + int fakeCol = (int)ceilf((float)aMatrix.getDataSize()/4096.0f); + cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol); + auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex); + sumZAllColKernel<<>>(matData,retData, aMatrix.getDataSize()); + float* result_data = nullptr; + cudaMalloc((void**)&result_data, sizeof(float)*2); + auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex); + float result = thrust::reduce(thrust::device, ret.getData(),ret.getData()+ ret.getDataSize(),0,thrust::plus()); + ret2.setValue(0,result); + result = thrust::reduce(thrust::device, ret.getData()+ ret.getDataSize(),ret.getData()+ ret.getDataSize()*2,0,thrust::plus()); + ret2.setValue(1,result); + return ret2; + } + case Row: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int rowElementCount = aMatrix.getDimSize(1); + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2); + sumZRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); + return ret; + } + case Column: + default: + { + float* matData = aMatrix.getData(); + float* retData = nullptr; + int colElementCount = aMatrix.getDimSize(0); + if (colElementCount == 1) return aMatrix; + cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2); + sumZColKernel<<>>(matData,retData,colElementCount); + cudaDeviceSynchronize(); + CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex); + return ret; + } + } + } +} + +CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){ + if (aMatrix.getDimSize(2)>1 ) { + std::cerr<< "sum() not support 3D data!" + << std::endl; + return CudaMatrix(); + } + //针对向量行等于列 + if (direction == Column && aMatrix.getDimSize(0)==1){ + direction = Row; + } + if (!aMatrix.isComplex()) + { + switch (direction) + { + case All: + { + auto ret = sum(aMatrix,All); + ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize())); + return ret; + } + case Row: + { + auto ret = sum(aMatrix, Row); + float count = (float)aMatrix.getDimSize(1); + auto lambda = [=] __device__ (const float& v){ + return v/count; + }; + thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); + return ret; + } + case Column: + default: + { + auto ret = sum(aMatrix, Column); + float count = (float)aMatrix.getDimSize(0); + auto lambda = [=] __device__ (const float& v){ + return v/count; + }; + thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); + return ret; + } + } + } + else{ + std::cerr<< "mean() not support complex data!" + << std::endl; + return CudaMatrix(); + } +} +template +class RowElementIterator:public thrust::iterator_facade< + RowElementIterator, + ValueType, + thrust::device_system_tag, + thrust::random_access_traversal_tag, + ValueType& >{ + public: + // 构造函数 + __host__ __device__ + RowElementIterator(ValueType* ptr, int aColElementCount=1) : ptr_(ptr),col_elements_(aColElementCount) {} + + __host__ __device__ + ValueType& dereference() const{ + return *ptr_; + } + + // 实现递增操作符 + __host__ __device__ + void increment() { + ptr_ = ptr_+col_elements_; + } + + // 实现递减操作符 + __host__ __device__ + void decrement() { + ptr_ = ptr_ - col_elements_; + } + + // 实现加法操作符 + __host__ __device__ + void advance(typename RowElementIterator::difference_type n) { + ptr_ += col_elements_*n; + } + + // 实现减法操作符 + __host__ __device__ + typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const { + return (other.ptr_ - ptr_)/col_elements_; + } + + // 实现比较操作符 + __host__ __device__ + bool equal(const RowElementIterator& other) const { + return ptr_ == other.ptr_; + } + + private: + ValueType* ptr_; + int col_elements_; +}; + +CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction) +{ + if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + std::cerr + << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + return sort(std::forward(aMatrix.deepCopy()), direction); +} + +CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction) +{ + if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { + std::cerr + << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") + << std::endl; + return CudaMatrix(); + } + float * data = aMatrix.getData(); + int colElementCount = aMatrix.getDimSize(0); + switch (direction) + { + case Row: + { + for (size_t i = 0; i < colElementCount; i++) + { + thrust::sort(thrust::device, RowElementIterator(data+i,colElementCount), + RowElementIterator(data+aMatrix.getDataSize()+i,colElementCount)); + } + return aMatrix; + } + case Column: + { + int rowElementCount = aMatrix.getDimSize(1); + // softKernel<<>>(data,colElementCount); + return aMatrix; + } + default: + { + std::cerr + << "Unsupported direction for sort!" + << std::endl; + return CudaMatrix(); + } + + } + } \ No newline at end of file diff --git a/src/Function2D.cuh b/src/Function2D.cuh index 55befa9..5de8a64 100644 --- a/src/Function2D.cuh +++ b/src/Function2D.cuh @@ -14,6 +14,20 @@ namespace Aurora CudaMatrix min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx); CudaMatrix min(const CudaMatrix &aMatrix, const float aValue); CudaMatrix min(const CudaMatrix &aMatrix, const CudaMatrix &aOther); + + CudaMatrix sum(const CudaMatrix &aMatrix, FunctionDirection direction = Column); + + /** + * @brief 平均值,注意不支持复数 + * + * @param aMatrix 需要处理的矩阵 + * @param direction 方向 + * @return CudaMatrix + */ + CudaMatrix mean(const CudaMatrix &aMatrix, FunctionDirection direction = Column); + CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column); + CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column); + } #endif // __FUNCTION2D_CUDA_H__ \ No newline at end of file diff --git a/test/Function2D_Cuda_Test.cpp b/test/Function2D_Cuda_Test.cpp index 2750128..e687b76 100644 --- a/test/Function2D_Cuda_Test.cpp +++ b/test/Function2D_Cuda_Test.cpp @@ -1,5 +1,6 @@ #include #include +#include "AuroraDefs.h" #include "CudaMatrix.h" #include "Function.h" #include "Matrix.h" @@ -30,21 +31,19 @@ protected: TEST_F(Function2D_Cuda_Test, min) { + // big data for test + // Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, + // Aurora::FunctionDirection direction, long &rowIdx, long &colIdx) { float *dataB = Aurora::random(4096*41472); B = Aurora::Matrix::fromRawData(dataB, 4096, 41472); dB = B.toDeviceMatrix(); long r,c; - auto start_time_ = std::chrono::high_resolution_clock::now(); + auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -54,16 +53,11 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::min(dB, Aurora::FunctionDirection::Row,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -73,21 +67,20 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -98,16 +91,11 @@ TEST_F(Function2D_Cuda_Test, min) } B.forceReshape( 111,3157, 1); dB = B.toDeviceMatrix(); - start_time_ = std::chrono::high_resolution_clock::now(); + ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -117,6 +105,8 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + + auto ret2 = Aurora::min(dB, 500.5f); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -142,6 +128,10 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -171,6 +155,10 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -199,11 +181,9 @@ TEST_F(Function2D_Cuda_Test, min) { ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -213,6 +193,10 @@ TEST_F(Function2D_Cuda_Test, min) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::min(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -246,21 +225,18 @@ TEST_F(Function2D_Cuda_Test, min) TEST_F(Function2D_Cuda_Test, max) { + // big data for test + // Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, + // Aurora::FunctionDirection direction, long &rowIdx, long &colIdx) { float *dataB = Aurora::random(4096*41472); B = Aurora::Matrix::fromRawData(dataB, 4096, 41472); dB = B.toDeviceMatrix(); long r,c; - auto start_time_ = std::chrono::high_resolution_clock::now(); auto ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c); - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -270,16 +246,10 @@ TEST_F(Function2D_Cuda_Test, max) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::max(dB, Aurora::FunctionDirection::Row,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -289,21 +259,20 @@ TEST_F(Function2D_Cuda_Test, max) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -314,16 +283,11 @@ TEST_F(Function2D_Cuda_Test, max) } B.forceReshape( 111,3157, 1); dB = B.toDeviceMatrix(); - start_time_ = std::chrono::high_resolution_clock::now(); + ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -333,22 +297,18 @@ TEST_F(Function2D_Cuda_Test, max) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, 500.5f); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -358,6 +318,10 @@ TEST_F(Function2D_Cuda_Test, max) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + auto ret2 = Aurora::max(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -387,26 +346,23 @@ TEST_F(Function2D_Cuda_Test, max) ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + + // mat x vec auto ret2 = Aurora::max(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); @@ -415,20 +371,20 @@ TEST_F(Function2D_Cuda_Test, max) { ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); - for (size_t i = 0; i < ret1.getDataSize(); i++) { ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<(end_time - start_time_); - std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl; - start_time_ = std::chrono::high_resolution_clock::now(); + // mat x vec auto ret2 = Aurora::max(dB, dA); - end_time = std::chrono::high_resolution_clock::now(); - duration = std::chrono::duration_cast(end_time - start_time_); - std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl; + + ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0)); + ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1)); + ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2)); + + for (size_t i = 0; i < ret1.getDataSize(); i++) + { + ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<