Add sum, mean, sort to function2D

This commit is contained in:
kradchen
2023-12-07 14:03:17 +08:00
parent 6f32a05ee0
commit cdf4777dbe
4 changed files with 744 additions and 172 deletions

View File

@@ -74,7 +74,7 @@ if (Aurora_USE_CUDA)
target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include)
set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_compile_options(Aurora_Test PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-arch=sm_75 --expt-extended-lambda
-arch=sm_75 --expt-extended-lambda -Icub/
>)
target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
target_link_libraries(Aurora_Test PRIVATE ${CUDA_cublas_LIBRARY})

View File

@@ -1,3 +1,7 @@
#include "AuroraDefs.h"
#include "CudaMatrix.h"
#include "Function1D.h"
#include "Matrix.h"
#include <Function2D.cuh>
#include <cfloat>
#include <cstddef>
@@ -10,6 +14,7 @@
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/iterator_facade.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/complex.h>
@@ -33,10 +38,10 @@ __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aCo
__syncthreads();
}
// 规约最前面一段
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
int idx2 = offset + threadIdx.x;
if (idx2 < blockDim.x) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]);
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
}
__syncthreads();
}
@@ -51,7 +56,7 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol
{
//确定每个thread的基础index
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
__shared__ float shared_data[512];
__shared__ float shared_data[256];
// 每个线程加载一个元素到共享内存
shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX;
__syncthreads();
@@ -63,10 +68,10 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol
__syncthreads();
}
// 规约最前面一段
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
int idx2 = offset + threadIdx.x;
if (idx2 < blockDim.x) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]);
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
}
__syncthreads();
}
@@ -113,15 +118,7 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, l
float* retData = nullptr;
int rowCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
if (rowCount<512){
maxRowKernel<<<aMatrix.getDimSize(0),rowCount/2+1>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){
maxRowKernel<<<aMatrix.getDimSize(0),512>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
else{
maxRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
return ret;
@@ -263,10 +260,9 @@ __global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aCo
__syncthreads();
}
// 规约最前面一段
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
int idx2 = offset + threadIdx.x;
if (idx2 < blockDim.x) {
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]);
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] += fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]);
}
__syncthreads();
}
@@ -281,7 +277,7 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol
{
//确定每个thread的基础index
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
__shared__ float shared_data[512];
__shared__ float shared_data[256];
// 每个线程加载一个元素到共享内存
shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX;
__syncthreads();
@@ -293,10 +289,9 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol
__syncthreads();
}
// 规约最前面一段
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
int idx2 = offset + threadIdx.x;
if (idx2 < blockDim.x) {
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]);
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] += fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]);
}
__syncthreads();
}
@@ -343,15 +338,7 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, l
float* retData = nullptr;
int rowCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
if (rowCount<512){
minRowKernel<<<aMatrix.getDimSize(0),rowCount/2+1>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){
minRowKernel<<<aMatrix.getDimSize(0),512>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
else{
minRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
}
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
return ret;
@@ -476,3 +463,419 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
/**
 * Sums aColEleCount consecutive floats per block.
 *
 * Block b reads aInputData[b * aColEleCount .. b * aColEleCount + aColEleCount - 1]
 * and writes the total to aOutput[b]. Partial sums are kept in double and
 * narrowed to float only when the result is stored.
 *
 * Launch layout: one block per output element. The shared buffer holds 256
 * entries, so blockDim.x must not exceed 256, and the halving reduction only
 * folds every slot when blockDim.x is a power of two.
 */
__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    __shared__ double partial[256];

    const unsigned int lane = threadIdx.x;
    // Index of this thread's first element inside the block's segment.
    const unsigned int first = blockIdx.x * aColEleCount + lane;

    // Each thread privately accumulates the elements lane, lane + blockDim.x, ...
    double acc = 0.0;
    if (lane < aColEleCount) {
        acc = aInputData[first];
    }
    for (int step = blockDim.x; step < aColEleCount; step += blockDim.x) {
        if (lane + step < aColEleCount) {
            acc += (double)aInputData[first + step];
        }
    }

    // Publish the per-thread partial sum before the cross-thread reduction.
    partial[lane] = acc;
    __syncthreads();

    // Pairwise reduction of the per-thread partial sums down to slot 0.
    for (int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            partial[lane] += partial[lane + half];
        }
        __syncthreads();
    }

    // Thread 0 stores the block's sum to global memory.
    if (lane == 0) {
        aOutput[blockIdx.x] = (float)partial[0];
    }
}
/**
 * Sums aRowEleCount floats that are aColEleCount elements apart.
 *
 * Block b reads aInputData[b + k * aColEleCount] for k in [0, aRowEleCount)
 * and writes the total to aOutput[b]. Accumulation is done in float.
 *
 * Launch layout: one block per output element. The shared buffer holds 256
 * entries, so blockDim.x must not exceed 256, and the halving reduction only
 * folds every slot when blockDim.x is a power of two.
 */
__global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount)
{
    __shared__ float partial[256];

    const unsigned int lane = threadIdx.x;
    // Index of this thread's first element: k = lane in the strided sequence.
    const unsigned int first = lane * aColEleCount + blockIdx.x;

    // Each thread privately accumulates the elements k = lane, lane + blockDim.x, ...
    float acc = 0.0f;
    if (lane < aRowEleCount) {
        acc = aInputData[first];
    }
    for (int step = blockDim.x; step < aRowEleCount; step += blockDim.x) {
        if (lane + step < aRowEleCount) {
            acc += aInputData[first + step * aColEleCount];
        }
    }

    // Publish the per-thread partial sum before the cross-thread reduction.
    partial[lane] = acc;
    __syncthreads();

    // Pairwise reduction of the per-thread partial sums down to slot 0.
    for (int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            partial[lane] += partial[lane + half];
        }
        __syncthreads();
    }

    // Thread 0 stores the block's sum to global memory.
    if (lane == 0) {
        aOutput[blockIdx.x] = partial[0];
    }
}
/**
 * Partial sums over fixed chunks of 4096 two-float elements.
 *
 * The input is read as interleaved pairs: element k occupies
 * aInputData[2k] and aInputData[2k + 1] (presumably the real and imaginary
 * parts of a complex value; the kernel itself treats them as two independent
 * streams). Block b sums elements [b * 4096, min((b + 1) * 4096, aTotalSize)).
 *
 * Output is NOT interleaved: the sum of the first components goes to
 * aOutput[b], the sum of the second components to aOutput[b + gridDim.x].
 *
 * Launch layout: one block per 4096-element chunk. The shared buffer holds
 * 256 entries, so blockDim.x must not exceed 256, and the halving reduction
 * only folds every slot when blockDim.x is a power of two.
 */
__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize)
{
    __shared__ float partial[256][2];

    const unsigned int lane = threadIdx.x;
    // Pair index of this thread's first element inside the block's chunk.
    const unsigned int first = blockIdx.x * 4096 + lane;

    // Each thread privately accumulates the pairs lane, lane + blockDim.x, ...
    // that are both inside the chunk and inside the input.
    float sum0 = 0.0f;
    float sum1 = 0.0f;
    if (lane < 4096 && first < aTotalSize) {
        sum0 = aInputData[first * 2];
        sum1 = aInputData[first * 2 + 1];
    }
    for (int step = blockDim.x; step < 4096; step += blockDim.x) {
        if (lane + step < 4096 && first + step < aTotalSize) {
            sum0 += aInputData[first * 2 + step * 2];
            sum1 += aInputData[first * 2 + step * 2 + 1];
        }
    }

    // Publish the per-thread partial sums before the cross-thread reduction.
    partial[lane][0] = sum0;
    partial[lane][1] = sum1;
    __syncthreads();

    // Pairwise reduction of the per-thread partial sums down to slot 0.
    for (int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            partial[lane][0] += partial[lane + half][0];
            partial[lane][1] += partial[lane + half][1];
        }
        __syncthreads();
    }

    // Thread 0 stores the chunk's two sums in the planar output layout.
    if (lane == 0) {
        aOutput[blockIdx.x] = partial[0][0];
        aOutput[blockIdx.x + gridDim.x] = partial[0][1];
    }
}
/**
 * Sums aColEleCount consecutive two-float elements per block.
 *
 * Input and output are interleaved pairs: element k occupies index 2k and
 * 2k + 1 (presumably the real and imaginary parts of a complex value; the
 * kernel treats them as two independent streams). Block b sums elements
 * [b * aColEleCount, (b + 1) * aColEleCount) and writes the two totals to
 * aOutput[2b] and aOutput[2b + 1]. Accumulation is done in float.
 *
 * Launch layout: one block per output pair. The shared buffer holds 256
 * entries, so blockDim.x must not exceed 256, and the halving reduction only
 * folds every slot when blockDim.x is a power of two.
 */
__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    __shared__ float partial[256][2];

    const unsigned int lane = threadIdx.x;
    // Pair index of this thread's first element inside the block's segment.
    const unsigned int first = blockIdx.x * aColEleCount + lane;

    // Each thread privately accumulates the pairs lane, lane + blockDim.x, ...
    float sum0 = 0.0f;
    float sum1 = 0.0f;
    if (lane < aColEleCount) {
        sum0 = aInputData[first * 2];
        sum1 = aInputData[first * 2 + 1];
    }
    for (int step = blockDim.x; step < aColEleCount; step += blockDim.x) {
        if (lane + step < aColEleCount) {
            sum0 += aInputData[first * 2 + step * 2];
            sum1 += aInputData[first * 2 + step * 2 + 1];
        }
    }

    // Publish the per-thread partial sums before the cross-thread reduction.
    partial[lane][0] = sum0;
    partial[lane][1] = sum1;
    __syncthreads();

    // Pairwise reduction of the per-thread partial sums down to slot 0.
    for (int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            partial[lane][0] += partial[lane + half][0];
            partial[lane][1] += partial[lane + half][1];
        }
        __syncthreads();
    }

    // Thread 0 stores the block's two sums as one interleaved pair.
    if (lane == 0) {
        aOutput[blockIdx.x * 2] = partial[0][0];
        aOutput[blockIdx.x * 2 + 1] = partial[0][1];
    }
}
/**
 * Sums aRowEleCount two-float elements that are aColEleCount elements apart.
 *
 * Input and output are interleaved pairs: element k occupies index 2k and
 * 2k + 1 (presumably the real and imaginary parts of a complex value; the
 * kernel treats them as two independent streams). Block b sums the elements
 * b + k * aColEleCount for k in [0, aRowEleCount) and writes the two totals
 * to aOutput[2b] and aOutput[2b + 1]. Accumulation is done in float.
 *
 * Launch layout: one block per output pair. The shared buffer holds 256
 * entries, so blockDim.x must not exceed 256, and the halving reduction only
 * folds every slot when blockDim.x is a power of two.
 */
__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount)
{
    __shared__ float partial[256][2];

    const unsigned int lane = threadIdx.x;
    // Pair index of this thread's first element: k = lane in the strided sequence.
    const unsigned int first = lane * aColEleCount + blockIdx.x;

    // Each thread privately accumulates the pairs k = lane, lane + blockDim.x, ...
    float sum0 = 0.0f;
    float sum1 = 0.0f;
    if (lane < aRowEleCount) {
        sum0 = aInputData[first * 2];
        sum1 = aInputData[first * 2 + 1];
    }
    for (int step = blockDim.x; step < aRowEleCount; step += blockDim.x) {
        if (lane + step < aRowEleCount) {
            sum0 += aInputData[first * 2 + step * aColEleCount * 2];
            sum1 += aInputData[first * 2 + step * aColEleCount * 2 + 1];
        }
    }

    // Publish the per-thread partial sums before the cross-thread reduction.
    partial[lane][0] = sum0;
    partial[lane][1] = sum1;
    __syncthreads();

    // Pairwise reduction of the per-thread partial sums down to slot 0.
    for (int half = blockDim.x / 2; half > 0; half >>= 1) {
        if (lane < half) {
            partial[lane][0] += partial[lane + half][0];
            partial[lane][1] += partial[lane + half][1];
        }
        __syncthreads();
    }

    // Thread 0 stores the block's two sums as one interleaved pair.
    if (lane == 0) {
        aOutput[blockIdx.x * 2] = partial[0][0];
        aOutput[blockIdx.x * 2 + 1] = partial[0][1];
    }
}
/**
 * Sum of a 2-D device matrix along a direction.
 *
 * @param aMatrix    real or complex matrix; 3-D input (getDimSize(2) > 1) is
 *                   rejected with a message on stderr and an empty CudaMatrix.
 * @param direction  All    -> 1x1 result with the sum of every element;
 *                   Row    -> getDimSize(0) x 1 result, one sum per row;
 *                   Column -> 1 x getDimSize(1) result, one sum per column
 *                             (also used for any other enum value).
 *                   Column on a matrix with getDimSize(0) == 1 is treated as Row.
 * @return a newly allocated CudaMatrix holding the sums (complex when the
 *         input is complex).
 *
 * NOTE(review): CUDA API return codes are not checked here, matching the
 * surrounding functions in this file.
 */
CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
    if (aMatrix.getDimSize(2)>1 ) {
        std::cerr<< "sum() not support 3D data!"
                 << std::endl;
        return CudaMatrix();
    }
    // For a vector with a single row, summing "per column" means summing the row.
    if (direction == Column && aMatrix.getDimSize(0)==1){
        direction = Row;
    }
    if (!aMatrix.isComplex())
    {
        switch (direction)
        {
            case All:
            {
                float* data = nullptr;
                cudaMalloc((void**)&data, sizeof(float));
                auto ret = CudaMatrix::fromRawData(data,1,1,1);
                float result = thrust::reduce(thrust::device, aMatrix.getData(),
                                              aMatrix.getData()+aMatrix.getDataSize(),
                                              0.0f, thrust::plus<float>());
                ret.setValue(0,result);
                return ret;
            }
            case Row:
            {
                float* matData = aMatrix.getData();
                float* retData = nullptr;
                int rowCount = aMatrix.getDimSize(1);
                cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
                // One block per row; each block strides through the row's elements.
                sumRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
                cudaDeviceSynchronize();
                CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
                return ret;
            }
            case Column:
            default:
            {
                float* matData = aMatrix.getData();
                float* retData = nullptr;
                int colElementCount = aMatrix.getDimSize(0);
                if (colElementCount == 1) return aMatrix;
                cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1));
                // One block per column; each block sums a contiguous segment.
                sumColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
                cudaDeviceSynchronize();
                CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1));
                return ret;
            }
        }
    }
    else{
        switch (direction)
        {
            case All:
            {
                float* matData = aMatrix.getData();
                float* retData = nullptr;
                // Split the whole array into chunks of 4096 elements and sum each
                // chunk as if it were a column. Integer ceil-division is used
                // because a float quotient loses precision for large sizes.
                int fakeCol = (int)((aMatrix.getDataSize() + 4095) / 4096);
                cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol);
                auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex);
                sumZAllColKernel<<<fakeCol,256>>>(matData,retData, aMatrix.getDataSize());
                float* result_data = nullptr;
                cudaMalloc((void**)&result_data, sizeof(float)*2);
                auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex);
                // sumZAllColKernel writes a planar layout: the first fakeCol floats
                // are the first-component partial sums, the next fakeCol floats the
                // second-component ones. The init value must be a float: with an
                // int init thrust::reduce returns an int and drops the fraction.
                float result = thrust::reduce(thrust::device, ret.getData(),
                                              ret.getData()+ ret.getDataSize(),
                                              0.0f, thrust::plus<float>());
                ret2.setValue(0,result);
                result = thrust::reduce(thrust::device, ret.getData()+ ret.getDataSize(),
                                        ret.getData()+ ret.getDataSize()*2,
                                        0.0f, thrust::plus<float>());
                ret2.setValue(1,result);
                return ret2;
            }
            case Row:
            {
                float* matData = aMatrix.getData();
                float* retData = nullptr;
                int rowElementCount = aMatrix.getDimSize(1);
                cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2);
                sumZRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowElementCount);
                cudaDeviceSynchronize();
                // The buffer holds 2 floats per row, so the result must be tagged
                // Complex just like the Column case below.
                CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1,1,Complex);
                return ret;
            }
            case Column:
            default:
            {
                float* matData = aMatrix.getData();
                float* retData = nullptr;
                int colElementCount = aMatrix.getDimSize(0);
                if (colElementCount == 1) return aMatrix;
                cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2);
                sumZColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
                cudaDeviceSynchronize();
                CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex);
                return ret;
            }
        }
    }
}
/**
 * Mean of a real 2-D device matrix along a direction.
 *
 * Computed as Aurora::sum() along the same direction divided by the number of
 * summed elements.
 *
 * @param aMatrix    real matrix; 3-D or complex input is rejected with a
 *                   message on stderr and an empty CudaMatrix.
 * @param direction  All    -> 1x1 result, sum divided by getDataSize();
 *                   Row    -> per-row sums divided by getDimSize(1);
 *                   Column -> per-column sums divided by getDimSize(0)
 *                             (also used for any other enum value).
 *                   Column on a matrix with getDimSize(0) == 1 is treated as Row.
 * @return the CudaMatrix returned by sum(), scaled in place.
 */
CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){
    if (aMatrix.getDimSize(2)>1 ) {
        // Report under this function's own name so the failing call can be identified.
        std::cerr<< "mean() not support 3D data!"
                 << std::endl;
        return CudaMatrix();
    }
    // For a vector with a single row, the "per column" mean is the mean of the row.
    if (direction == Column && aMatrix.getDimSize(0)==1){
        direction = Row;
    }
    if (!aMatrix.isComplex())
    {
        switch (direction)
        {
            case All:
            {
                auto ret = sum(aMatrix,All);
                ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize()));
                return ret;
            }
            case Row:
            {
                auto ret = sum(aMatrix, Row);
                // Each row sum covers getDimSize(1) elements.
                float count = (float)aMatrix.getDimSize(1);
                auto lambda = [=] __device__ (const float& v){
                    return v/count;
                };
                thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda);
                return ret;
            }
            case Column:
            default:
            {
                auto ret = sum(aMatrix, Column);
                // Each column sum covers getDimSize(0) elements.
                float count = (float)aMatrix.getDimSize(0);
                auto lambda = [=] __device__ (const float& v){
                    return v/count;
                };
                thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda);
                return ret;
            }
        }
    }
    else{
        std::cerr<< "mean() not support complex data!"
                 << std::endl;
        return CudaMatrix();
    }
}
/**
 * Random-access iterator that walks memory with a fixed element stride.
 *
 * Moving the iterator by one position moves the underlying pointer by
 * `aColElementCount` elements, so a range built from it visits every
 * aColElementCount-th element starting at `ptr`. Aurora::sort() uses it to
 * hand thrust::sort one strided row at a time.
 *
 * Built on thrust::iterator_facade, which derives the usual iterator
 * operators from the core functions defined below. The iterator is tagged
 * with thrust::device_system_tag.
 */
template <typename ValueType>
class RowElementIterator:public thrust::iterator_facade<
    RowElementIterator<ValueType>,
    ValueType,
    thrust::device_system_tag,
    thrust::random_access_traversal_tag,
    ValueType& >{
public:
    /// @param ptr               address of the element the iterator refers to
    /// @param aColElementCount  distance, in elements, between two positions
    __host__ __device__
    RowElementIterator(ValueType* ptr, int aColElementCount=1)
        : current_(ptr), stride_(aColElementCount) {}

    /// Reference to the element at the current position.
    __host__ __device__
    ValueType& dereference() const{
        return *current_;
    }

    /// Step forward by one position (one stride).
    __host__ __device__
    void increment() {
        current_ += stride_;
    }

    /// Step backward by one position (one stride).
    __host__ __device__
    void decrement() {
        current_ -= stride_;
    }

    /// Move by n positions; n may be negative.
    __host__ __device__
    void advance(typename RowElementIterator::difference_type n) {
        current_ += stride_*n;
    }

    /// Number of positions from this iterator to `other`.
    __host__ __device__
    typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const {
        return (other.current_ - current_)/stride_;
    }

    /// Two iterators are equal when they refer to the same address.
    __host__ __device__
    bool equal(const RowElementIterator& other) const {
        return current_ == other.current_;
    }

private:
    ValueType* current_;  // element currently referred to
    int stride_;          // elements skipped per position
};
/**
 * Non-destructive sort: validates the input, then sorts a deep copy.
 *
 * 3-D matrices and complex matrices are rejected with a message on stderr
 * and an empty CudaMatrix. Otherwise the work is delegated to the
 * rvalue-reference overload of sort(), which receives the copy, so aMatrix
 * itself is left untouched.
 */
CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction)
{
    const bool supported = !(aMatrix.getDimSize(2)>1) && !aMatrix.isComplex();
    if (!supported) {
        const char* reason = "sort() not support complex value type!";
        if (aMatrix.getDimSize(2) > 1) {
            reason = "sort() not support 3D data!";
        }
        std::cerr << reason << std::endl;
        return CudaMatrix();
    }
    // Hand the temporary copy to the in-place overload as an rvalue.
    return sort(std::move(aMatrix.deepCopy()), direction);
}
/**
 * In-place sort of a real 2-D device matrix along a direction.
 *
 * Ordering is thrust::sort's default (ascending, operator<).
 *
 * @param aMatrix    matrix whose device buffer is sorted in place; 3-D or
 *                   complex input is rejected with a message on stderr and an
 *                   empty CudaMatrix.
 * @param direction  Row    -> every row (elements getDimSize(0) apart) is sorted;
 *                   Column -> every column (getDimSize(0) contiguous elements)
 *                             is sorted;
 *                   anything else is rejected with an error message.
 * @return aMatrix after sorting, or an empty CudaMatrix on error.
 *
 * Each row/column is sorted by a separate thrust::sort call, so the cost
 * grows with the number of rows/columns.
 */
CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
{
    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    float * data = aMatrix.getData();
    int colElementCount = aMatrix.getDimSize(0);
    switch (direction)
    {
        case Row:
        {
            // Row i starts at data + i and its elements are colElementCount apart;
            // the end iterator sits one full buffer length past the start.
            for (int i = 0; i < colElementCount; i++)
            {
                thrust::sort(thrust::device, RowElementIterator<float>(data+i,colElementCount),
                             RowElementIterator<float>(data+aMatrix.getDataSize()+i,colElementCount));
            }
            return aMatrix;
        }
        case Column:
        {
            // Column j is the contiguous range
            // [data + j*colElementCount, data + (j+1)*colElementCount).
            int rowElementCount = aMatrix.getDimSize(1);
            for (int j = 0; j < rowElementCount; j++)
            {
                float* columnBegin = data + (size_t)j * colElementCount;
                thrust::sort(thrust::device, columnBegin, columnBegin + colElementCount);
            }
            return aMatrix;
        }
        default:
        {
            std::cerr
                << "Unsupported direction for sort!"
                << std::endl;
            return CudaMatrix();
        }
    }
}

View File

@@ -14,6 +14,20 @@ namespace Aurora
CudaMatrix min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx);
CudaMatrix min(const CudaMatrix &aMatrix, const float aValue);
CudaMatrix min(const CudaMatrix &aMatrix, const CudaMatrix &aOther);
/**
 * @brief Sum of a 2-D matrix along the given direction; real and complex
 *        data are both handled. 3-D input is rejected: an error is printed
 *        and an empty CudaMatrix is returned.
 *
 * @param aMatrix matrix to process
 * @param direction direction to sum along
 * @return CudaMatrix
 */
CudaMatrix sum(const CudaMatrix &aMatrix, FunctionDirection direction = Column);
/**
* @brief Mean value; note that complex data is not supported.
*
* @param aMatrix matrix to process
* @param direction direction
* @return CudaMatrix
*/
CudaMatrix mean(const CudaMatrix &aMatrix, FunctionDirection direction = Column);
/**
 * @brief Sort a real 2-D matrix along the given direction without modifying
 *        the input: a deep copy is sorted and returned. 3-D or complex input
 *        is rejected with an empty CudaMatrix. See the definition for which
 *        directions are implemented.
 */
CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column);
/**
 * @brief Same as the const-reference overload, but operates on the buffer of
 *        the matrix passed in instead of a copy.
 */
CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column);
}
#endif // __FUNCTION2D_CUDA_H__

View File

@@ -1,5 +1,6 @@
#include <gtest/gtest.h>
#include <chrono>
#include "AuroraDefs.h"
#include "CudaMatrix.h"
#include "Function.h"
#include "Matrix.h"
@@ -30,21 +31,19 @@ protected:
TEST_F(Function2D_Cuda_Test, min)
{
// big data for test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
{
float *dataB = Aurora::random(4096*41472);
B = Aurora::Matrix::fromRawData(dataB, 4096, 41472);
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -54,16 +53,11 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
start_time_ = std::chrono::high_resolution_clock::now();
ret1 = Aurora::min(B, Aurora::FunctionDirection::Row,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
ret2 = Aurora::min(dB, Aurora::FunctionDirection::Row,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -73,21 +67,20 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// different size speed
// Aurora::Matrix Aurora::min(const Aurora::Matrix &aMatrix,
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
// in col wise
{
float *dataB = Aurora::random(3157*111);
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -98,16 +91,11 @@ TEST_F(Function2D_Cuda_Test, min)
}
B.forceReshape( 111,3157, 1);
dB = B.toDeviceMatrix();
start_time_ = std::chrono::high_resolution_clock::now();
ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -117,6 +105,8 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, float aValue)
{
float *dataB = Aurora::random(3157*111);
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
@@ -125,14 +115,10 @@ TEST_F(Function2D_Cuda_Test, min)
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, 500.5f);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, 500.5f);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -142,6 +128,10 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with same size matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(3157*111);
@@ -151,17 +141,11 @@ TEST_F(Function2D_Cuda_Test, min)
auto dA = A.toDeviceMatrix();
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -171,6 +155,10 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with col-vec and matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(3157);
@@ -180,17 +168,11 @@ TEST_F(Function2D_Cuda_Test, min)
auto dA = A.toDeviceMatrix();
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -199,11 +181,9 @@ TEST_F(Function2D_Cuda_Test, min)
{
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
start_time_ = std::chrono::high_resolution_clock::now();
ret2 = Aurora::min(dA, dB);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -213,6 +193,10 @@ TEST_F(Function2D_Cuda_Test, min)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with row-vec and matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(111);
@@ -225,14 +209,9 @@ TEST_F(Function2D_Cuda_Test, min)
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::min(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::min(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -246,21 +225,18 @@ TEST_F(Function2D_Cuda_Test, min)
TEST_F(Function2D_Cuda_Test, max)
{
// big data for test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
{
float *dataB = Aurora::random(4096*41472);
B = Aurora::Matrix::fromRawData(dataB, 4096, 41472);
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -270,16 +246,10 @@ TEST_F(Function2D_Cuda_Test, max)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
start_time_ = std::chrono::high_resolution_clock::now();
ret1 = Aurora::max(B, Aurora::FunctionDirection::Row,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
ret2 = Aurora::max(dB, Aurora::FunctionDirection::Row,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -289,21 +259,20 @@ TEST_F(Function2D_Cuda_Test, max)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// different size speed
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
// in col wise
{
float *dataB = Aurora::random(3157*111);
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -314,16 +283,11 @@ TEST_F(Function2D_Cuda_Test, max)
}
B.forceReshape( 111,3157, 1);
dB = B.toDeviceMatrix();
start_time_ = std::chrono::high_resolution_clock::now();
ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -333,22 +297,18 @@ TEST_F(Function2D_Cuda_Test, max)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, float aValue)
{
float *dataB = Aurora::random(3157*111);
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, 500.5f);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::max(dB, 500.5f);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -358,6 +318,10 @@ TEST_F(Function2D_Cuda_Test, max)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with same size matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(3157*111);
@@ -370,14 +334,9 @@ TEST_F(Function2D_Cuda_Test, max)
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
auto ret2 = Aurora::max(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -387,26 +346,23 @@ TEST_F(Function2D_Cuda_Test, max)
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with col-vec and matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(3157);
auto A = Aurora::Matrix::fromRawData(dataA, 3157, 1);
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
auto dA = A.toDeviceMatrix();
dB = B.toDeviceMatrix();
long r,c;
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
// mat x vec
auto ret2 = Aurora::max(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -415,20 +371,20 @@ TEST_F(Function2D_Cuda_Test, max)
{
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
start_time_ = std::chrono::high_resolution_clock::now();
// vec x mat
ret2 = Aurora::max(dA, dB);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
for (size_t i = 0; i < ret1.getDataSize(); i++)
{
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
}
// test
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
// const Aurora::Matrix &aOther)
// with row-vec and matrix
{
float *dataB = Aurora::random(3157*111);
float *dataA = Aurora::random(111);
@@ -441,14 +397,20 @@ TEST_F(Function2D_Cuda_Test, max)
auto start_time_ = std::chrono::high_resolution_clock::now();
auto ret1 = Aurora::max(B, A);
auto end_time = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
start_time_ = std::chrono::high_resolution_clock::now();
// mat x vec
auto ret2 = Aurora::max(dB, dA);
end_time = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
for (size_t i = 0; i < ret1.getDataSize(); i++)
{
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
}
//vec x mat
ret2 = Aurora::max(dA, dB);
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
@@ -459,3 +421,196 @@ TEST_F(Function2D_Cuda_Test, max)
}
}
}
// Compares Aurora::sum evaluated on a host matrix against Aurora::sum
// evaluated on the matrix returned by toDeviceMatrix(), for the All, Column
// and Row directions, with both a real-valued and a complex-valued input.
TEST_F(Function2D_Cuda_Test, sum)
{
    // Real-valued input: 4096 x 50000 random floats.
    {
        float *dataB = Aurora::random(4096*50000);
        B = Aurora::Matrix::fromRawData(dataB, 4096, 50000);
        auto dD = B.toDeviceMatrix();

        auto ret1 = Aurora::sum(B, Aurora::FunctionDirection::All);
        auto ret2 = Aurora::sum(dD, Aurora::FunctionDirection::All);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }

        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Column);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Column);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i));
        }

        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Row);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Row);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i));
        }
    }
    // Complex-valued input: 3000 x 2000 elements stored as 2 floats each.
    {
        // One constant drives both the allocation and the fill loop so the
        // two can never disagree.
        const size_t floatCount = 3000*2000*2;
        float* dataB = new float[floatCount];
        for (size_t i = 0; i < floatCount; i++)
        {
            // Even float slots hold 2, odd slots hold 1 (presumably
            // interleaved real/imaginary parts, i.e. 2+1i per element --
            // TODO confirm the layout against Aurora::Matrix).
            dataB[i] = i%2==0?2.0f:1.0f;
        }
        B = Aurora::Matrix::fromRawData(dataB,3000, 2000,1,Aurora::Complex);
        auto dD = B.toDeviceMatrix();

        auto ret1 = Aurora::sum(B, Aurora::FunctionDirection::All);
        auto ret2 = Aurora::sum(dD, Aurora::FunctionDirection::All);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        // A complex result carries two floats per element, hence the *2.
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }

        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Column);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Column);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        // Walk every float of the complex result, same as the All check;
        // iterating only getDataSize() would leave half of it unverified.
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i));
        }

        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Row);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Row);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i));
        }
    }
}
// Checks that Aurora::mean gives matching results for a host matrix and for
// the matrix obtained from toDeviceMatrix(), in the All, Column and Row
// directions.
TEST_F(Function2D_Cuda_Test, mean)
{
    {
        // 4096 x 500 input alternating 1, 0, 1, 0, ... in storage order.
        const size_t elementCount = 4096*500;
        float* dataB = new float[elementCount];
        for (size_t k = 0; k < elementCount; ++k)
        {
            dataB[k] = (k % 2 == 0) ? 1.0f : 0.0f;
        }
        B = Aurora::Matrix::fromRawData(dataB, 4096, 500);
        auto deviceB = B.toDeviceMatrix();

        // Mean over the whole matrix: compared with EXPECT_FLOAT_EQ.
        {
            auto hostMean = Aurora::mean(B, Aurora::FunctionDirection::All);
            auto deviceMean = Aurora::mean(deviceB, Aurora::FunctionDirection::All);
            EXPECT_EQ(hostMean.getDimSize(0), deviceMean.getDimSize(0));
            EXPECT_EQ(hostMean.getDimSize(1), deviceMean.getDimSize(1));
            EXPECT_EQ(hostMean.getDimSize(2), deviceMean.getDimSize(2));
            for (size_t k = 0; k < hostMean.getDataSize(); ++k)
            {
                EXPECT_FLOAT_EQ(hostMean[k], deviceMean.getValue(k))<<", index at :"<<k;
            }
        }

        // Column means first, then row means: compared with the project's
        // EXPECT_FLOAT_AE macro (defined outside this chunk).
        const Aurora::FunctionDirection directions[] = {
            Aurora::FunctionDirection::Column,
            Aurora::FunctionDirection::Row,
        };
        for (const Aurora::FunctionDirection direction : directions)
        {
            auto hostMean = Aurora::mean(B, direction);
            auto deviceMean = Aurora::mean(deviceB, direction);
            EXPECT_EQ(hostMean.getDimSize(0), deviceMean.getDimSize(0));
            EXPECT_EQ(hostMean.getDimSize(1), deviceMean.getDimSize(1));
            EXPECT_EQ(hostMean.getDimSize(2), deviceMean.getDimSize(2));
            for (size_t k = 0; k < hostMean.getDataSize(); ++k)
            {
                EXPECT_FLOAT_AE(hostMean[k], deviceMean.getValue(k));
            }
        }
    }
}
// Checks that a column-direction Aurora::sort gives identical results for a
// host matrix and for the matrix obtained from toDeviceMatrix().
TEST_F(Function2D_Cuda_Test, sort)
{
    {
        // Derive the element count from the matrix shape so the generated
        // buffer is exactly as large as the matrix that wraps it (the former
        // literal 25000000 was 100x the 500 x 500 elements actually used).
        const int rows = 500;
        const int cols = 500;
        float* dataB = Aurora::random(rows*cols);
        B = Aurora::Matrix::fromRawData(dataB, rows, cols);
        auto dD = B.toDeviceMatrix();

        auto ret1 = Aurora::sort(B, Aurora::Column);
        auto ret2 = Aurora::sort(dD,Aurora::Column);
        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }
    }
}