Add sum, mean, sort to function2D
This commit is contained in:
@@ -74,7 +74,7 @@ if (Aurora_USE_CUDA)
|
||||
target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include)
|
||||
set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
|
||||
target_compile_options(Aurora_Test PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
|
||||
-arch=sm_75 --expt-extended-lambda
|
||||
-arch=sm_75 --expt-extended-lambda -Icub/
|
||||
>)
|
||||
target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
|
||||
target_link_libraries(Aurora_Test PRIVATE ${CUDA_cublas_LIBRARY})
|
||||
|
||||
@@ -1,3 +1,7 @@
|
||||
#include "AuroraDefs.h"
|
||||
#include "CudaMatrix.h"
|
||||
#include "Function1D.h"
|
||||
#include "Matrix.h"
|
||||
#include <Function2D.cuh>
|
||||
#include <cfloat>
|
||||
#include <cstddef>
|
||||
@@ -10,6 +14,7 @@
|
||||
#include <thrust/device_ptr.h>
|
||||
#include <thrust/iterator/constant_iterator.h>
|
||||
#include <thrust/iterator/counting_iterator.h>
|
||||
#include <thrust/iterator/iterator_facade.h>
|
||||
#include <thrust/copy.h>
|
||||
#include <thrust/functional.h>
|
||||
#include <thrust/complex.h>
|
||||
@@ -33,10 +38,10 @@ __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aCo
|
||||
__syncthreads();
|
||||
}
|
||||
// 规约最前面一段
|
||||
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
|
||||
int idx2 = offset + threadIdx.x;
|
||||
if (idx2 < blockDim.x) {
|
||||
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]);
|
||||
for (int i = blockDim.x/2; i >0; i>>=1) {
|
||||
|
||||
if (threadIdx.x < i) {
|
||||
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -51,7 +56,7 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol
|
||||
{
|
||||
//确定每个thread的基础index
|
||||
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
|
||||
__shared__ float shared_data[512];
|
||||
__shared__ float shared_data[256];
|
||||
// 每个线程加载一个元素到共享内存
|
||||
shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX;
|
||||
__syncthreads();
|
||||
@@ -63,10 +68,10 @@ __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aCol
|
||||
__syncthreads();
|
||||
}
|
||||
// 规约最前面一段
|
||||
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
|
||||
int idx2 = offset + threadIdx.x;
|
||||
if (idx2 < blockDim.x) {
|
||||
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[idx2]);
|
||||
for (int i = blockDim.x/2; i >0; i>>=1) {
|
||||
|
||||
if (threadIdx.x < i) {
|
||||
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
@@ -113,15 +118,7 @@ CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, l
|
||||
float* retData = nullptr;
|
||||
int rowCount = aMatrix.getDimSize(1);
|
||||
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
|
||||
if (rowCount<512){
|
||||
maxRowKernel<<<aMatrix.getDimSize(0),rowCount/2+1>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){
|
||||
maxRowKernel<<<aMatrix.getDimSize(0),512>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
else{
|
||||
maxRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
maxRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
cudaDeviceSynchronize();
|
||||
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
|
||||
return ret;
|
||||
@@ -263,10 +260,9 @@ __global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aCo
|
||||
__syncthreads();
|
||||
}
|
||||
// 规约最前面一段
|
||||
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
|
||||
int idx2 = offset + threadIdx.x;
|
||||
if (idx2 < blockDim.x) {
|
||||
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]);
|
||||
// Tree reduction over the per-thread minima in shared_data: each pass folds
// the upper half of the active range into the lower half, so slot 0 ends up
// holding the smallest value.
for (int i = blockDim.x/2; i > 0; i >>= 1) {
    if (threadIdx.x < i) {
        // Plain assignment: the slot must keep the smaller of the pair.
        // (Accumulating with += would add the minima together.)
        shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
    }
    // Outside the branch so every thread of the block reaches the barrier.
    __syncthreads();
}
|
||||
@@ -281,7 +277,7 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol
|
||||
{
|
||||
//确定每个thread的基础index
|
||||
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
|
||||
__shared__ float shared_data[512];
|
||||
__shared__ float shared_data[256];
|
||||
// 每个线程加载一个元素到共享内存
|
||||
shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX;
|
||||
__syncthreads();
|
||||
@@ -293,10 +289,9 @@ __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aCol
|
||||
__syncthreads();
|
||||
}
|
||||
// 规约最前面一段
|
||||
for (int offset = blockDim.x/2; offset >0; offset>>=1) {
|
||||
int idx2 = offset + threadIdx.x;
|
||||
if (idx2 < blockDim.x) {
|
||||
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[idx2]);
|
||||
// Tree reduction over the per-thread minima in shared_data: each pass folds
// the upper half of the active range into the lower half, so slot 0 ends up
// holding the smallest value.
for (int i = blockDim.x/2; i > 0; i >>= 1) {
    if (threadIdx.x < i) {
        // Plain assignment: the slot must keep the smaller of the pair.
        // (Accumulating with += would add the minima together.)
        shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
    }
    // Outside the branch so every thread of the block reaches the barrier.
    __syncthreads();
}
|
||||
@@ -343,15 +338,7 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, l
|
||||
float* retData = nullptr;
|
||||
int rowCount = aMatrix.getDimSize(1);
|
||||
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
|
||||
if (rowCount<512){
|
||||
minRowKernel<<<aMatrix.getDimSize(0),rowCount/2+1>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
else if (aMatrix.getDimSize(1)/aMatrix.getDimSize(0)>4){
|
||||
minRowKernel<<<aMatrix.getDimSize(0),512>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
else{
|
||||
minRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
}
|
||||
minRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
|
||||
cudaDeviceSynchronize();
|
||||
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
|
||||
return ret;
|
||||
@@ -476,3 +463,419 @@ CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){
|
||||
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
|
||||
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
|
||||
}
|
||||
|
||||
/**
 * @brief Column-wise sum of a real matrix whose columns are stored contiguously.
 *
 * Block b reads aColEleCount consecutive floats starting at
 * aInputData[b * aColEleCount] and writes their sum to aOutput[b].
 *
 * Launch requirements: one block per column; blockDim.x must be a power of
 * two no larger than 256 (the shared scratch array has 256 slots and the
 * halving reduction only covers power-of-two block sizes).
 *
 * @param aInputData   device buffer holding the matrix
 * @param aOutput      device buffer receiving one float per block
 * @param aColEleCount number of elements in one column
 */
__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    // Partials are accumulated in double and narrowed to float only on output.
    __shared__ double shared_data[256];

    // 64-bit offset so blockIdx.x * aColEleCount cannot wrap for large matrices.
    const float* column = aInputData + (size_t)blockIdx.x * aColEleCount;

    // Each thread privately sums elements threadIdx.x, threadIdx.x + blockDim.x, ...
    // Nothing is shared yet, so no barrier is needed inside this loop.
    double partial = 0.0;
    for (int i = threadIdx.x; i < aColEleCount; i += blockDim.x) {
        partial += (double)column[i];
    }
    shared_data[threadIdx.x] = partial;
    __syncthreads();

    // Tree reduction of the per-thread partials into slot 0.
    for (int i = blockDim.x/2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[threadIdx.x + i];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 publishes the column sum.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = (float)shared_data[0];
    }
}
|
||||
|
||||
/**
 * @brief Row-wise sum of a real matrix whose columns are stored contiguously.
 *
 * Block b sums the aRowEleCount elements aInputData[b + k * aColEleCount],
 * k = 0 .. aRowEleCount-1, and writes the result to aOutput[b].
 *
 * Launch requirements: one block per row; blockDim.x must be a power of two
 * no larger than 256 (the shared scratch array has 256 slots and the halving
 * reduction only covers power-of-two block sizes).
 *
 * @param aInputData   device buffer holding the matrix
 * @param aOutput      device buffer receiving one float per block
 * @param aColEleCount number of elements in one column (stride between row elements)
 * @param aRowEleCount number of elements in one row
 */
__global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount)
{
    __shared__ float shared_data[256];

    // Each thread privately sums row elements threadIdx.x, threadIdx.x + blockDim.x, ...
    // The element offset is formed in 64 bits so k * aColEleCount cannot wrap.
    float partial = 0.0f;
    for (unsigned int k = threadIdx.x; k < aRowEleCount; k += blockDim.x) {
        partial += aInputData[(size_t)k * aColEleCount + blockIdx.x];
    }
    shared_data[threadIdx.x] = partial;
    __syncthreads();

    // Tree reduction of the per-thread partials into slot 0.
    for (int i = blockDim.x/2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[threadIdx.x + i];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 publishes the row sum.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}
|
||||
|
||||
/**
 * @brief First pass of the "sum everything" reduction for interleaved complex data.
 *
 * The input is treated as aTotalSize (real, imaginary) float pairs, cut into
 * chunks of 4096 pairs. Block b sums chunk b and writes
 *   aOutput[b]             = sum of the real parts,
 *   aOutput[b + gridDim.x] = sum of the imaginary parts,
 * i.e. the output is planar (all real partials, then all imaginary partials),
 * not interleaved.
 *
 * Launch requirements: gridDim.x = ceil(aTotalSize / 4096); blockDim.x must
 * be a power of two no larger than 256 (the shared scratch array has 256 rows
 * and the halving reduction only covers power-of-two block sizes).
 *
 * @param aInputData device buffer of interleaved complex values
 * @param aOutput    device buffer with 2 * gridDim.x floats
 * @param aTotalSize number of complex elements in aInputData
 */
__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize)
{
    // Number of complex elements handled by one block.
    const unsigned int kChunk = 4096;

    __shared__ float shared_data[256][2];

    // A non-positive size means there is nothing to read.
    const size_t total = aTotalSize > 0 ? (size_t)aTotalSize : 0;
    const size_t chunkStart = (size_t)blockIdx.x * kChunk;

    // Each thread privately sums elements threadIdx.x, threadIdx.x + blockDim.x, ...
    // of its chunk, stopping at the end of the chunk or of the data.
    float re = 0.0f;
    float im = 0.0f;
    for (unsigned int k = threadIdx.x; k < kChunk; k += blockDim.x) {
        const size_t element = chunkStart + k;
        if (element >= total) {
            break;  // later iterations only move further past the end
        }
        re += aInputData[element * 2];
        im += aInputData[element * 2 + 1];
    }
    shared_data[threadIdx.x][0] = re;
    shared_data[threadIdx.x][1] = im;
    __syncthreads();

    // Tree reduction of the per-thread partials into row 0.
    for (int i = blockDim.x/2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
            shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 publishes the chunk sum in planar layout.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0][0];
        aOutput[blockIdx.x + gridDim.x] = shared_data[0][1];
    }
}
|
||||
|
||||
/**
 * @brief Column-wise sum of an interleaved complex matrix whose columns are contiguous.
 *
 * Block b sums the aColEleCount complex elements starting at element index
 * b * aColEleCount (float index 2 * b * aColEleCount) and writes the result
 * interleaved: aOutput[2*b] = real part, aOutput[2*b + 1] = imaginary part.
 *
 * Launch requirements: one block per column; blockDim.x must be a power of
 * two no larger than 256 (the shared scratch array has 256 rows and the
 * halving reduction only covers power-of-two block sizes).
 *
 * @param aInputData   device buffer of interleaved complex values
 * @param aOutput      device buffer with 2 floats per block
 * @param aColEleCount number of complex elements in one column
 */
__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    __shared__ float shared_data[256][2];

    // 64-bit offset so 2 * blockIdx.x * aColEleCount cannot wrap for large matrices.
    const float* column = aInputData + (size_t)blockIdx.x * aColEleCount * 2;

    // Each thread privately sums elements threadIdx.x, threadIdx.x + blockDim.x, ...
    float re = 0.0f;
    float im = 0.0f;
    for (int i = threadIdx.x; i < aColEleCount; i += blockDim.x) {
        re += column[(size_t)i * 2];
        im += column[(size_t)i * 2 + 1];
    }
    shared_data[threadIdx.x][0] = re;
    shared_data[threadIdx.x][1] = im;
    __syncthreads();

    // Tree reduction of the per-thread partials into row 0.
    for (int i = blockDim.x/2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
            shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 publishes the complex column sum.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x*2] = shared_data[0][0];
        aOutput[blockIdx.x*2+1] = shared_data[0][1];
    }
}
|
||||
|
||||
/**
 * @brief Row-wise sum of an interleaved complex matrix whose columns are contiguous.
 *
 * Block b sums the aRowEleCount complex elements with element index
 * b + k * aColEleCount, k = 0 .. aRowEleCount-1, and writes the result
 * interleaved: aOutput[2*b] = real part, aOutput[2*b + 1] = imaginary part.
 *
 * Launch requirements: one block per row; blockDim.x must be a power of two
 * no larger than 256 (the shared scratch array has 256 rows and the halving
 * reduction only covers power-of-two block sizes).
 *
 * @param aInputData   device buffer of interleaved complex values
 * @param aOutput      device buffer with 2 floats per block
 * @param aColEleCount number of complex elements in one column (stride between row elements)
 * @param aRowEleCount number of complex elements in one row
 */
__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount)
{
    __shared__ float shared_data[256][2];

    // Each thread privately sums row elements threadIdx.x, threadIdx.x + blockDim.x, ...
    // The element index is formed in 64 bits so k * aColEleCount * 2 cannot wrap.
    float re = 0.0f;
    float im = 0.0f;
    for (unsigned int k = threadIdx.x; k < aRowEleCount; k += blockDim.x) {
        const size_t element = (size_t)k * aColEleCount + blockIdx.x;
        re += aInputData[element * 2];
        im += aInputData[element * 2 + 1];
    }
    shared_data[threadIdx.x][0] = re;
    shared_data[threadIdx.x][1] = im;
    __syncthreads();

    // Tree reduction of the per-thread partials into row 0.
    for (int i = blockDim.x/2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
            shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
        }
        // Outside the branch so every thread of the block reaches the barrier.
        __syncthreads();
    }

    // Thread 0 publishes the complex row sum.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x*2] = shared_data[0][0];
        aOutput[blockIdx.x*2+1] = shared_data[0][1];
    }
}
|
||||
|
||||
/**
 * @brief Sum of a 2D device matrix over all elements, per row, or per column.
 *
 * Result shapes, as built below:
 *   - All    : 1 x 1 x 1 (Complex for complex input)
 *   - Row    : getDimSize(0) x 1
 *   - Column : 1 x getDimSize(1)
 * A Column request on a matrix with getDimSize(0) == 1 is handled as Row.
 *
 * @param aMatrix   real or complex matrix; 3D input is rejected
 * @param direction All, Row or Column; any other value is treated as Column
 * @return the sums, or an empty CudaMatrix (after a message on std::cerr)
 *         when aMatrix has more than one slice in dimension 2
 */
CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
    if (aMatrix.getDimSize(2)>1 ) {
        std::cerr<< "sum() not support 3D data!"
                 << std::endl;
        return CudaMatrix();
    }
    // For a vector with one element per column, "per column" means "along the row".
    if (direction == Column && aMatrix.getDimSize(0)==1){
        direction = Row;
    }
    if (!aMatrix.isComplex())
    {
        switch (direction)
        {
        case All:
        {
            float* data = nullptr;
            cudaMalloc((void**)&data, sizeof(float));
            auto ret = CudaMatrix::fromRawData(data,1,1,1);
            float result = thrust::reduce(thrust::device, aMatrix.getData(),
                                          aMatrix.getData()+aMatrix.getDataSize(),
                                          0.0f, thrust::plus<float>());
            ret.setValue(0,result);
            return ret;
        }
        case Row:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int rowCount = aMatrix.getDimSize(1);
            cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
            // One block per row, 256 threads per block (the kernel's scratch size).
            sumRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
            return ret;
        }
        case Column:
        default:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            // A single element per column is already its own sum.
            if (colElementCount == 1) return aMatrix;
            cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1));
            // One block per column, 256 threads per block.
            sumColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1));
            return ret;
        }
        }
    }
    else{
        switch (direction)
        {
        case All:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            // Split the whole array into chunks of 4096 complex elements (the
            // chunk size hard-coded in sumZAllColKernel) and reduce each chunk
            // with one block. Integer ceil-division: a float ceilf() loses
            // exactness once the element count exceeds 2^24.
            const size_t elementCount = aMatrix.getDataSize();
            int fakeCol = (int)((elementCount + 4095) / 4096);
            cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol);
            // Wrapped in a CudaMatrix like the original code did; presumably
            // this hands ownership of retData to the wrapper - confirm.
            auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex);
            sumZAllColKernel<<<fakeCol,256>>>(matData,retData, aMatrix.getDataSize());
            float* result_data = nullptr;
            cudaMalloc((void**)&result_data, sizeof(float)*2);
            auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex);
            // The kernel writes planar partials: fakeCol real sums followed by
            // fakeCol imaginary sums. The init value must be a float: with an
            // int init thrust::reduce accumulates and returns an int.
            float result = thrust::reduce(thrust::device, retData, retData + fakeCol,
                                          0.0f, thrust::plus<float>());
            ret2.setValue(0,result);
            result = thrust::reduce(thrust::device, retData + fakeCol, retData + 2*(size_t)fakeCol,
                                    0.0f, thrust::plus<float>());
            ret2.setValue(1,result);
            return ret2;
        }
        case Row:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int rowElementCount = aMatrix.getDimSize(1);
            cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2);
            // One block per row, 256 threads per block.
            sumZRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowElementCount);
            cudaDeviceSynchronize();
            // The buffer holds interleaved complex sums, so the result must be
            // tagged Complex (as the Column branch does).
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1,1,Complex);
            return ret;
        }
        case Column:
        default:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            // A single element per column is already its own sum.
            if (colElementCount == 1) return aMatrix;
            cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2);
            // One block per column, 256 threads per block.
            sumZColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex);
            return ret;
        }
        }
    }
}
|
||||
|
||||
/**
 * @brief Mean of a real 2D device matrix over all elements, per row, or per column.
 *
 * Computed as sum() divided by the number of summed elements:
 * getDataSize() for All, getDimSize(1) for Row, getDimSize(0) for Column.
 * A Column request on a matrix with getDimSize(0) == 1 is handled as Row.
 *
 * @param aMatrix   real matrix; 3D and complex input are rejected
 * @param direction All, Row or Column; any other value is treated as Column
 * @return the means (same shape as the corresponding sum() result), or an
 *         empty CudaMatrix after a message on std::cerr for rejected input
 */
CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){
    if (aMatrix.getDimSize(2)>1 ) {
        std::cerr<< "mean() not support 3D data!"
                 << std::endl;
        return CudaMatrix();
    }
    if (aMatrix.isComplex()) {
        std::cerr<< "mean() not support complex data!"
                 << std::endl;
        return CudaMatrix();
    }
    // For a vector with one element per column, "per column" means "along the row".
    if (direction == Column && aMatrix.getDimSize(0)==1){
        direction = Row;
    }

    if (direction == All) {
        auto ret = sum(aMatrix,All);
        ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize()));
        return ret;
    }

    // Row divides by the row length; everything else is treated as Column and
    // divides by the column length.
    const bool alongRow = (direction == Row);
    auto ret = sum(aMatrix, alongRow ? Row : Column);
    const float count = (float)aMatrix.getDimSize(alongRow ? 1 : 0);
    auto divideByCount = [=] __device__ (const float& v){
        return v/count;
    };
    // Scale the sums in place on the device.
    thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),divideByCount);
    return ret;
}
|
||||
/**
 * Random-access Thrust iterator that steps through a buffer with a fixed
 * stride instead of one element at a time.
 *
 * Advancing by one moves the underlying pointer by the stride passed to the
 * constructor; Aurora::sort uses the column length as stride so that one
 * iterator range visits the elements of a single row.
 */
template <typename ValueType>
class RowElementIterator:public thrust::iterator_facade<
    RowElementIterator<ValueType>,
    ValueType,
    thrust::device_system_tag,
    thrust::random_access_traversal_tag,
    ValueType& >{
public:
    /// @param ptr              element the iterator initially refers to
    /// @param aColElementCount pointer step taken per iterator increment
    __host__ __device__
    RowElementIterator(ValueType* ptr, int aColElementCount=1) : cursor_(ptr),stride_(aColElementCount) {}

    /// Reference to the element currently pointed at.
    __host__ __device__
    ValueType& dereference() const{
        return cursor_[0];
    }

    /// ++it : one stride forward.
    __host__ __device__
    void increment() {
        cursor_ += stride_;
    }

    /// --it : one stride backward.
    __host__ __device__
    void decrement() {
        cursor_ -= stride_;
    }

    /// it += n : n strides forward (n may be negative).
    __host__ __device__
    void advance(typename RowElementIterator::difference_type n) {
        cursor_ = cursor_ + stride_*n;
    }

    /// other - it, measured in strides rather than raw elements.
    __host__ __device__
    typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const {
        const auto rawGap = other.cursor_ - cursor_;
        return rawGap/stride_;
    }

    /// Two iterators are equal when they refer to the same address.
    __host__ __device__
    bool equal(const RowElementIterator& other) const {
        return other.cursor_ == cursor_;
    }

private:
    ValueType* cursor_;  // current element
    int stride_;         // pointer step per increment
};
|
||||
|
||||
/**
 * @brief Sorting overload that leaves its argument untouched.
 *
 * Rejects 3D and complex input with a message on std::cerr and an empty
 * CudaMatrix; otherwise makes a deep copy and hands it to the rvalue overload.
 *
 * @param aMatrix   matrix to sort (not modified)
 * @param direction requested sort direction
 * @return whatever the rvalue overload returns for the copy
 */
CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction)
{
    const bool supported = !(aMatrix.getDimSize(2)>1 || aMatrix.isComplex());
    if (supported) {
        // The cast turns the copy into an rvalue so the CudaMatrix&& overload is chosen.
        return sort(static_cast<CudaMatrix &&>(aMatrix.deepCopy()), direction);
    }

    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "sort() not support 3D data!" << std::endl;
    }
    else {
        std::cerr << "sort() not support complex value type!" << std::endl;
    }
    return CudaMatrix();
}
|
||||
|
||||
/**
 * @brief Sorts a real 2D device matrix in ascending order, working on its own buffer.
 *
 * Row sorts every row (elements getDimSize(0) apart in memory); Column sorts
 * every column (getDimSize(0) consecutive elements). Each row or column is
 * sorted by its own thrust::sort call.
 *
 * @param aMatrix   matrix whose data is sorted in place
 * @param direction Row or Column
 * @return aMatrix after sorting, or an empty CudaMatrix (after a message on
 *         std::cerr) for 3D input, complex input or any other direction
 */
CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
{
    if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    float * data = aMatrix.getData();
    const int colElementCount = aMatrix.getDimSize(0);
    switch (direction)
    {
    case Row:
    {
        // Row r starts at data + r; its elements are colElementCount apart,
        // which is exactly the step RowElementIterator takes.
        for (int row = 0; row < colElementCount; ++row)
        {
            thrust::sort(thrust::device, RowElementIterator<float>(data+row,colElementCount),
                         RowElementIterator<float>(data+aMatrix.getDataSize()+row,colElementCount));
        }
        return aMatrix;
    }
    case Column:
    {
        // Each column is a contiguous run of colElementCount floats, so plain
        // device pointers are enough. (Previously this branch returned the
        // data unsorted.)
        const int rowElementCount = aMatrix.getDimSize(1);
        for (int col = 0; col < rowElementCount; ++col)
        {
            float* first = data + (size_t)col * colElementCount;
            thrust::sort(thrust::device, first, first + colElementCount);
        }
        return aMatrix;
    }
    default:
    {
        std::cerr
            << "Unsupported direction for sort!"
            << std::endl;
        return CudaMatrix();
    }
    }
}
|
||||
@@ -14,6 +14,20 @@ namespace Aurora
|
||||
CudaMatrix min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx);
|
||||
CudaMatrix min(const CudaMatrix &aMatrix, const float aValue);
|
||||
CudaMatrix min(const CudaMatrix &aMatrix, const CudaMatrix &aOther);
|
||||
|
||||
/**
 * @brief Sum of a matrix over all elements, per row, or per column.
 *
 * @param aMatrix matrix to reduce
 * @param direction reduction direction; Column when omitted
 * @return CudaMatrix holding the sums
 */
CudaMatrix sum(const CudaMatrix &aMatrix, FunctionDirection direction = Column);
|
||||
|
||||
/**
 * @brief Mean value; note that complex input is not supported.
 *
 * @param aMatrix matrix to process
 * @param direction direction; Column when omitted
 * @return CudaMatrix
 */
|
||||
CudaMatrix mean(const CudaMatrix &aMatrix, FunctionDirection direction = Column);
|
||||
/**
 * @brief Sort overload that works on a deep copy, leaving aMatrix untouched.
 *
 * @param aMatrix matrix to sort; 3D and complex input are rejected
 * @param direction requested sort direction; Column when omitted
 * @return CudaMatrix
 */
CudaMatrix sort(const CudaMatrix &aMatrix,FunctionDirection direction = Column);
|
||||
/**
 * @brief Sort overload that works directly on the data of the given temporary.
 *
 * @param aMatrix matrix to sort; 3D and complex input are rejected
 * @param direction requested sort direction; Column when omitted
 * @return CudaMatrix
 */
CudaMatrix sort(CudaMatrix &&aMatrix,FunctionDirection direction = Column);
|
||||
|
||||
}
|
||||
|
||||
#endif // __FUNCTION2D_CUDA_H__
|
||||
@@ -1,5 +1,6 @@
|
||||
#include <gtest/gtest.h>
|
||||
#include <chrono>
|
||||
#include "AuroraDefs.h"
|
||||
#include "CudaMatrix.h"
|
||||
#include "Function.h"
|
||||
#include "Matrix.h"
|
||||
@@ -30,21 +31,19 @@ protected:
|
||||
|
||||
TEST_F(Function2D_Cuda_Test, min)
|
||||
{
|
||||
// big data for test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
|
||||
{
|
||||
float *dataB = Aurora::random(4096*41472);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 4096, 41472);
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -54,16 +53,11 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret1 = Aurora::min(B, Aurora::FunctionDirection::Row,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret2 = Aurora::min(dB, Aurora::FunctionDirection::Row,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -73,21 +67,20 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// different size speed
|
||||
// Aurora::Matrix Aurora::min(const Aurora::Matrix &aMatrix,
|
||||
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
|
||||
// in col wise
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -98,16 +91,11 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
}
|
||||
B.forceReshape( 111,3157, 1);
|
||||
dB = B.toDeviceMatrix();
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret1 = Aurora::min(B, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret2 = Aurora::min(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -117,6 +105,8 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, float aValue)
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
|
||||
@@ -125,14 +115,10 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, 500.5f);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
auto ret2 = Aurora::min(dB, 500.5f);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -142,6 +128,10 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with same size matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(3157*111);
|
||||
@@ -151,17 +141,11 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
auto dA = A.toDeviceMatrix();
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::min(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -171,6 +155,10 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with col-vec and matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(3157);
|
||||
@@ -180,17 +168,11 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
auto dA = A.toDeviceMatrix();
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::min(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -199,11 +181,9 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
{
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret2 = Aurora::min(dA, dB);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -213,6 +193,10 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with row-vec and matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(111);
|
||||
@@ -225,14 +209,9 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::min(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::min(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -246,21 +225,18 @@ TEST_F(Function2D_Cuda_Test, min)
|
||||
|
||||
TEST_F(Function2D_Cuda_Test, max)
|
||||
{
|
||||
// big data for test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
|
||||
{
|
||||
float *dataB = Aurora::random(4096*41472);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 4096, 41472);
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
auto ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -270,16 +246,10 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
ret1 = Aurora::max(B, Aurora::FunctionDirection::Row,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret2 = Aurora::max(dB, Aurora::FunctionDirection::Row,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -289,21 +259,20 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// different size speed
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// Aurora::FunctionDirection direction, long &rowIdx, long &colIdx)
|
||||
// in col wise
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -314,16 +283,11 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
}
|
||||
B.forceReshape( 111,3157, 1);
|
||||
dB = B.toDeviceMatrix();
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret1 = Aurora::max(B, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
ret2 = Aurora::max(dB, Aurora::FunctionDirection::Column,r,c);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -333,22 +297,18 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix, float aValue)
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::max(B, 500.5f);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::max(dB, 500.5f);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -358,6 +318,10 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with same size matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(3157*111);
|
||||
@@ -370,14 +334,9 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::max(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret2 = Aurora::max(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -387,26 +346,23 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with col-vec and matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(3157);
|
||||
auto A = Aurora::Matrix::fromRawData(dataA, 3157, 1);
|
||||
|
||||
B = Aurora::Matrix::fromRawData(dataB, 3157, 111);
|
||||
auto dA = A.toDeviceMatrix();
|
||||
dB = B.toDeviceMatrix();
|
||||
long r,c;
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::max(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// mat x vec
|
||||
auto ret2 = Aurora::max(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -415,20 +371,20 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
{
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
// vec x mat
|
||||
ret2 = Aurora::max(dA, dB);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
|
||||
for (size_t i = 0; i < ret1.getDataSize(); i++)
|
||||
{
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
}
|
||||
// test
|
||||
// Aurora::Matrix Aurora::max(const Aurora::Matrix &aMatrix,
|
||||
// const Aurora::Matrix &aOther)
|
||||
// with row-vec and matrix
|
||||
{
|
||||
float *dataB = Aurora::random(3157*111);
|
||||
float *dataA = Aurora::random(111);
|
||||
@@ -441,14 +397,20 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
auto start_time_ = std::chrono::high_resolution_clock::now();
|
||||
|
||||
auto ret1 = Aurora::max(B, A);
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test1 duration: " << duration.count() << " ms" << std::endl;
|
||||
start_time_ = std::chrono::high_resolution_clock::now();
|
||||
// mat x vec
|
||||
auto ret2 = Aurora::max(dB, dA);
|
||||
end_time = std::chrono::high_resolution_clock::now();
|
||||
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time_);
|
||||
std::cout << "Test2 duration: " << duration.count() << " ms" << std::endl;
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
|
||||
for (size_t i = 0; i < ret1.getDataSize(); i++)
|
||||
{
|
||||
ASSERT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
|
||||
}
|
||||
//vec x mat
|
||||
ret2 = Aurora::max(dA, dB);
|
||||
|
||||
ASSERT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
|
||||
ASSERT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
|
||||
ASSERT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));
|
||||
@@ -459,3 +421,196 @@ TEST_F(Function2D_Cuda_Test, max)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that Aurora::sum on a device matrix produces the same shape and
// values as Aurora::sum on the host matrix, for the All, Column and Row
// directions, first on real-valued and then on complex-valued input.
TEST_F(Function2D_Cuda_Test, sum)
{
    // Real-valued input: 4096 x 50000 matrix of random floats.
    {
        float *dataB = Aurora::random(4096*50000);

        B = Aurora::Matrix::fromRawData(dataB, 4096, 50000);

        auto dD = B.toDeviceMatrix();

        // Sum over every element.
        auto ret1 = Aurora::sum(B, Aurora::FunctionDirection::All);
        auto ret2 = Aurora::sum(dD, Aurora::FunctionDirection::All);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }

        // Column direction.
        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Column);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Column);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }

        // Row direction.
        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Row);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Row);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }
    }
    // Complex-valued input: 3000 x 2000 complex matrix, two floats per element.
    {
        // One constant for both the allocation and the fill loop so the two
        // can never drift apart.
        const size_t floatCount = static_cast<size_t>(3000) * 2000 * 2;
        float* dataB = new float[floatCount];
        for (size_t i = 0; i < floatCount; i++)
        {
            // Even float slots hold 2.0f, odd float slots hold 1.0f.
            dataB[i] = i%2==0?2.0f:1.0f;
        }

        B = Aurora::Matrix::fromRawData(dataB,3000, 2000,1,Aurora::Complex);

        auto dD = B.toDeviceMatrix();

        // Sum over every element.
        auto ret1 = Aurora::sum(B, Aurora::FunctionDirection::All);
        auto ret2 = Aurora::sum(dD, Aurora::FunctionDirection::All);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        // The result is indexed per float, so a complex result spans
        // getDataSize()*2 floats.
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }

        // Column direction.
        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Column);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Column);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        // Walk all getDataSize()*2 floats, as in the All case above; stopping
        // at getDataSize() would leave the second half of the complex result
        // unverified.
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }

        // Row direction.
        ret1 = Aurora::sum(B, Aurora::FunctionDirection::Row);
        ret2 = Aurora::sum(dD, Aurora::FunctionDirection::Row);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        // Same full-width comparison for the row-wise complex result.
        for (size_t i = 0; i < ret1.getDataSize()*2; i++)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }
    }
}
|
||||
|
||||
// Compares Aurora::mean on the host matrix against Aurora::mean on its device
// copy for the All, Column and Row directions, using a 4096 x 500 matrix whose
// float slots alternate between 1 and 0.
TEST_F(Function2D_Cuda_Test, mean)
{
    {
        // Fill pattern: even float index -> 1, odd float index -> 0.
        float* dataB = new float[4096*500];
        for (size_t idx = 0; idx < 4096*500; ++idx)
        {
            if (idx%2==0)
            {
                dataB[idx] = (float)1;
            }
            else
            {
                dataB[idx] = (float)0;
            }
        }

        B = Aurora::Matrix::fromRawData(dataB, 4096, 500);
        auto dD = B.toDeviceMatrix();

        // --- mean over all elements ---
        auto ret1 = Aurora::mean(B, Aurora::FunctionDirection::All);
        auto ret2 = Aurora::mean(dD, Aurora::FunctionDirection::All);

        for (int dim = 0; dim < 3; ++dim)
        {
            EXPECT_EQ(ret1.getDimSize(dim),ret2.getDimSize(dim));
        }

        for (size_t i = 0; i < ret1.getDataSize(); ++i)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }

        // --- mean in the Column direction ---
        ret1 = Aurora::mean(B, Aurora::FunctionDirection::Column);
        ret2 = Aurora::mean(dD, Aurora::FunctionDirection::Column);

        for (int dim = 0; dim < 3; ++dim)
        {
            EXPECT_EQ(ret1.getDimSize(dim),ret2.getDimSize(dim));
        }

        for (size_t i = 0; i < ret1.getDataSize(); ++i)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }

        // --- mean in the Row direction ---
        ret1 = Aurora::mean(B, Aurora::FunctionDirection::Row);
        ret2 = Aurora::mean(dD, Aurora::FunctionDirection::Row);

        for (int dim = 0; dim < 3; ++dim)
        {
            EXPECT_EQ(ret1.getDimSize(dim),ret2.getDimSize(dim));
        }

        for (size_t i = 0; i < ret1.getDataSize(); ++i)
        {
            EXPECT_FLOAT_AE(ret1[i], ret2.getValue(i))
        }
    }
}
|
||||
|
||||
// Checks that Aurora::sort in the Column direction gives the same shape and
// values on a device matrix as on the host matrix it was copied from.
TEST_F(Function2D_Cuda_Test, sort)
{
    {
        // Generate exactly as many random floats as the 500 x 500 matrix
        // holds; the previous request for 25000000 values was 100x larger
        // than the matrix built from the buffer.
        float* dataB = Aurora::random(500*500);
        B = Aurora::Matrix::fromRawData(dataB, 500, 500);

        auto dD = B.toDeviceMatrix();

        auto ret1 = Aurora::sort(B, Aurora::Column);
        auto ret2 = Aurora::sort(dD,Aurora::Column);

        EXPECT_EQ(ret1.getDimSize(0),ret2.getDimSize(0));
        EXPECT_EQ(ret1.getDimSize(1),ret2.getDimSize(1));
        EXPECT_EQ(ret1.getDimSize(2),ret2.getDimSize(2));

        // Sorting only reorders values, so host and device results are
        // compared element by element with EXPECT_FLOAT_EQ.
        for (size_t i = 0; i < ret1.getDataSize(); i++)
        {
            EXPECT_FLOAT_EQ(ret1[i], ret2.getValue(i))<<", index at :"<<i;
        }
    }
}
|
||||
Reference in New Issue
Block a user