1 Commits

Author SHA1 Message Date
kradchen
087b02af88 feat: Add remove cuda option and cmak config support 2024-11-04 10:58:34 +08:00
21 changed files with 1554 additions and 2068 deletions

View File

@@ -8,10 +8,9 @@ set(Aurora_USE_CUDA ON)
if (Aurora_USE_CUDA) if (Aurora_USE_CUDA)
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
set(CUDA_TOOLKIT_ROOT /usr/local/cuda)
enable_language(CUDA) enable_language(CUDA)
find_package(CUDA)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
add_definitions(-DUSE_CUDA) add_definitions(-DUSE_CUDA)
endif(Aurora_USE_CUDA) endif(Aurora_USE_CUDA)
@@ -44,12 +43,12 @@ target_link_libraries(Aurora PUBLIC $<LINK_ONLY:MKL::MKL>)
target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX) target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX)
target_link_libraries(Aurora PUBLIC matio) target_link_libraries(Aurora PUBLIC matio)
if (Aurora_USE_CUDA) if (Aurora_USE_CUDA)
target_include_directories(Aurora PRIVATE ./src ${CUDA_INCLUDE_DIRS}) target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include)
set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-arch=sm_75 --expt-extended-lambda -arch=sm_75 --expt-extended-lambda
>) >)
target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} ${CUDA_cufft_LIBRARY}) target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY}) target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY})
target_link_libraries(Aurora PRIVATE ${CUDA_cusolver_LIBRARY}) target_link_libraries(Aurora PRIVATE ${CUDA_cusolver_LIBRARY})
endif(Aurora_USE_CUDA) endif(Aurora_USE_CUDA)

View File

@@ -1,6 +1,10 @@
set(MKL_INTERFACE_FULL intel_lp64) set(MKL_INTERFACE_FULL intel_lp64)
find_package(OpenMP REQUIRED) find_package(OpenMP REQUIRED)
find_package(MKL CONFIG REQUIRED) find_package(MKL CONFIG REQUIRED)
if(${USE_CUDA})
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
set(Aurora_MAJOR_VERSION 1) set(Aurora_MAJOR_VERSION 1)
set(Aurora_MINOR_VERSION 0) set(Aurora_MINOR_VERSION 0)
@@ -9,11 +13,17 @@ set(Aurora_BUILD_VERSION 0)
get_filename_component(Aurora_DIR "${CMAKE_CURRENT_LIST_DIR}/" PATH) get_filename_component(Aurora_DIR "${CMAKE_CURRENT_LIST_DIR}/" PATH)
message("Aurora_DIR: ${Aurora_DIR}") message("Aurora_DIR: ${Aurora_DIR}")
file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/[AFSCM]*.cpp" "${Aurora_DIR}/src/Matrix*.cpp" "${Aurora_DIR}/src/*.cu") if(${USE_CUDA})
file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" "${Aurora_DIR}/src/*.cu")
set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
else()
set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX )
file(GLOB_RECURSE Aurora_Source "${Aurora_DIR}/src/*.cpp" )
endif()
message( ${Aurora_Source}) message( ${Aurora_Source})
set(Aurora_INCLUDE_DIRS "${Aurora_DIR}/src" "${Aurora_DIR}/thirdparty/include" $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>) set(Aurora_INCLUDE_DIRS "${Aurora_DIR}/src" "${Aurora_DIR}/thirdparty/include" $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
set(Aurora_Complie_Options $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS> ) set(Aurora_Complie_Options $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS> )
set(Aurora_Libraries $<LINK_ONLY:MKL::MKL> OpenMP::OpenMP_CXX ${CUDA_cublas_LIBRARY} ${CUDA_cusolver_LIBRARY})
set(Aurora_FOUND TRUE) set(Aurora_FOUND TRUE)
message(Aurora Found) message(Aurora Found)

View File

@@ -1,9 +1,8 @@
#include "AuroraDefs.h" #include "AuroraDefs.h"
#include "Function1D.cuh"
#include <complex> #include <complex>
#include <utility> #include <utility>
#ifdef USE_CUDA #ifdef USE_CUDA
#include "Function1D.cuh"
#include "CudaMatrix.h" #include "CudaMatrix.h"
#include "Function.h" #include "Function.h"

View File

@@ -5,11 +5,129 @@
#include <thrust/functional.h> #include <thrust/functional.h>
#include <thrust/execution_policy.h> #include <thrust/execution_policy.h>
#include "AuroraDefs.h" #include "AuroraDefs.h"
#include "AuroraThrustIterator.cuh" #include "AuroraThrustIterator.cuh"
using namespace thrust::placeholders; using namespace thrust::placeholders;
struct PowOp: public thrust::unary_function<float, float>{
float exponent;
PowOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return powf(x, exponent);
}
};
struct CompareGOp: public thrust::unary_function<float, float>{
float exponent;
CompareGOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent<x?1.0:.0);
}
};
struct CompareGEOp: public thrust::unary_function<float, float>{
float exponent;
CompareGEOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent<=x?1.0:.0);
}
};
struct CompareEOp: public thrust::unary_function<float, float>{
float exponent;
CompareEOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent==x?1.0:.0);
}
};
struct CompareNEOp: public thrust::unary_function<float, float>{
float exponent;
CompareNEOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent!=x?1.0:.0);
}
};
struct CompareLOp: public thrust::unary_function<float, float>{
float exponent;
CompareLOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent>x?1.0:.0);
}
};
struct CompareLEOp: public thrust::unary_function<float, float>{
float exponent;
CompareLEOp(float v):exponent(v) {}
void setExponent(float v){
exponent = v;
}
__host__ __device__
float operator()(const float& x) {
return (exponent>=x?1.0:.0);
}
};
struct CompareAGOp{
__host__ __device__
float operator()(const float& x,const float& y) {
return x>y?1:0;
}
};
struct CompareAGEOp{
__host__ __device__
float operator()(const float& x,const float& y) {
return x>=y?1:0;
}
};
struct CompareAEOp{
__host__ __device__
float operator()(const float& x,const float& y) {
return x==y?1:0;
}
};
struct CompareANEOp{
__host__ __device__
float operator()(const float& x,const float& y) {
return x!=y?1:0;
}
};
typedef thrust::complex<float> complexf; typedef thrust::complex<float> complexf;
@@ -533,51 +651,29 @@ void unaryPow(float* in1, float N,float* out, unsigned long length){
thrust::transform(thrust::device,in1,in1+length,out,op); thrust::transform(thrust::device,in1,in1+length,out,op);
return; return;
} }
auto lambdaPow = [N] __host__ __device__(float x) { thrust::transform(thrust::device,in1,in1+length,out,PowOp(N));
return powf(x,N);
};
thrust::transform(thrust::device,in1,in1+length,out,lambdaPow);
} }
void unaryCompare(float* in1, const float& in2, float* out, unsigned long length, int type){ void unaryCompare(float* in1, const float& in2, float* out, unsigned long length, int type){
switch (type) switch (type)
{ {
case G: case G:
thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareGOp(in2));
{
return in2 < x ? 1.0 : .0;
});
break; break;
case GE: case GE:
thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareGEOp(in2));
{
return in2 <= x ? 1.0 : .0;
});
break; break;
case E: case E:
thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareEOp(in2));
{
return in2 == x ? 1.0 : .0;
});
break; break;
case NE: case NE:
thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareNEOp(in2));
{
return in2 != x ? 1.0 : .0;
});
break; break;
case LE: case LE:
thrust::transform(thrust::device,in1,in1+length,out,[in2] __host__ __device__ (const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareLEOp(in2));
{
return in2 >= x ? 1.0 : .0;
});
break; break;
case L: case L:
thrust::transform(thrust::device,in1,in1+length,out,[in2]__host__ __device__(const float &x) thrust::transform(thrust::device,in1,in1+length,out,CompareLOp(in2));
{
return in2 > x ? 1.0 : .0;
});
break; break;
default: default:
break; break;
@@ -587,89 +683,51 @@ void unaryCompare(const float& in1, float* in2, float* out, unsigned long length
switch (type) switch (type)
{ {
case G: case G:
thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareLOp(in1));
{
return in1 > x ? 1.0 : .0;
});
break; break;
case GE: case GE:
thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareLEOp(in1));
{
return in1 >= x ? 1.0 : .0;
});
break; break;
case E: case E:
thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareEOp(in1));
{
return in1 == x ? 1.0 : .0;
});
break; break;
case NE: case NE:
thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareNEOp(in1));
{
return in1 != x ? 1.0 : .0;
});
break; break;
case LE: case LE:
thrust::transform(thrust::device,in2,in2+length,out, [in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareGEOp(in1));
{
return in1 <= x ? 1.0 : .0;
});
break; break;
case L: case L:
thrust::transform(thrust::device,in2,in2+length,out,[in1] __host__ __device__ (const float &x) thrust::transform(thrust::device,in2,in2+length,out,CompareGOp(in1));
{
return in1 < x ? 1.0 : .0;
});
break; break;
default: default:
break; break;
} }
} }
void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type){ void unaryCompare(float* in1, float* in2, float* out, unsigned long length, int type){
switch (type) switch (type)
{ {
case G: case G:
thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y) thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGOp());
{
return x > y ? 1. : .0;
});
break; break;
case GE: case GE:
thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y) thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAGEOp());
{
return x >= y ? 1. : .0;
});
break; break;
case E: case E:
thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__(float x, float y) thrust::transform(thrust::device,in1,in1+length,in2,out,CompareAEOp());
{
return x == y ? 1. : .0;
});
break; break;
case NE: case NE:
thrust::transform(thrust::device,in1,in1+length,in2,out, []__host__ __device__(float x, float y) thrust::transform(thrust::device,in1,in1+length,in2,out,CompareANEOp());
{
return x != y ? 1. : .0;
});
break; break;
case LE: case LE:
thrust::transform(thrust::device,in1,in1+length,in2,out,[]__host__ __device__ (float x, float y) thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGEOp());
{
return x <= y ? 1. : .0;
});
break; break;
case L: case L:
thrust::transform(thrust::device,in1,in1+length,in2,out, [] __host__ __device__ (float x, float y) thrust::transform(thrust::device,in2,in2+length,in1,out,CompareAGOp());
{
return x < y ? 1. : .0;
});
break; break;
default: default:
break; break;
} }
} }
void thrustFill(float* aBegin, float* aEnd, float aValue) void thrustFill(float* aBegin, float* aEnd, float aValue)

View File

@@ -35,7 +35,55 @@ namespace {
uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; uint CONVERT_ADD_VALUE = UINT32_MAX - 4095;
inline void convertValue(short* aValue ,float* des){ inline void convertValue(float aValue ,float* des){
float value = aValue;
ushort *exponentPtr = (ushort *)&value;
exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
exponentPtr[2] = (exponentPtr[2] >> 11) & CONVERT_AND_VALUE;
exponentPtr[3] = (exponentPtr[3] >> 11) & CONVERT_AND_VALUE;
float signValue = aValue;
short *signPtr = (short *)&signValue;
uint sign_bit[4] = {
(uint)(signPtr[0] < 0 ? 1 : 0), (uint)(signPtr[1] < 0 ? 1 : 0),
(uint)(signPtr[2] < 0 ? 1 : 0), (uint)(signPtr[3] < 0 ? 1 : 0)};
float fraction3Value = aValue;
ushort *fraction3Ptr = (ushort *)&fraction3Value;
fraction3Ptr[0] &= CONVERT_AND_VALUE_2;
fraction3Ptr[1] &= CONVERT_AND_VALUE_2;
fraction3Ptr[2] &= CONVERT_AND_VALUE_2;
fraction3Ptr[3] &= CONVERT_AND_VALUE_2;
uint hidden_bit[4] = {
sign_bit[0] * (!exponentPtr[0] ? 1 : 0) * CONVERT_MUL_VALUE +
((!sign_bit[0] && exponentPtr[0]) ? 1 : 0) * CONVERT_MUL_VALUE,
sign_bit[1] * (!exponentPtr[1] ? 1 : 0) * 2048 +
((!sign_bit[1] && exponentPtr[1]) ? 1 : 0) * CONVERT_MUL_VALUE,
sign_bit[2] * (!exponentPtr[2] ? 1 : 0) * CONVERT_MUL_VALUE +
((!sign_bit[2] && exponentPtr[2]) ? 1 : 0) * CONVERT_MUL_VALUE,
sign_bit[3] * (!exponentPtr[3] ? 1 : 0) * 2048 +
((!sign_bit[3] && exponentPtr[3]) ? 1 : 0) * CONVERT_MUL_VALUE,
};
int outputPtr[4] = {0};
uint temp = fraction3Ptr[0] + hidden_bit[0] + sign_bit[0] * CONVERT_ADD_VALUE;
outputPtr[0] = exponentPtr[0] > 1 ? (temp << (exponentPtr[0] - 1))
: (temp >> std::abs(exponentPtr[0] - 1));
temp = fraction3Ptr[1] + hidden_bit[1] + sign_bit[1] * CONVERT_ADD_VALUE;
outputPtr[1] = exponentPtr[1] > 1 ? (temp << (exponentPtr[1] - 1))
: (temp >> std::abs(exponentPtr[1] - 1));
temp = fraction3Ptr[2] + hidden_bit[2] + sign_bit[2] * CONVERT_ADD_VALUE;
outputPtr[2] = exponentPtr[2] > 1 ? (temp << (exponentPtr[2] - 1))
: (temp >> std::abs(exponentPtr[2] - 1));
temp = fraction3Ptr[3] + hidden_bit[3] + sign_bit[3] * CONVERT_ADD_VALUE;
outputPtr[3] = exponentPtr[3] > 1 ? (temp << (exponentPtr[3] - 1))
: (temp >> std::abs(exponentPtr[3] - 1));
des[0] = outputPtr[0];
des[1] = outputPtr[1];
des[2] = outputPtr[2];
des[3] = outputPtr[3];
}
inline void convertValue2(short* aValue ,float* des){
ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]}; ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]};
exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE; exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE; exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
@@ -568,9 +616,7 @@ Matrix Aurora::acosd(const Matrix& aMatrix)
{ {
resultData[i] = resultData[i] * 180 / PI; resultData[i] = resultData[i] * 180 / PI;
} }
Matrix result = Matrix::New(resultData, aMatrix); return Matrix::New(resultData, aMatrix);
nantoval(result, 0);
return result;
} }
Matrix Aurora::conj(const Matrix& aMatrix) Matrix Aurora::conj(const Matrix& aMatrix)
@@ -1050,14 +1096,14 @@ Matrix Aurora::convertfp16tofloat(short* aData, int aRows, int aColumns)
#pragma omp parallel for #pragma omp parallel for
for (size_t i = 0; i < quaterSize; i+=8) { for (size_t i = 0; i < quaterSize; i+=8) {
//循环展开以避免过度的线程调用 //循环展开以避免过度的线程调用
if (i < quaterSize)::convertValue((short*)(input+i*4), output + (i) * 4); if (i < quaterSize)::convertValue2((short*)(input+i*4), output + (i) * 4);
if (i+1 < quaterSize)::convertValue((short*)(input+(i+1)*4), output + (i+1) * 4); if (i+1 < quaterSize)::convertValue2((short*)(input+(i+1)*4), output + (i+1) * 4);
if (i+2 < quaterSize)::convertValue((short*)(input+(i+2)*4), output + (i+2) * 4); if (i+2 < quaterSize)::convertValue2((short*)(input+(i+2)*4), output + (i+2) * 4);
if (i+3 < quaterSize)::convertValue((short*)(input+(i+3)*4), output + (i+3) * 4); if (i+3 < quaterSize)::convertValue2((short*)(input+(i+3)*4), output + (i+3) * 4);
if (i+4 < quaterSize)::convertValue((short*)(input+(i+4)*4), output + (i+4) * 4); if (i+4 < quaterSize)::convertValue2((short*)(input+(i+4)*4), output + (i+4) * 4);
if (i+5 < quaterSize)::convertValue((short*)(input+(i+5)*4), output + (i+5) * 4); if (i+5 < quaterSize)::convertValue2((short*)(input+(i+5)*4), output + (i+5) * 4);
if (i+6 < quaterSize)::convertValue((short*)(input+(i+6)*4), output + (i+6) * 4); if (i+6 < quaterSize)::convertValue2((short*)(input+(i+6)*4), output + (i+6) * 4);
if (i+7 < quaterSize)::convertValue((short*)(input+(i+7)*4), output + (i+7) * 4); if (i+7 < quaterSize)::convertValue2((short*)(input+(i+7)*4), output + (i+7) * 4);
} }
return Matrix::New(output,aRows,aColumns,1); return Matrix::New(output,aRows,aColumns,1);
} }

View File

@@ -8,12 +8,6 @@
#include <cstddef> #include <cstddef>
#include <cstdlib> #include <cstdlib>
#include <sys/types.h> #include <sys/types.h>
//this 3 header, only need by cuda version >= 12
#include <thrust/unique.h>
#include <thrust/sort.h>
#include <thrust/set_operations.h>
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include <thrust/iterator/constant_iterator.h> #include <thrust/iterator/constant_iterator.h>
@@ -55,7 +49,6 @@ __global__ void complexKernel(float *aInputRealData, float *aInputImagData, floa
} }
} }
CudaMatrix Aurora::complex(const CudaMatrix& aMatrix) CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
{ {
if(aMatrix.isComplex()) if(aMatrix.isComplex())
@@ -360,8 +353,7 @@ CudaMatrix Aurora::sign(const CudaMatrix &&aMatrix)
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
} }
void Aurora::nantoval(CudaMatrix &aMatrix, float val) void Aurora::nantoval(CudaMatrix& aMatrix,float val){
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return ::isnan(x)?val:x; return ::isnan(x)?val:x;
}; };
@@ -369,8 +361,7 @@ void Aurora::nantoval(CudaMatrix &aMatrix, float val)
aMatrix.getData()+aMatrix.getDataSize(),aMatrix.getData(),lambda); aMatrix.getData()+aMatrix.getDataSize(),aMatrix.getData(),lambda);
} }
CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix) CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix){
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr; float* data = nullptr;
@@ -384,8 +375,7 @@ CudaMatrix Aurora::isnan(const CudaMatrix &aMatrix)
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
} }
CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix) CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix){
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr; float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size); cudaMalloc((void**)&data, sizeof(float) * size);
@@ -398,15 +388,13 @@ CudaMatrix Aurora::isfinite(const CudaMatrix &aMatrix)
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
} }
void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue) void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue){
{
if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex()) if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex())
{ {
std::cerr<<"padding only support real vector"<<std::endl; std::cerr<<"padding only support real vector"<<std::endl;
return; return;
} }
if (aMatrix.getDataSize() > aIndex) if (aMatrix.getDataSize()>aIndex){
{
aMatrix.setValue(aIndex, aValue); aMatrix.setValue(aIndex, aValue);
return; return;
} }
@@ -419,13 +407,12 @@ void Aurora::padding(CudaMatrix &aMatrix, int aIndex, float aValue)
aMatrix=CudaMatrix::fromRawData(data,size,1,1,aMatrix.getValueType()); aMatrix=CudaMatrix::fromRawData(data,size,1,1,aMatrix.getValueType());
} }
CudaMatrix Aurora::auroraNot(const CudaMatrix &aMatrix)
{ CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix){
return auroraNot(std::forward<CudaMatrix&&>(aMatrix.deepCopy())); return auroraNot(std::forward<CudaMatrix&&>(aMatrix.deepCopy()));
} }
CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix) CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix){
{
size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr; float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size); cudaMalloc((void**)&data, sizeof(float) * size);
@@ -438,6 +425,7 @@ CudaMatrix Aurora::auroraNot(CudaMatrix &&aMatrix)
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
} }
void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op) void Aurora::compareSet(CudaMatrix& aValueMatrix,float compareValue, float newValue,CompareOp op)
{ {
switch (op) switch (op)
@@ -450,40 +438,35 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newV
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
break; break;
} }
case NG: case NG:{
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return x<=compareValue?newValue:x; return x<=compareValue?newValue:x;
}; };
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
break; break;
} }
case EQ: case EQ:{
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return x==compareValue?newValue:x; return x==compareValue?newValue:x;
}; };
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
break; break;
} }
case NE: case NE:{
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return x!=compareValue?newValue:x; return x!=compareValue?newValue:x;
}; };
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
break; break;
} }
case NL: case NL:{
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return x>=compareValue?newValue:x; return x>=compareValue?newValue:x;
}; };
thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda); thrust::transform(thrust::device,aValueMatrix.getData(),aValueMatrix.getData()+aValueMatrix.getDataSize(),aValueMatrix.getData(),lambda);
break; break;
} }
case LT: case LT:{
{
auto lambda = [=] __host__ __device__ (const float& x){ auto lambda = [=] __host__ __device__ (const float& x){
return x<compareValue?newValue:x; return x<compareValue?newValue:x;
}; };
@@ -493,10 +476,10 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, float compareValue, float newV
default: default:
break; break;
} }
} }
void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, float compareValue, float newValue, CompareOp op) void Aurora::compareSet(CudaMatrix& aValueMatrix,CudaMatrix& aCompareMatrix,float compareValue, float newValue,CompareOp op){
{
switch (op) switch (op)
{ {
case GT: case GT:
@@ -510,8 +493,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
lambda); lambda);
break; break;
} }
case NG: case NG:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<=compareValue?newValue:y; return x<=compareValue?newValue:y;
}; };
@@ -521,8 +503,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
lambda); lambda);
break; break;
} }
case EQ: case EQ:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x==compareValue?newValue:y; return x==compareValue?newValue:y;
}; };
@@ -532,8 +513,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
lambda); lambda);
break; break;
} }
case NE: case NE:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x!=compareValue?newValue:y; return x!=compareValue?newValue:y;
}; };
@@ -543,19 +523,17 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
lambda); lambda);
break; break;
} }
case NL: case NL:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x>=compareValue?newValue:y; return x>=compareValue?newValue:y;
}; };
thrust::transform(aCompareMatrix.getData(), thrust::transform(thrust::device,aCompareMatrix.getData(),
aCompareMatrix.getData()+aValueMatrix.getDataSize(), aCompareMatrix.getData()+aValueMatrix.getDataSize(),
aValueMatrix.getData(), aValueMatrix.getData(), aValueMatrix.getData(), aValueMatrix.getData(),
lambda); lambda);
break; break;
} }
case LT: case LT:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<compareValue?newValue:y; return x<compareValue?newValue:y;
}; };
@@ -570,8 +548,7 @@ void Aurora::compareSet(CudaMatrix &aValueMatrix, CudaMatrix &aCompareMatrix, fl
} }
} }
void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherCompareMatrix, float newValue, CompareOp op) void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op){
{
switch (op) switch (op)
{ {
case GT: case GT:
@@ -585,8 +562,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
lambda); lambda);
break; break;
} }
case NG: case NG:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<=y?newValue:x; return x<=y?newValue:x;
}; };
@@ -596,8 +572,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
lambda); lambda);
break; break;
} }
case EQ: case EQ:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x==y?newValue:x; return x==y?newValue:x;
}; };
@@ -607,8 +582,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
lambda); lambda);
break; break;
} }
case NE: case NE:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x!=y?newValue:x; return x!=y?newValue:x;
}; };
@@ -618,8 +592,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
lambda); lambda);
break; break;
} }
case NL: case NL:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x>=y?newValue:x; return x>=y?newValue:x;
}; };
@@ -629,8 +602,7 @@ void Aurora::compareSet(CudaMatrix &aDesAndCompareMatrix, CudaMatrix &aOtherComp
lambda); lambda);
break; break;
} }
case LT: case LT:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<y?newValue:x; return x<y?newValue:x;
}; };
@@ -660,8 +632,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
lambda); lambda);
break; break;
} }
case NG: case NG:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<=compareValue?y:x; return x<=compareValue?y:x;
}; };
@@ -671,8 +642,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
lambda); lambda);
break; break;
} }
case EQ: case EQ:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x==compareValue?y:x; return x==compareValue?y:x;
}; };
@@ -682,8 +652,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
lambda); lambda);
break; break;
} }
case NE: case NE:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x!=compareValue?y:x; return x!=compareValue?y:x;
}; };
@@ -693,8 +662,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
lambda); lambda);
break; break;
} }
case NL: case NL:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x>=compareValue?y:x; return x>=compareValue?y:x;
}; };
@@ -704,8 +672,7 @@ void Aurora::compareSet(CudaMatrix &aCompareMatrix, float compareValue, CudaMatr
lambda); lambda);
break; break;
} }
case LT: case LT:{
{
auto lambda = [=] __host__ __device__ (const float& x, const float& y){ auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return x<compareValue?y:x; return x<compareValue?y:x;
}; };
@@ -811,6 +778,7 @@ __global__ void repMat3DKernel(float *aInputData, unsigned int aInputRows, unsig
{ {
aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows]; aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
} }
} }
CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes) CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
@@ -988,17 +956,6 @@ __global__ void conjKernel(float *aInputData, float *aOutput, unsigned int aInpu
} }
} }
__global__ void conjInplaceKernel(float *aInputData, unsigned int aInputSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aInputSize)
{
unsigned int index = idx * 2;
aInputData[index + 1] = -aInputData[index + 1];
}
}
CudaMatrix Aurora::conj(const CudaMatrix& aMatrix) CudaMatrix Aurora::conj(const CudaMatrix& aMatrix)
{ {
if(!aMatrix.isComplex()) if(!aMatrix.isComplex())
@@ -1014,18 +971,6 @@ CudaMatrix Aurora::conj(const CudaMatrix &aMatrix)
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
} }
CudaMatrix Aurora::conj(CudaMatrix &&aMatrix)
{
if (!aMatrix.isComplex())
{
return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}
size_t size = aMatrix.getDataSize();
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
conjInplaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), size);
cudaDeviceSynchronize();
return aMatrix;
}
float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod) float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
{ {
@@ -1129,20 +1074,13 @@ float Aurora::norm(const CudaMatrix &aMatrix, NormMethod aNormMethod)
} }
} }
if (d_S) if (d_S ) cudaFree(d_S);
cudaFree(d_S); if (d_U ) cudaFree(d_U);
if (d_U) if (d_VT) cudaFree(d_VT);
cudaFree(d_U); if (devInfo) cudaFree(devInfo);
if (d_VT) if (d_work) cudaFree(d_work);
cudaFree(d_VT); if (cusolverH) cusolverDnDestroy(cusolverH);
if (devInfo) if (stream) cudaStreamDestroy(stream);
cudaFree(devInfo);
if (d_work)
cudaFree(d_work);
if (cusolverH)
cusolverDnDestroy(cusolverH);
if (stream)
cudaStreamDestroy(stream);
return resultValue; return resultValue;
} }
} }
@@ -1167,6 +1105,7 @@ __global__ void transposeKernel(float *aInputData, float *aOutput, unsigned int
} }
} }
CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix) CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
{ {
//not surpport for 3 dims. //not surpport for 3 dims.
@@ -1252,6 +1191,7 @@ __global__ void vertcatKernel(float *aInputData1, unsigned int aInputData1RowSiz
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize]; aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
} }
} }
} }
CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
@@ -1571,10 +1511,12 @@ CudaMatrix Aurora::createCudaVectorMatrix(float aStartValue, float aStepValue, f
struct compareMatrixByRows struct compareMatrixByRows
{ {
compareMatrixByRows(unsigned int aSize) compareMatrixByRows(unsigned int aSize)
: mSize(aSize) { : mSize(aSize)
{
}; };
unsigned int mSize; unsigned int mSize;
__host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const __host__ __device__
bool operator()(const float* aVector1, const float* aVector2) const
{ {
for(unsigned int i=0; i<mSize; ++i) for(unsigned int i=0; i<mSize; ++i)
{ {
@@ -1594,10 +1536,12 @@ struct compareMatrixByRows
struct uniqueMatrixByRows struct uniqueMatrixByRows
{ {
uniqueMatrixByRows(unsigned int aSize) uniqueMatrixByRows(unsigned int aSize)
: mSize(aSize) { : mSize(aSize)
{
}; };
unsigned int mSize; unsigned int mSize;
__host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const __host__ __device__
bool operator()(const float* aVector1, const float* aVector2) const
{ {
for(unsigned int i=0; i<mSize; ++i) for(unsigned int i=0; i<mSize; ++i)
{ {
@@ -1613,11 +1557,14 @@ struct uniqueMatrixByRows
struct equalMatrixByRows struct equalMatrixByRows
{ {
equalMatrixByRows(unsigned int aSize, float* aValue) equalMatrixByRows(unsigned int aSize, float* aValue)
: mSize(aSize), mValue(aValue) { : mSize(aSize)
, mValue(aValue)
{
}; };
unsigned int mSize; unsigned int mSize;
float* mValue; float* mValue;
__host__ __device__ bool operator()(const float *aVector) const __host__ __device__
bool operator()(const float* aVector) const
{ {
for(unsigned int i=0; i<mSize; ++i) for(unsigned int i=0; i<mSize; ++i)
{ {
@@ -1630,6 +1577,8 @@ struct equalMatrixByRows
} }
}; };
CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexResult) CudaMatrix Aurora::uniqueByRows(const CudaMatrix& aMatrix, CudaMatrix& aIndexResult)
{ {
CudaMatrix transposeMatrix = transpose(aMatrix); CudaMatrix transposeMatrix = transpose(aMatrix);
@@ -1668,14 +1617,12 @@ CudaMatrix Aurora::uniqueByRows(const CudaMatrix &aMatrix, CudaMatrix &aIndexRes
return transpose(CudaMatrix::fromRawData(resultData, rows, columns)); return transpose(CudaMatrix::fromRawData(resultData, rows, columns));
} }
__global__ void convertValueKernel(float *aSrc, float *aDes, unsigned int size) __global__ void convertValueKernel(float* aSrc ,float* aDes, unsigned int size){
{
__shared__ ushort CONVERT_AND_VALUE; __shared__ ushort CONVERT_AND_VALUE;
__shared__ ushort CONVERT_AND_VALUE_2; __shared__ ushort CONVERT_AND_VALUE_2;
__shared__ ushort CONVERT_MUL_VALUE; __shared__ ushort CONVERT_MUL_VALUE;
__shared__ unsigned int CONVERT_ADD_VALUE; __shared__ unsigned int CONVERT_ADD_VALUE;
if (threadIdx.x == 0) if (threadIdx.x == 0){
{
CONVERT_AND_VALUE = 15u; CONVERT_AND_VALUE = 15u;
CONVERT_AND_VALUE_2 = 2047u; CONVERT_AND_VALUE_2 = 2047u;
CONVERT_MUL_VALUE = 2048u; CONVERT_MUL_VALUE = 2048u;
@@ -1683,8 +1630,7 @@ __global__ void convertValueKernel(float *aSrc, float *aDes, unsigned int size)
} }
__syncthreads(); __syncthreads();
unsigned int idx = blockIdx.x*blockDim.x +threadIdx.x; unsigned int idx = blockIdx.x*blockDim.x +threadIdx.x;
if (idx >= size) if (idx >= size) return;
return;
short value = aSrc[idx]; short value = aSrc[idx];
ushort exponent=(ushort)value; ushort exponent=(ushort)value;
exponent = (exponent >> 11) & CONVERT_AND_VALUE; exponent = (exponent >> 11) & CONVERT_AND_VALUE;

View File

@@ -63,8 +63,6 @@ namespace Aurora
CudaMatrix conj(const CudaMatrix& aMatrix); CudaMatrix conj(const CudaMatrix& aMatrix);
CudaMatrix conj(CudaMatrix&& aMatrix);
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod); float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
CudaMatrix transpose(const CudaMatrix& aMatrix); CudaMatrix transpose(const CudaMatrix& aMatrix);
@@ -126,4 +124,5 @@ namespace Aurora
void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op); void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op);
void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op); void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op);
} }
#endif //AURORA_CUDA_FUNCTION1D_H #endif //AURORA_CUDA_FUNCTION1D_H

View File

@@ -1034,87 +1034,3 @@ Matrix Aurora::sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix>
delete [] strides; delete [] strides;
return Matrix::New(output,returnVectorSize,1,1); return Matrix::New(output,returnVectorSize,1,1);
} }
void Aurora::findPeaksHost(const Matrix & aData, int aNPeaks, float aMinPeakHeight, float MinPeakProminece,
int* outIndex)
{
int signalSize = aData.getDimSize(0);
int signalCount = aData.getDimSize(1);
#pragma omp parallel for
for (size_t threadIndex = 0; threadIndex < signalCount; threadIndex++)
{
float* dataPointer = aData.getData() + threadIndex*signalSize ;
float newPeak = dataPointer[0];
float newValley = dataPointer[0];
int peakIndex = 0;
float higherValley;
int indexs[32];
float values[32];
for (size_t i = 0; i < aNPeaks; i++)
{
indexs[i] = signalSize;
values[i] = 0;
}
int save_index=0;
for (int i = 1; i < signalSize - 1; ++i)
{
// find peaks
if (dataPointer[i] > dataPointer[i - 1] && dataPointer[i] > dataPointer[i + 1])
{
newPeak = dataPointer[i];
peakIndex = i;
};
// find valley
if (dataPointer[i] < dataPointer[i - 1] && dataPointer[i] < dataPointer[i + 1])
{
higherValley = std::max(newValley, dataPointer[i]);
newValley = dataPointer[i];
if (newPeak >= aMinPeakHeight)
{
float prominece = newPeak - higherValley;
if (prominece >= MinPeakProminece)
{
if (save_index < aNPeaks)
{
values[save_index] = newPeak;
indexs[save_index] = peakIndex;
save_index++;
}
else
{
for (size_t j = 0; j < aNPeaks; j++)
{
if (values[j] < newPeak)
{
std::swap(values[j], newPeak);
std::swap(indexs[j], peakIndex);
}
}
}
}
}
}
}
if (save_index>=aNPeaks)
{
for (size_t i = 0; i < aNPeaks; i++)
{
for (size_t j = i+1; j < aNPeaks; j++)
{
if (indexs[i]>indexs[j])
{
std::swap(values[j], values[i]);
std::swap(indexs[j], indexs[i]);
}
}
}
}
for (size_t i = 0; i < aNPeaks; i++)
{
// aOutPeaks[threadIndex*aNPeaks+i] = values[i];
outIndex[threadIndex*aNPeaks+i] = indexs[i];
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -64,7 +64,6 @@ namespace Aurora
CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1); CudaMatrix fft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1); CudaMatrix ifft(const CudaMatrix &aMatrix, long aFFTSize = -1);
CudaMatrix ifft(CudaMatrix && aMatrix);
CudaMatrix hilbert(const CudaMatrix &aMatrix); CudaMatrix hilbert(const CudaMatrix &aMatrix);
@@ -87,19 +86,6 @@ namespace Aurora
*/ */
CudaMatrix ifft_symmetric(const CudaMatrix &aMatrix,long aLength); CudaMatrix ifft_symmetric(const CudaMatrix &aMatrix,long aLength);
CudaMatrix valid(const CudaMatrix& aData, const CudaMatrix aValid);
/**
* findPeaks 按列进行峰查找和匹配
* @attention 不要给aOutIndexs提前分配内存
* @param aData 输入
* @param aNpeaks 峰数量
* @param aMinPeakHeight 最小高度
* @param aMinPeakProminence 最小相对高度
* @param aOutIndexs 空指针会在函数内分配device内存
* @return 筛选出的峰高度
*/
CudaMatrix findPeaks(const CudaMatrix& aData, int aNpeaks, float aMinPeakHeight, float aMinPeakProminence, int** aOutIndexs);
} }
#endif // __FUNCTION2D_CUDA_H__ #endif // __FUNCTION2D_CUDA_H__

View File

@@ -107,10 +107,10 @@ namespace Aurora
* 当第一列包含重复的元素时sortrows 会根据下一列中的值进行排序,并对后续的相等值重复此行为。 * 当第一列包含重复的元素时sortrows 会根据下一列中的值进行排序,并对后续的相等值重复此行为。
* @attention 目前不支持三维,不支持复数 * @attention 目前不支持三维,不支持复数
* @param aMatrix 目标矩阵 * @param aMatrix 目标矩阵
* @param indexMatrix 排序后各行的原索引矩阵指针,必须要有 * @param indexMatrix 排序后各行的原索引矩阵指针,必须
* @return 排序后矩阵 * @return 排序后矩阵
*/ */
Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix); Matrix sortrows(const Matrix &aMatrix, Matrix* indexMatrix=nullptr);
/** /**
* 对矩阵求中间值 按列, 目前不支持三维,不支持复数 * 对矩阵求中间值 按列, 目前不支持三维,不支持复数
@@ -178,7 +178,7 @@ namespace Aurora
* @return * @return
*/ */
Matrix sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix> aSliceIdxs); Matrix sub2ind(const Matrix &aVMatrixSize, std::initializer_list<Matrix> aSliceIdxs);
void findPeaksHost(const Matrix & aData, int aNPeaks, float aMinPeakHeight, float MinPeakProminece,int* outIndex);
}; };
#endif // AURORA_FUNCTION2D_H #endif // AURORA_FUNCTION2D_H

View File

@@ -5,10 +5,12 @@
#include "Function.h" #include "Function.h"
#ifdef USE_CUDA #ifdef USE_CUDA
#include "CudaMatrix.h"
#include "CudaMatrixPrivate.cuh"
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif // USE_CUDA #include "CudaMatrixPrivate.cuh"
#include "CudaMatrix.h"
#endif
//必须在Eigen之前 //必须在Eigen之前
#include "AuroraDefs.h" #include "AuroraDefs.h"
@@ -72,6 +74,7 @@ Matrix Aurora::ones(int aRow, int aColumn, int aSlice) {
return Matrix::New(data,rowSize,colSize,aSlice); return Matrix::New(data,rowSize,colSize,aSlice);
} }
Matrix Aurora::ones(int aSquareRow) { Matrix Aurora::ones(int aSquareRow) {
return Aurora::ones(aSquareRow, aSquareRow); return Aurora::ones(aSquareRow, aSquareRow);
} }
@@ -92,6 +95,7 @@ Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) {
return Matrix::New(data,rowSize,colSize,sliceSize); return Matrix::New(data,rowSize,colSize,sliceSize);
} }
Matrix Aurora::zeros(int aSquareRow) { Matrix Aurora::zeros(int aSquareRow) {
return Aurora::zeros(aSquareRow, aSquareRow); return Aurora::zeros(aSquareRow, aSquareRow);
} }
@@ -214,7 +218,7 @@ Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix&
return result; return result;
} }
#if USE_CUDA #ifdef USE_CUDA
CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){ CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){
if (aRow == 0 || aColumn == 0) if (aRow == 0 || aColumn == 0)
{ {
@@ -251,7 +255,6 @@ CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) {
return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize); return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
} }
CudaMatrix Aurora::zerosCuda(int aSquareRow) { CudaMatrix Aurora::zerosCuda(int aSquareRow) {
return Aurora::zerosCuda(aSquareRow, aSquareRow); return Aurora::zerosCuda(aSquareRow, aSquareRow);
} }

View File

@@ -1,82 +0,0 @@
#include "Function3D.cuh"
using namespace Aurora;
__global__ void interp3Kernel(cudaTextureObject_t aTexObj, float* aOutputData, float aStartX, float aDx, float aEndX, float aStartY, float aDy
, float aEndY, float aStartZ, float aDz, float aEndZ, float* aNewX, float* aNewY, float* aNewZ
, int aOutputRowSize, int aOutputColumnSize, int aOutputSliceSize, float aOutValue)
{
int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
int zIndex = blockIdx.z * blockDim.z + threadIdx.z;
if(xIndex > aOutputRowSize - 1 || yIndex > aOutputColumnSize - 1 || zIndex > aOutputSliceSize - 1)
{
return;
}
size_t index = zIndex * aOutputRowSize * aOutputColumnSize + yIndex * aOutputRowSize + xIndex;
float x = aNewX[index];
float y = aNewY[index];
float z = aNewZ[index];
if(x > aEndX || x < aStartX || y > aEndY || y < aStartY || z > aEndZ || z < aStartZ)
{
aOutputData[index] = aOutValue;
}
else
{
aOutputData[index] = tex3D<float>(aTexObj, (x - aStartX) / aDx + 0.5, (y - aStartY) / aDy + 0.5, (z - aStartZ) / aDz + 0.5);
}
}
CudaMatrix Aurora::interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue)
{
cudaTextureObject_t texObj;
size_t dimX = aValue.getDimSize(1);
size_t dimY = aValue.getDimSize(0);
size_t dimZ = aValue.getDimSize(2);
cudaExtent extent = make_cudaExtent(dimX, dimY, dimZ);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
cudaMalloc3DArray(&cuArray, &channelDesc, extent);
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr(aValue.getData(), dimX * sizeof(float), dimX, dimY);
copyParams.dstArray = cuArray;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyDeviceToDevice;
cudaMemcpy3D(&copyParams);
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.filterMode = cudaFilterModeLinear;
texDesc.addressMode[0] = cudaAddressModeClamp;
texDesc.addressMode[1] = cudaAddressModeClamp;
texDesc.addressMode[2] = cudaAddressModeClamp;
texDesc.readMode = cudaReadModeElementType;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
int row = aNewX.getDimSize(0);
int column = aNewX.getDimSize(1);
int slice = aNewX.getDimSize(2);
dim3 blockDim(4,4,4);
dim3 gridDim(row / 4 + 1, column / 4 + 1, slice / 4 + 1);
float *data = nullptr;
cudaMalloc((void **)&data, sizeof(float) * row * column * slice);
CudaMatrix result = Aurora::CudaMatrix::fromRawData(data, row, column, slice);
interp3Kernel<<<gridDim, blockDim>>>(texObj, data, aStartX, aDx, aEndX, aStartY, aDy, aEndY, aStartZ, aDz, aEndZ,
aNewX.getData(), aNewY.getData(), aNewZ.getData(), row, column, slice, aOutValue);
cudaDeviceSynchronize();
return result;
}

View File

@@ -1,13 +0,0 @@
#ifndef __FUNCTION3D_CUDA__
#define __FUNCTION3D_CUDA__
#include "CudaMatrix.h"
#include "AuroraDefs.h"
namespace Aurora
{
CudaMatrix interp3(float aStartX, float aDx, float aEndX, float aStartY, float aDy, float aEndY,
float aStartZ, float aDz, float aEndZ, const CudaMatrix& aValue,
const CudaMatrix& aNewX, const CudaMatrix& aNewY, const CudaMatrix& aNewZ, float aOutValue);
}
#endif // __FUNCTION3D_CUDA_H__

View File

@@ -4,10 +4,8 @@
#include "Matrix.h" #include "Matrix.h"
#include "Function1D.h" #include "Function1D.h"
#if USE_CUDA
#include "CudaMatrix.h" #include "CudaMatrix.h"
#endif
namespace Aurora { namespace Aurora {
/** /**
@@ -19,6 +17,8 @@ namespace Aurora {
*/ */
Matrix ones(int aRow, int aColumn, int aSlice = 0); Matrix ones(int aRow, int aColumn, int aSlice = 0);
CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
/** /**
* 创建全部为1的方阵 * 创建全部为1的方阵
* @param aSquareRow * @param aSquareRow
@@ -26,6 +26,8 @@ namespace Aurora {
*/ */
Matrix ones(int aSquareRow); Matrix ones(int aSquareRow);
CudaMatrix onesCuda(int aSquareRow);
/** /**
* 创建全部为0的数组矩阵 * 创建全部为0的数组矩阵
* @param aRow 行数必须大于0 * @param aRow 行数必须大于0
@@ -35,30 +37,25 @@ namespace Aurora {
*/ */
Matrix zeros(int aRow, int aColumn, int aSlice = 0); Matrix zeros(int aRow, int aColumn, int aSlice = 0);
CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
/** /**
* 创建全部为0的方阵 * 创建全部为0的方阵
* @param aSquareRow * @param aSquareRow
* @return 全部为0的方阵 * @return 全部为0的方阵
*/ */
Matrix zeros(int aSquareRow); Matrix zeros(int aSquareRow);
CudaMatrix zerosCuda(int aSquareRow);
Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod); Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval); Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval);
Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod); Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
Matrix size(const Matrix &aMatrix); Matrix size(const Matrix &aMatrix);
CudaMatrix size(const CudaMatrix &aMatrix);
int size(const Matrix &aMatrix,int dims); int size(const Matrix &aMatrix,int dims);
#if USE_CUDA
CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
CudaMatrix onesCuda(int aSquareRow);
CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
CudaMatrix zerosCuda(int aSquareRow);
CudaMatrix size(const CudaMatrix &aMatrix);
int size(const CudaMatrix &aMatrix,int dims); int size(const CudaMatrix &aMatrix,int dims);
#endif
}; };

View File

@@ -1,4 +1,6 @@
#include "Matrix.h" #include "Matrix.h"
#include <cmath> #include <cmath>
#include <complex> #include <complex>
#include <cstddef> #include <cstddef>

View File

@@ -10,10 +10,7 @@
namespace Aurora { namespace Aurora {
const int $ = -1; const int $ = -1;
#if USE_CUDA
class CudaMatrix; class CudaMatrix;
#endif
class Matrix { class Matrix {
public: public:
@@ -289,9 +286,8 @@ namespace Aurora {
void forceReshape(int rows, int columns, int slices); void forceReshape(int rows, int columns, int slices);
#if USE_CUDA
CudaMatrix toDeviceMatrix() const; CudaMatrix toDeviceMatrix() const;
#endif
private: private:
ValueType mValueType = Normal; ValueType mValueType = Normal;

View File

@@ -7,19 +7,15 @@
#include <complex> #include <complex>
#include "Matrix.h" #include "Matrix.h"
#include "CudaMatrix.h"
#include "Function.h" #include "Function.h"
#include "Function1D.h" #include "Function1D.h"
#include "Function2D.h" #include "Function2D.h"
#include "Function3D.h" #include "Function3D.h"
#include "MatlabReader.h" #include "MatlabReader.h"
#if USE_CUDA
#include "CudaMatrix.h"
#endif //USE_CUDA
int main() int main()
{ {
#if USE_CUDA
auto A = Aurora::zeros(1000,1,1); auto A = Aurora::zeros(1000,1,1);
auto B = Aurora::zeros(1000,1,1); auto B = Aurora::zeros(1000,1,1);
for (size_t i = 0; i < 1000; i++) for (size_t i = 0; i < 1000; i++)
@@ -119,6 +115,5 @@ int main()
} }
} }
} }
#endif //USE_CUDA
return 0; return 0;
} }

View File

@@ -2558,55 +2558,7 @@ TEST_F(CudaMatrix_Test, MatrixCompare){
} }
{ {
auto R= (9!=B); auto R= (9!=B);
auto dhR = (dB!=9).toHostMatrix(); auto dhR = (9!=dB).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9<B);
auto dhR = (dB>9).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9>B);
auto dhR = (dB<9).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9<=B);
auto dhR = (dB>=9).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9>=B);
auto dhR = (dB<=9).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9==B);
auto dhR = (dB == 9).toHostMatrix();
for (size_t i = 0; i < 1000; i++)
{
EXPECT_FLOAT_EQ(R[i],dhR[i]);
}
}
{
auto R= (9!=B);
auto dhR = (dB!=9).toHostMatrix();
for (size_t i = 0; i < 1000; i++) for (size_t i = 0; i < 1000; i++)
{ {
EXPECT_FLOAT_EQ(R[i],dhR[i]); EXPECT_FLOAT_EQ(R[i],dhR[i]);

View File

@@ -5,7 +5,6 @@
#include "Function.h" #include "Function.h"
#include "Matrix.h" #include "Matrix.h"
#include "TestUtility.h" #include "TestUtility.h"
#include "MatlabReader.h"
#include "Function2D.h" #include "Function2D.h"
#include "Function2D.cuh" #include "Function2D.cuh"
@@ -19,15 +18,11 @@ protected:
static void TearDownTestCase(){ static void TearDownTestCase(){
} }
public: public:
Aurora::Matrix mSignal;
Aurora::CudaMatrix dmSignal;
Aurora::Matrix B; Aurora::Matrix B;
Aurora::CudaMatrix dB; Aurora::CudaMatrix dB;
void SetUp(){ void SetUp(){
MatlabReader m("/home/krad/TestData/peaks.mat");
mSignal = m.read("AScan_env_norm");
dmSignal = mSignal.toDeviceMatrix();
} }
void TearDown(){ void TearDown(){
} }
@@ -1002,17 +997,3 @@ TEST_F(Function2D_Cuda_Test, hilbert) {
EXPECT_NEAR(ret1[i], ret2.getValue(i), 0.01); EXPECT_NEAR(ret1[i], ret2.getValue(i), 0.01);
} }
} }
TEST_F(Function2D_Cuda_Test, findPeaks) {
int* indexs = new int[mSignal.getDimSize(1)*10];
auto ret1 = Aurora::findPeaks(dmSignal,10, 0.2, 0.05,indexs);
auto reH = ret1.toHostMatrix();
for(unsigned int i=0; i<10; ++i)
{
printf("%d,",indexs[i]);
}
delete [] indexs;
}

View File

@@ -2,8 +2,6 @@
#include <vector> #include <vector>
#include "TestUtility.h" #include "TestUtility.h"
#include "MatlabReader.h"
#include "Matrix.h" #include "Matrix.h"
#include "Function.h" #include "Function.h"
#include "Function1D.h" #include "Function1D.h"
@@ -18,11 +16,7 @@ protected:
} }
static void TearDownTestCase(){ static void TearDownTestCase(){
} }
public:
Aurora::Matrix mSignal;
void SetUp(){ void SetUp(){
MatlabReader m("/home/krad/TestData/peaks.mat");
mSignal = m.read("AScan_env_norm");
} }
void TearDown(){ void TearDown(){
} }
@@ -579,15 +573,3 @@ TEST_F(Function2D_Test, sub2ind) {
} }
TEST_F(Function2D_Test, findPeaks) {
int* indexs = new int[mSignal.getDimSize(1)*10];
Aurora::findPeaksHost(mSignal,10, 0.2, 0.05,indexs);
for(unsigned int i=0; i<10; ++i)
{
printf("%d,",indexs[i]);
}
delete [] indexs;
}