feat: make cuda version build by USE_CUDA args
This commit is contained in:
@@ -1,8 +1,9 @@
|
||||
#include "AuroraDefs.h"
|
||||
#include "Function1D.cuh"
|
||||
#include <complex>
|
||||
#include <utility>
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "Function1D.cuh"
|
||||
#include "CudaMatrix.h"
|
||||
|
||||
#include "Function.h"
|
||||
|
||||
@@ -35,55 +35,7 @@ namespace {
|
||||
|
||||
uint CONVERT_ADD_VALUE = UINT32_MAX - 4095;
|
||||
|
||||
// Decodes the bit pattern of aValue, viewed as 16-bit lanes, and writes one
// decoded number per lane into des[0..3].
//
// Per lane the code extracts three pieces: the bits from position 11 upward
// masked with CONVERT_AND_VALUE (held in exponentPtr), a flag that is 1 when
// the lane read as a signed short is negative (sign_bit), and the lane masked
// with CONVERT_AND_VALUE_2 (held in fraction3Ptr). These are combined and
// shifted by (exponent - 1) to form an integer that is then stored as float.
//
// NOTE(review): CONVERT_AND_VALUE, CONVERT_AND_VALUE_2 and CONVERT_MUL_VALUE
// are defined outside this view; their values are not confirmed here.
// NOTE(review): where sizeof(float) == 4 the float only holds two 16-bit
// lanes, so indices [2] and [3] below read/write past the local object -
// confirm the intended input width.
inline void convertValue(float aValue ,float* des){
|
||||
// Scratch copy whose lanes are overwritten in place with the shifted, masked bits.
float value = aValue;
|
||||
ushort *exponentPtr = (ushort *)&value;
|
||||
exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
|
||||
exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
|
||||
exponentPtr[2] = (exponentPtr[2] >> 11) & CONVERT_AND_VALUE;
|
||||
exponentPtr[3] = (exponentPtr[3] >> 11) & CONVERT_AND_VALUE;
|
||||
// Second, unmodified copy: the sign is taken from the original lane bits.
float signValue = aValue;
|
||||
short *signPtr = (short *)&signValue;
|
||||
uint sign_bit[4] = {
|
||||
(uint)(signPtr[0] < 0 ? 1 : 0), (uint)(signPtr[1] < 0 ? 1 : 0),
|
||||
(uint)(signPtr[2] < 0 ? 1 : 0), (uint)(signPtr[3] < 0 ? 1 : 0)};
|
||||
// Third copy: each lane is reduced to the bits selected by CONVERT_AND_VALUE_2.
float fraction3Value = aValue;
|
||||
ushort *fraction3Ptr = (ushort *)&fraction3Value;
|
||||
fraction3Ptr[0] &= CONVERT_AND_VALUE_2;
|
||||
fraction3Ptr[1] &= CONVERT_AND_VALUE_2;
|
||||
fraction3Ptr[2] &= CONVERT_AND_VALUE_2;
|
||||
fraction3Ptr[3] &= CONVERT_AND_VALUE_2;
|
||||
// Extra term added per lane: non-zero when (sign set and exponent == 0) or
// (sign clear and exponent != 0), otherwise 0.
// NOTE(review): lanes 1 and 3 use the literal 2048 in the first term while
// lanes 0 and 2 use CONVERT_MUL_VALUE - confirm the two are equal.
uint hidden_bit[4] = {
|
||||
sign_bit[0] * (!exponentPtr[0] ? 1 : 0) * CONVERT_MUL_VALUE +
|
||||
((!sign_bit[0] && exponentPtr[0]) ? 1 : 0) * CONVERT_MUL_VALUE,
|
||||
sign_bit[1] * (!exponentPtr[1] ? 1 : 0) * 2048 +
|
||||
((!sign_bit[1] && exponentPtr[1]) ? 1 : 0) * CONVERT_MUL_VALUE,
|
||||
sign_bit[2] * (!exponentPtr[2] ? 1 : 0) * CONVERT_MUL_VALUE +
|
||||
((!sign_bit[2] && exponentPtr[2]) ? 1 : 0) * CONVERT_MUL_VALUE,
|
||||
sign_bit[3] * (!exponentPtr[3] ? 1 : 0) * 2048 +
|
||||
((!sign_bit[3] && exponentPtr[3]) ? 1 : 0) * CONVERT_MUL_VALUE,
|
||||
};
|
||||
int outputPtr[4] = {0};
|
||||
// Per lane: sum the masked bits, the extra term and (for negative lanes)
// CONVERT_ADD_VALUE, then shift left by (exponent - 1) when exponent > 1,
// otherwise shift right by |exponent - 1| (i.e. by 1 for exponent 0, by 0 for 1).
uint temp = fraction3Ptr[0] + hidden_bit[0] + sign_bit[0] * CONVERT_ADD_VALUE;
|
||||
outputPtr[0] = exponentPtr[0] > 1 ? (temp << (exponentPtr[0] - 1))
|
||||
: (temp >> std::abs(exponentPtr[0] - 1));
|
||||
temp = fraction3Ptr[1] + hidden_bit[1] + sign_bit[1] * CONVERT_ADD_VALUE;
|
||||
outputPtr[1] = exponentPtr[1] > 1 ? (temp << (exponentPtr[1] - 1))
|
||||
: (temp >> std::abs(exponentPtr[1] - 1));
|
||||
temp = fraction3Ptr[2] + hidden_bit[2] + sign_bit[2] * CONVERT_ADD_VALUE;
|
||||
outputPtr[2] = exponentPtr[2] > 1 ? (temp << (exponentPtr[2] - 1))
|
||||
: (temp >> std::abs(exponentPtr[2] - 1));
|
||||
temp = fraction3Ptr[3] + hidden_bit[3] + sign_bit[3] * CONVERT_ADD_VALUE;
|
||||
outputPtr[3] = exponentPtr[3] > 1 ? (temp << (exponentPtr[3] - 1))
|
||||
: (temp >> std::abs(exponentPtr[3] - 1));
|
||||
// The unsigned results were stored into int; they are converted to float here.
des[0] = outputPtr[0];
|
||||
des[1] = outputPtr[1];
|
||||
des[2] = outputPtr[2];
|
||||
des[3] = outputPtr[3];
|
||||
|
||||
}
|
||||
|
||||
inline void convertValue2(short* aValue ,float* des){
|
||||
inline void convertValue(short* aValue ,float* des){
|
||||
ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]};
|
||||
exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE;
|
||||
exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE;
|
||||
@@ -1096,14 +1048,14 @@ Matrix Aurora::convertfp16tofloat(short* aData, int aRows, int aColumns)
|
||||
#pragma omp parallel for
|
||||
for (size_t i = 0; i < quaterSize; i+=8) {
|
||||
//循环展开以避免过度的线程调用
|
||||
if (i < quaterSize)::convertValue2((short*)(input+i*4), output + (i) * 4);
|
||||
if (i+1 < quaterSize)::convertValue2((short*)(input+(i+1)*4), output + (i+1) * 4);
|
||||
if (i+2 < quaterSize)::convertValue2((short*)(input+(i+2)*4), output + (i+2) * 4);
|
||||
if (i+3 < quaterSize)::convertValue2((short*)(input+(i+3)*4), output + (i+3) * 4);
|
||||
if (i+4 < quaterSize)::convertValue2((short*)(input+(i+4)*4), output + (i+4) * 4);
|
||||
if (i+5 < quaterSize)::convertValue2((short*)(input+(i+5)*4), output + (i+5) * 4);
|
||||
if (i+6 < quaterSize)::convertValue2((short*)(input+(i+6)*4), output + (i+6) * 4);
|
||||
if (i+7 < quaterSize)::convertValue2((short*)(input+(i+7)*4), output + (i+7) * 4);
|
||||
if (i < quaterSize)::convertValue((short*)(input+i*4), output + (i) * 4);
|
||||
if (i+1 < quaterSize)::convertValue((short*)(input+(i+1)*4), output + (i+1) * 4);
|
||||
if (i+2 < quaterSize)::convertValue((short*)(input+(i+2)*4), output + (i+2) * 4);
|
||||
if (i+3 < quaterSize)::convertValue((short*)(input+(i+3)*4), output + (i+3) * 4);
|
||||
if (i+4 < quaterSize)::convertValue((short*)(input+(i+4)*4), output + (i+4) * 4);
|
||||
if (i+5 < quaterSize)::convertValue((short*)(input+(i+5)*4), output + (i+5) * 4);
|
||||
if (i+6 < quaterSize)::convertValue((short*)(input+(i+6)*4), output + (i+6) * 4);
|
||||
if (i+7 < quaterSize)::convertValue((short*)(input+(i+7)*4), output + (i+7) * 4);
|
||||
}
|
||||
return Matrix::New(output,aRows,aColumns,1);
|
||||
}
|
||||
|
||||
@@ -124,5 +124,4 @@ namespace Aurora
|
||||
void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op);
|
||||
void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op);
|
||||
}
|
||||
|
||||
#endif //AURORA_CUDA_FUNCTION1D_H
|
||||
@@ -29,7 +29,7 @@
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cusolverDn.h>
|
||||
#include "Function1D.cuh"
|
||||
|
||||
#include "Matrix.h"
|
||||
|
||||
#include "cufft.h"
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
#include <iostream>
|
||||
#include "CudaMatrix.h"
|
||||
|
||||
#include "Function3D.h"
|
||||
#include "Function2D.h"
|
||||
#include "Function.h"
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "CudaMatrix.h"
|
||||
#include "CudaMatrixPrivate.cuh"
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
|
||||
#endif // USE_CUDA
|
||||
|
||||
//必须在Eigen之前
|
||||
#include "AuroraDefs.h"
|
||||
@@ -70,30 +72,10 @@ Matrix Aurora::ones(int aRow, int aColumn, int aSlice) {
|
||||
return Matrix::New(data,rowSize,colSize,aSlice);
|
||||
}
|
||||
|
||||
/**
 * Create a device matrix whose buffer is filled with 1.0f.
 *
 * @param aRow    number of rows; 0 produces a default-constructed CudaMatrix
 * @param aColumn number of columns; 0 produces a default-constructed CudaMatrix
 * @param aSlice  number of slices; 0 is treated as 1
 * @return a CudaMatrix wrapping a buffer obtained from cudaMalloc and passed
 *         through thrustFill with 1.0f, or a default-constructed CudaMatrix
 *         when a dimension is 0 or the device allocation fails
 */
CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){
    if (aRow == 0 || aColumn == 0)
    {
        std::cerr<<"ones function can create matrix with dim unit cont =0";
        return CudaMatrix();
    }
    int rowSize = aRow;
    int colSize = aColumn;
    int sliceSize = aSlice == 0 ? 1 : aSlice;
    // Widen each factor first: multiplying in int overflows for large shapes
    // before the result ever reaches size_t.
    size_t arraySize = static_cast<size_t>(rowSize) * static_cast<size_t>(colSize)
                     * static_cast<size_t>(sliceSize);
    float* data = nullptr;
    // Filling an unallocated buffer would fault on the device, so stop on failure.
    cudaError_t status = cudaMalloc((void**)&data,arraySize*sizeof(float));
    if (status != cudaSuccess)
    {
        std::cerr<<"onesCuda: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    ::thrustFill(data,data+arraySize,1.0f);
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
}
|
||||
|
||||
/**
 * Create a square matrix of ones.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::ones with a slice argument of 0
 */
Matrix Aurora::ones(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::ones(sideLength, sideLength, 0);
}
|
||||
|
||||
/**
 * Create a square device matrix of ones.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::onesCuda with a slice argument of 0
 */
CudaMatrix Aurora::onesCuda(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::onesCuda(sideLength, sideLength, 0);
}
|
||||
|
||||
Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) {
|
||||
if (aRow == 0 || aColumn == 0)
|
||||
{
|
||||
@@ -110,31 +92,10 @@ Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) {
|
||||
return Matrix::New(data,rowSize,colSize,sliceSize);
|
||||
}
|
||||
|
||||
/**
 * Create a device matrix whose buffer is filled with 0.0f.
 *
 * @param aRow    number of rows; 0 produces a default-constructed CudaMatrix
 * @param aColumn number of columns; 0 produces a default-constructed CudaMatrix
 * @param aSlice  number of slices; 0 is treated as 1
 * @return a CudaMatrix wrapping a buffer obtained from cudaMalloc and passed
 *         through thrustFill with 0.0f, or a default-constructed CudaMatrix
 *         when a dimension is 0 or the device allocation fails
 */
CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) {
    if (aRow == 0 || aColumn == 0)
    {
        std::cerr<<"zeros function can create matrix with dim unit cont =0";
        return CudaMatrix();
    }
    int rowSize = aRow;
    int colSize = aColumn;
    int sliceSize = aSlice == 0 ? 1 : aSlice;
    // Widen each factor first: multiplying in int overflows for large shapes
    // before the result ever reaches size_t.
    size_t arraySize = static_cast<size_t>(rowSize) * static_cast<size_t>(colSize)
                     * static_cast<size_t>(sliceSize);
    float* data = nullptr;
    // Filling an unallocated buffer would fault on the device, so stop on failure.
    cudaError_t status = cudaMalloc((void**)&data,arraySize*sizeof(float));
    if (status != cudaSuccess)
    {
        std::cerr<<"zerosCuda: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    ::thrustFill(data,data+arraySize,0.0f);
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
}
|
||||
|
||||
|
||||
/**
 * Create a square matrix of zeros.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::zeros with a slice argument of 0
 */
Matrix Aurora::zeros(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::zeros(sideLength, sideLength, 0);
}
|
||||
|
||||
/**
 * Create a square device matrix of zeros.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::zerosCuda with a slice argument of 0
 */
CudaMatrix Aurora::zerosCuda(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::zerosCuda(sideLength, sideLength, 0);
}
|
||||
|
||||
Matrix Aurora::size(const Matrix &aMatrix)
|
||||
{
|
||||
if (aMatrix.isScalar()){
|
||||
@@ -165,50 +126,11 @@ Matrix Aurora::size(const Matrix &aMatrix)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Report the dimensions of a device matrix as an N x 1 x 1 CudaMatrix.
 *
 * The result holds a single value 1 when aMatrix.isScalar() is true, three
 * values (dimension sizes 0, 1, 2) when the matrix is not a vector and its
 * third dimension is larger than 1, and two values (dimension sizes 0 and 1)
 * in every other case.
 *
 * @param aMatrix matrix whose dimension sizes are queried
 * @return the dimension matrix, or a default-constructed CudaMatrix if the
 *         device allocation fails
 */
CudaMatrix Aurora::size(const CudaMatrix &aMatrix){
    // Decide how many entries the result needs; checks run in the same order
    // as before (scalar, then vector, then third dimension).
    const bool scalar = aMatrix.isScalar();
    int dimCount = 2;
    if (scalar){
        dimCount = 1;
    }
    else if (!aMatrix.isVector() && aMatrix.getDimSize(2)>1){
        dimCount = 3;
    }

    float * output=nullptr;
    // setValue writes into this buffer, so a failed allocation must not go on.
    cudaError_t status = cudaMalloc((void**)&output,sizeof(float)*dimCount);
    if (status != cudaSuccess){
        std::cerr<<"size: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }

    auto outMatrix = CudaMatrix::fromRawData(output,dimCount,1,1);
    if (scalar){
        outMatrix.setValue(0, 1);
        return outMatrix;
    }
    for (int dim = 0; dim < dimCount; ++dim){
        outMatrix.setValue(dim, aMatrix.getDimSize(dim));
    }
    return outMatrix;
}
|
||||
|
||||
int Aurora::size(const Matrix &aMatrix,int dims)
|
||||
{
|
||||
return aMatrix.getDimSize(dims-1);
|
||||
}
|
||||
|
||||
int Aurora::size(const CudaMatrix &aMatrix,int dims)
|
||||
{
|
||||
return aMatrix.getDimSize(dims-1);
|
||||
}
|
||||
|
||||
Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval)
|
||||
{
|
||||
std::vector<Matrix> zTemps;
|
||||
@@ -291,3 +213,85 @@ Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix&
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#if USE_CUDA
|
||||
/**
 * Create a device matrix whose buffer is filled with 1.0f.
 *
 * @param aRow    number of rows; 0 produces a default-constructed CudaMatrix
 * @param aColumn number of columns; 0 produces a default-constructed CudaMatrix
 * @param aSlice  number of slices; 0 is treated as 1
 * @return a CudaMatrix wrapping a buffer obtained from cudaMalloc and passed
 *         through thrustFill with 1.0f, or a default-constructed CudaMatrix
 *         when a dimension is 0 or the device allocation fails
 */
CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){
    if (aRow == 0 || aColumn == 0)
    {
        std::cerr<<"ones function can create matrix with dim unit cont =0";
        return CudaMatrix();
    }
    int rowSize = aRow;
    int colSize = aColumn;
    int sliceSize = aSlice == 0 ? 1 : aSlice;
    // Widen each factor first: multiplying in int overflows for large shapes
    // before the result ever reaches size_t.
    size_t arraySize = static_cast<size_t>(rowSize) * static_cast<size_t>(colSize)
                     * static_cast<size_t>(sliceSize);
    float* data = nullptr;
    // Filling an unallocated buffer would fault on the device, so stop on failure.
    cudaError_t status = cudaMalloc((void**)&data,arraySize*sizeof(float));
    if (status != cudaSuccess)
    {
        std::cerr<<"onesCuda: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    ::thrustFill(data,data+arraySize,1.0f);
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
}
|
||||
|
||||
/**
 * Create a square device matrix of ones.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::onesCuda with a slice argument of 0
 */
CudaMatrix Aurora::onesCuda(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::onesCuda(sideLength, sideLength, 0);
}
|
||||
|
||||
/**
 * Create a device matrix whose buffer is filled with 0.0f.
 *
 * @param aRow    number of rows; 0 produces a default-constructed CudaMatrix
 * @param aColumn number of columns; 0 produces a default-constructed CudaMatrix
 * @param aSlice  number of slices; 0 is treated as 1
 * @return a CudaMatrix wrapping a buffer obtained from cudaMalloc and passed
 *         through thrustFill with 0.0f, or a default-constructed CudaMatrix
 *         when a dimension is 0 or the device allocation fails
 */
CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) {
    if (aRow == 0 || aColumn == 0)
    {
        std::cerr<<"zeros function can create matrix with dim unit cont =0";
        return CudaMatrix();
    }
    int rowSize = aRow;
    int colSize = aColumn;
    int sliceSize = aSlice == 0 ? 1 : aSlice;
    // Widen each factor first: multiplying in int overflows for large shapes
    // before the result ever reaches size_t.
    size_t arraySize = static_cast<size_t>(rowSize) * static_cast<size_t>(colSize)
                     * static_cast<size_t>(sliceSize);
    float* data = nullptr;
    // Filling an unallocated buffer would fault on the device, so stop on failure.
    cudaError_t status = cudaMalloc((void**)&data,arraySize*sizeof(float));
    if (status != cudaSuccess)
    {
        std::cerr<<"zerosCuda: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    ::thrustFill(data,data+arraySize,0.0f);
    return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize);
}
|
||||
|
||||
|
||||
/**
 * Create a square device matrix of zeros.
 *
 * @param aSquareRow side length, used as both the row and the column count
 * @return the result of the three-argument Aurora::zerosCuda with a slice argument of 0
 */
CudaMatrix Aurora::zerosCuda(int aSquareRow) {
    const int sideLength = aSquareRow;
    // 0 is the declared default for the slice argument, spelled out here.
    return Aurora::zerosCuda(sideLength, sideLength, 0);
}
|
||||
|
||||
/**
 * Report the dimensions of a device matrix as an N x 1 x 1 CudaMatrix.
 *
 * The result holds a single value 1 when aMatrix.isScalar() is true, three
 * values (dimension sizes 0, 1, 2) when the matrix is not a vector and its
 * third dimension is larger than 1, and two values (dimension sizes 0 and 1)
 * in every other case.
 *
 * @param aMatrix matrix whose dimension sizes are queried
 * @return the dimension matrix, or a default-constructed CudaMatrix if the
 *         device allocation fails
 */
CudaMatrix Aurora::size(const CudaMatrix &aMatrix){
    // Decide how many entries the result needs; checks run in the same order
    // as before (scalar, then vector, then third dimension).
    const bool scalar = aMatrix.isScalar();
    int dimCount = 2;
    if (scalar){
        dimCount = 1;
    }
    else if (!aMatrix.isVector() && aMatrix.getDimSize(2)>1){
        dimCount = 3;
    }

    float * output=nullptr;
    // setValue writes into this buffer, so a failed allocation must not go on.
    cudaError_t status = cudaMalloc((void**)&output,sizeof(float)*dimCount);
    if (status != cudaSuccess){
        std::cerr<<"size: cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }

    auto outMatrix = CudaMatrix::fromRawData(output,dimCount,1,1);
    if (scalar){
        outMatrix.setValue(0, 1);
        return outMatrix;
    }
    for (int dim = 0; dim < dimCount; ++dim){
        outMatrix.setValue(dim, aMatrix.getDimSize(dim));
    }
    return outMatrix;
}
|
||||
|
||||
int Aurora::size(const CudaMatrix &aMatrix,int dims)
|
||||
{
|
||||
return aMatrix.getDimSize(dims-1);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
|
||||
#include "Matrix.h"
|
||||
#include "Function1D.h"
|
||||
#include "CudaMatrix.h"
|
||||
|
||||
#if USE_CUDA
|
||||
#include "CudaMatrix.h"
|
||||
#endif
|
||||
namespace Aurora {
|
||||
|
||||
/**
|
||||
@@ -17,8 +19,6 @@ namespace Aurora {
|
||||
*/
|
||||
Matrix ones(int aRow, int aColumn, int aSlice = 0);
|
||||
|
||||
CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
|
||||
|
||||
/**
|
||||
* 创建全部为1的方阵
|
||||
* @param aSquareRow
|
||||
@@ -26,8 +26,6 @@ namespace Aurora {
|
||||
*/
|
||||
Matrix ones(int aSquareRow);
|
||||
|
||||
CudaMatrix onesCuda(int aSquareRow);
|
||||
|
||||
/**
|
||||
* 创建全部为0的数组,矩阵
|
||||
* @param aRow 行数,必须大于0
|
||||
@@ -37,25 +35,30 @@ namespace Aurora {
|
||||
*/
|
||||
Matrix zeros(int aRow, int aColumn, int aSlice = 0);
|
||||
|
||||
CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
|
||||
|
||||
/**
|
||||
* 创建全部为0的方阵
|
||||
* @param aSquareRow
|
||||
* @return 全部为0的方阵
|
||||
*/
|
||||
Matrix zeros(int aSquareRow);
|
||||
CudaMatrix zerosCuda(int aSquareRow);
|
||||
Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
|
||||
Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval);
|
||||
Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod);
|
||||
|
||||
Matrix size(const Matrix &aMatrix);
|
||||
CudaMatrix size(const CudaMatrix &aMatrix);
|
||||
|
||||
int size(const Matrix &aMatrix,int dims);
|
||||
int size(const CudaMatrix &aMatrix,int dims);
|
||||
|
||||
|
||||
#if USE_CUDA
|
||||
CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0);
|
||||
CudaMatrix onesCuda(int aSquareRow);
|
||||
|
||||
CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0);
|
||||
CudaMatrix zerosCuda(int aSquareRow);
|
||||
CudaMatrix size(const CudaMatrix &aMatrix);
|
||||
int size(const CudaMatrix &aMatrix,int dims);
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
#include "Matrix.h"
|
||||
#include "CudaMatrix.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <complex>
|
||||
#include <cstddef>
|
||||
@@ -21,6 +19,7 @@
|
||||
#include "Function1D.h"
|
||||
|
||||
#ifdef USE_CUDA
|
||||
#include "CudaMatrix.h"
|
||||
#include <cuda_runtime.h>
|
||||
#endif
|
||||
|
||||
|
||||
@@ -10,7 +10,10 @@
|
||||
namespace Aurora {
|
||||
|
||||
const int $ = -1;
|
||||
|
||||
#if USE_CUDA
|
||||
class CudaMatrix;
|
||||
#endif
|
||||
|
||||
class Matrix {
|
||||
public:
|
||||
@@ -285,9 +288,10 @@ namespace Aurora {
|
||||
}
|
||||
|
||||
void forceReshape(int rows, int columns, int slices);
|
||||
|
||||
|
||||
#if USE_CUDA
|
||||
CudaMatrix toDeviceMatrix() const;
|
||||
|
||||
#endif
|
||||
|
||||
private:
|
||||
ValueType mValueType = Normal;
|
||||
|
||||
@@ -7,15 +7,19 @@
|
||||
#include <complex>
|
||||
|
||||
#include "Matrix.h"
|
||||
#include "CudaMatrix.h"
|
||||
#include "Function.h"
|
||||
#include "Function1D.h"
|
||||
#include "Function2D.h"
|
||||
#include "Function3D.h"
|
||||
#include "MatlabReader.h"
|
||||
|
||||
#if USE_CUDA
|
||||
#include "CudaMatrix.h"
|
||||
#endif //USE_CUDA
|
||||
|
||||
int main()
|
||||
{
|
||||
#if USE_CUDA
|
||||
auto A = Aurora::zeros(1000,1,1);
|
||||
auto B = Aurora::zeros(1000,1,1);
|
||||
for (size_t i = 0; i < 1000; i++)
|
||||
@@ -115,5 +119,6 @@ int main()
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif //USE_CUDA
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user