From 5407c3ccb6c3a22f81c1b9388e3e888eb2449c63 Mon Sep 17 00:00:00 2001 From: kradchen Date: Wed, 18 Dec 2024 17:55:14 +0800 Subject: [PATCH] feat: make cuda version build by USE_CUDA args --- src/CudaMatrix.cpp | 3 +- src/Function1D.cpp | 66 +++--------------- src/Function1D.cuh | 1 - src/Function2D.cu | 2 +- src/Function3D.cpp | 170 +++++++++++++++++++++++---------------------- src/Function3D.h | 23 +++--- src/Matrix.cpp | 3 +- src/Matrix.h | 8 ++- src/main.cxx | 7 +- 9 files changed, 125 insertions(+), 158 deletions(-) diff --git a/src/CudaMatrix.cpp b/src/CudaMatrix.cpp index 3d1cf5c..eebb2af 100644 --- a/src/CudaMatrix.cpp +++ b/src/CudaMatrix.cpp @@ -1,8 +1,9 @@ #include "AuroraDefs.h" -#include "Function1D.cuh" #include #include + #ifdef USE_CUDA +#include "Function1D.cuh" #include "CudaMatrix.h" #include "Function.h" diff --git a/src/Function1D.cpp b/src/Function1D.cpp index 141a59c..c62c573 100644 --- a/src/Function1D.cpp +++ b/src/Function1D.cpp @@ -35,55 +35,7 @@ namespace { uint CONVERT_ADD_VALUE = UINT32_MAX - 4095; - inline void convertValue(float aValue ,float* des){ - float value = aValue; - ushort *exponentPtr = (ushort *)&value; - exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE; - exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE; - exponentPtr[2] = (exponentPtr[2] >> 11) & CONVERT_AND_VALUE; - exponentPtr[3] = (exponentPtr[3] >> 11) & CONVERT_AND_VALUE; - float signValue = aValue; - short *signPtr = (short *)&signValue; - uint sign_bit[4] = { - (uint)(signPtr[0] < 0 ? 1 : 0), (uint)(signPtr[1] < 0 ? 1 : 0), - (uint)(signPtr[2] < 0 ? 1 : 0), (uint)(signPtr[3] < 0 ? 1 : 0)}; - float fraction3Value = aValue; - ushort *fraction3Ptr = (ushort *)&fraction3Value; - fraction3Ptr[0] &= CONVERT_AND_VALUE_2; - fraction3Ptr[1] &= CONVERT_AND_VALUE_2; - fraction3Ptr[2] &= CONVERT_AND_VALUE_2; - fraction3Ptr[3] &= CONVERT_AND_VALUE_2; - uint hidden_bit[4] = { - sign_bit[0] * (!exponentPtr[0] ? 1 : 0) * CONVERT_MUL_VALUE + - ((!sign_bit[0] && exponentPtr[0]) ? 1 : 0) * CONVERT_MUL_VALUE, - sign_bit[1] * (!exponentPtr[1] ? 1 : 0) * 2048 + - ((!sign_bit[1] && exponentPtr[1]) ? 1 : 0) * CONVERT_MUL_VALUE, - sign_bit[2] * (!exponentPtr[2] ? 1 : 0) * CONVERT_MUL_VALUE + - ((!sign_bit[2] && exponentPtr[2]) ? 1 : 0) * CONVERT_MUL_VALUE, - sign_bit[3] * (!exponentPtr[3] ? 1 : 0) * 2048 + - ((!sign_bit[3] && exponentPtr[3]) ? 1 : 0) * CONVERT_MUL_VALUE, - }; - int outputPtr[4] = {0}; - uint temp = fraction3Ptr[0] + hidden_bit[0] + sign_bit[0] * CONVERT_ADD_VALUE; - outputPtr[0] = exponentPtr[0] > 1 ? (temp << (exponentPtr[0] - 1)) - : (temp >> std::abs(exponentPtr[0] - 1)); - temp = fraction3Ptr[1] + hidden_bit[1] + sign_bit[1] * CONVERT_ADD_VALUE; - outputPtr[1] = exponentPtr[1] > 1 ? (temp << (exponentPtr[1] - 1)) - : (temp >> std::abs(exponentPtr[1] - 1)); - temp = fraction3Ptr[2] + hidden_bit[2] + sign_bit[2] * CONVERT_ADD_VALUE; - outputPtr[2] = exponentPtr[2] > 1 ? (temp << (exponentPtr[2] - 1)) - : (temp >> std::abs(exponentPtr[2] - 1)); - temp = fraction3Ptr[3] + hidden_bit[3] + sign_bit[3] * CONVERT_ADD_VALUE; - outputPtr[3] = exponentPtr[3] > 1 ? (temp << (exponentPtr[3] - 1)) - : (temp >> std::abs(exponentPtr[3] - 1)); - des[0] = outputPtr[0]; - des[1] = outputPtr[1]; - des[2] = outputPtr[2]; - des[3] = outputPtr[3]; - - } - - inline void convertValue2(short* aValue ,float* des){ + inline void convertValue(short* aValue ,float* des){ ushort exponentPtr[4] = {(ushort)aValue[0],(ushort)aValue[1],(ushort)aValue[2],(ushort)aValue[3]}; exponentPtr[0] = (exponentPtr[0] >> 11) & CONVERT_AND_VALUE; exponentPtr[1] = (exponentPtr[1] >> 11) & CONVERT_AND_VALUE; @@ -1096,14 +1048,14 @@ Matrix Aurora::convertfp16tofloat(short* aData, int aRows, int aColumns) #pragma omp parallel for for (size_t i = 0; i < quaterSize; i+=8) { //循环展开以避免过度的线程调用 - if (i < quaterSize)::convertValue2((short*)(input+i*4), output + (i) * 4); - if (i+1 < quaterSize)::convertValue2((short*)(input+(i+1)*4), output + (i+1) * 4); - if (i+2 < quaterSize)::convertValue2((short*)(input+(i+2)*4), output + (i+2) * 4); - if (i+3 < quaterSize)::convertValue2((short*)(input+(i+3)*4), output + (i+3) * 4); - if (i+4 < quaterSize)::convertValue2((short*)(input+(i+4)*4), output + (i+4) * 4); - if (i+5 < quaterSize)::convertValue2((short*)(input+(i+5)*4), output + (i+5) * 4); - if (i+6 < quaterSize)::convertValue2((short*)(input+(i+6)*4), output + (i+6) * 4); - if (i+7 < quaterSize)::convertValue2((short*)(input+(i+7)*4), output + (i+7) * 4); + if (i < quaterSize)::convertValue((short*)(input+i*4), output + (i) * 4); + if (i+1 < quaterSize)::convertValue((short*)(input+(i+1)*4), output + (i+1) * 4); + if (i+2 < quaterSize)::convertValue((short*)(input+(i+2)*4), output + (i+2) * 4); + if (i+3 < quaterSize)::convertValue((short*)(input+(i+3)*4), output + (i+3) * 4); + if (i+4 < quaterSize)::convertValue((short*)(input+(i+4)*4), output + (i+4) * 4); + if (i+5 < quaterSize)::convertValue((short*)(input+(i+5)*4), output + (i+5) * 4); + if (i+6 < quaterSize)::convertValue((short*)(input+(i+6)*4), output + (i+6) * 4); + if (i+7 < quaterSize)::convertValue((short*)(input+(i+7)*4), output + (i+7) * 4); } return Matrix::New(output,aRows,aColumns,1); } diff --git a/src/Function1D.cuh b/src/Function1D.cuh index 8ead088..11b608d 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -124,5 +124,4 @@ namespace Aurora void compareSet(CudaMatrix& aDesAndCompareMatrix,CudaMatrix& aOtherCompareMatrix, float newValue,CompareOp op); void compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatrix& aNewValueMatrix,CompareOp op); } - #endif //AURORA_CUDA_FUNCTION1D_H \ No newline at end of file diff --git a/src/Function2D.cu b/src/Function2D.cu index 553904b..d467406 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -29,7 +29,7 @@ #include #include #include -#include "Function1D.cuh" + #include "Matrix.h" #include "cufft.h" diff --git a/src/Function3D.cpp b/src/Function3D.cpp index 4de3e06..cded3d9 100644 --- a/src/Function3D.cpp +++ b/src/Function3D.cpp @@ -1,12 +1,14 @@ #include -#include "CudaMatrix.h" + #include "Function3D.h" #include "Function2D.h" #include "Function.h" + +#ifdef USE_CUDA +#include "CudaMatrix.h" #include "CudaMatrixPrivate.cuh" #include - - +#endif // USE_CUDA //必须在Eigen之前 #include "AuroraDefs.h" @@ -70,30 +72,10 @@ Matrix Aurora::ones(int aRow, int aColumn, int aSlice) { return Matrix::New(data,rowSize,colSize,aSlice); } -CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){ - if (aRow == 0 || aColumn == 0) - { - std::cerr<<"ones function can create matrix with dim unit cont =0"; - return CudaMatrix(); - } - int rowSize = aRow; - int colSize = aColumn; - int sliceSize = aSlice == 0 ? 1 : aSlice; - size_t arraySize = rowSize * colSize* sliceSize; - float* data = nullptr; - cudaMalloc((void**)&data,arraySize*sizeof(float)); - ::thrustFill(data,data+arraySize,1.0f); - return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize); -} - Matrix Aurora::ones(int aSquareRow) { return Aurora::ones(aSquareRow, aSquareRow); } -CudaMatrix Aurora::onesCuda(int aSquareRow) { - return Aurora::onesCuda(aSquareRow, aSquareRow); -} - Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) { if (aRow == 0 || aColumn == 0) { @@ -110,31 +92,10 @@ Matrix Aurora::zeros(int aRow, int aColumn, int aSlice) { return Matrix::New(data,rowSize,colSize,sliceSize); } -CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) { - if (aRow == 0 || aColumn == 0) - { - std::cerr<<"zeros function can create matrix with dim unit cont =0"; - return CudaMatrix(); - } - int rowSize = aRow; - int colSize = aColumn; - int sliceSize = aSlice == 0 ? 1 : aSlice; - size_t arraySize = rowSize * colSize* sliceSize; - float* data = nullptr; - cudaMalloc((void**)&data,arraySize*sizeof(float)); - ::thrustFill(data,data+arraySize,0.0f); - return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize); -} - - Matrix Aurora::zeros(int aSquareRow) { return Aurora::zeros(aSquareRow, aSquareRow); } -CudaMatrix Aurora::zerosCuda(int aSquareRow) { - return Aurora::zerosCuda(aSquareRow, aSquareRow); -} - Matrix Aurora::size(const Matrix &aMatrix) { if (aMatrix.isScalar()){ @@ -165,50 +126,11 @@ Matrix Aurora::size(const Matrix &aMatrix) } } -CudaMatrix Aurora::size(const CudaMatrix &aMatrix){ - float * output=nullptr; - if (aMatrix.isScalar()){ - cudaMalloc((void**)&output,sizeof(float)); - auto outMatrix = CudaMatrix::fromRawData(output,1,1,1); - outMatrix.setValue(0, 1); - return outMatrix; - } - else if (aMatrix.isVector()){ - cudaMalloc((void**)&output,sizeof(float)*2); - auto outMatrix = CudaMatrix::fromRawData(output,2,1,1); - outMatrix.setValue(0, aMatrix.getDimSize(0)); - outMatrix.setValue(1, aMatrix.getDimSize(1)); - return outMatrix; - } - //3D - else if (aMatrix.getDimSize(2)>1){ - cudaMalloc((void**)&output,sizeof(float)*3); - auto outMatrix = CudaMatrix::fromRawData(output,3,1,1); - outMatrix.setValue(0,aMatrix.getDimSize(0)); - outMatrix.setValue(1,aMatrix.getDimSize(1)); - outMatrix.setValue(2,aMatrix.getDimSize(2)); - return outMatrix; - } - //2D matrix - else{ - cudaMalloc((void**)&output,sizeof(float)*2); - auto outMatrix = CudaMatrix::fromRawData(output,2,1,1); - outMatrix.setValue(0,aMatrix.getDimSize(0)); - outMatrix.setValue(1,aMatrix.getDimSize(1)); - return outMatrix; - } -} - int Aurora::size(const Matrix &aMatrix,int dims) { return aMatrix.getDimSize(dims-1); } -int Aurora::size(const CudaMatrix &aMatrix,int dims) -{ - return aMatrix.getDimSize(dims-1); -} - Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval) { std::vector zTemps; @@ -291,3 +213,85 @@ Matrix Aurora::meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& return result; } + +#if USE_CUDA +CudaMatrix Aurora::onesCuda(int aRow, int aColumn, int aSlice){ + if (aRow == 0 || aColumn == 0) + { + std::cerr<<"ones function can create matrix with dim unit cont =0"; + return CudaMatrix(); + } + int rowSize = aRow; + int colSize = aColumn; + int sliceSize = aSlice == 0 ? 1 : aSlice; + size_t arraySize = rowSize * colSize* sliceSize; + float* data = nullptr; + cudaMalloc((void**)&data,arraySize*sizeof(float)); + ::thrustFill(data,data+arraySize,1.0f); + return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize); +} + +CudaMatrix Aurora::onesCuda(int aSquareRow) { + return Aurora::onesCuda(aSquareRow, aSquareRow); +} + +CudaMatrix Aurora::zerosCuda(int aRow, int aColumn, int aSlice) { + if (aRow == 0 || aColumn == 0) + { + std::cerr<<"zeros function can create matrix with dim unit cont =0"; + return CudaMatrix(); + } + int rowSize = aRow; + int colSize = aColumn; + int sliceSize = aSlice == 0 ? 1 : aSlice; + size_t arraySize = rowSize * colSize* sliceSize; + float* data = nullptr; + cudaMalloc((void**)&data,arraySize*sizeof(float)); + ::thrustFill(data,data+arraySize,0.0f); + return CudaMatrix::fromRawData(data,rowSize,colSize,sliceSize); +} + + +CudaMatrix Aurora::zerosCuda(int aSquareRow) { + return Aurora::zerosCuda(aSquareRow, aSquareRow); +} + +CudaMatrix Aurora::size(const CudaMatrix &aMatrix){ + float * output=nullptr; + if (aMatrix.isScalar()){ + cudaMalloc((void**)&output,sizeof(float)); + auto outMatrix = CudaMatrix::fromRawData(output,1,1,1); + outMatrix.setValue(0, 1); + return outMatrix; + } + else if (aMatrix.isVector()){ + cudaMalloc((void**)&output,sizeof(float)*2); + auto outMatrix = CudaMatrix::fromRawData(output,2,1,1); + outMatrix.setValue(0, aMatrix.getDimSize(0)); + outMatrix.setValue(1, aMatrix.getDimSize(1)); + return outMatrix; + } + //3D + else if (aMatrix.getDimSize(2)>1){ + cudaMalloc((void**)&output,sizeof(float)*3); + auto outMatrix = CudaMatrix::fromRawData(output,3,1,1); + outMatrix.setValue(0,aMatrix.getDimSize(0)); + outMatrix.setValue(1,aMatrix.getDimSize(1)); + outMatrix.setValue(2,aMatrix.getDimSize(2)); + return outMatrix; + } + //2D matrix + else{ + cudaMalloc((void**)&output,sizeof(float)*2); + auto outMatrix = CudaMatrix::fromRawData(output,2,1,1); + outMatrix.setValue(0,aMatrix.getDimSize(0)); + outMatrix.setValue(1,aMatrix.getDimSize(1)); + return outMatrix; + } +} + +int Aurora::size(const CudaMatrix &aMatrix,int dims) +{ + return aMatrix.getDimSize(dims-1); +} +#endif diff --git a/src/Function3D.h b/src/Function3D.h index 901bdb5..bd759ed 100644 --- a/src/Function3D.h +++ b/src/Function3D.h @@ -4,8 +4,10 @@ #include "Matrix.h" #include "Function1D.h" -#include "CudaMatrix.h" +#if USE_CUDA +#include "CudaMatrix.h" +#endif namespace Aurora { /** @@ -17,8 +19,6 @@ namespace Aurora { */ Matrix ones(int aRow, int aColumn, int aSlice = 0); - CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0); - /** * 创建全部为1的方阵 * @param aSquareRow @@ -26,8 +26,6 @@ namespace Aurora { */ Matrix ones(int aSquareRow); - CudaMatrix onesCuda(int aSquareRow); - /** * 创建全部为0的数组,矩阵 * @param aRow 行数,必须大于0 @@ -37,25 +35,30 @@ namespace Aurora { */ Matrix zeros(int aRow, int aColumn, int aSlice = 0); - CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0); - /** * 创建全部为0的方阵 * @param aSquareRow * @return 全部为0的方阵 */ Matrix zeros(int aSquareRow); - CudaMatrix zerosCuda(int aSquareRow); Matrix interp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod); Matrix meshgridInterp3(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod, float aExtrapval); Matrix interpn(const Matrix& aX, const Matrix& aY, const Matrix& aZ, const Matrix& aV, const Matrix& aX1, const Matrix& aY1, const Matrix& aZ1,InterpnMethod aMethod); Matrix size(const Matrix &aMatrix); - CudaMatrix size(const CudaMatrix &aMatrix); int size(const Matrix &aMatrix,int dims); - int size(const CudaMatrix &aMatrix,int dims); + + #if USE_CUDA + CudaMatrix onesCuda(int aRow, int aColumn, int aSlice = 0); + CudaMatrix onesCuda(int aSquareRow); + + CudaMatrix zerosCuda(int aRow, int aColumn, int aSlice = 0); + CudaMatrix zerosCuda(int aSquareRow); + CudaMatrix size(const CudaMatrix &aMatrix); + int size(const CudaMatrix &aMatrix,int dims); + #endif }; diff --git a/src/Matrix.cpp b/src/Matrix.cpp index 475cd9d..d2d0d06 100644 --- a/src/Matrix.cpp +++ b/src/Matrix.cpp @@ -1,6 +1,4 @@ #include "Matrix.h" -#include "CudaMatrix.h" - #include #include #include @@ -21,6 +19,7 @@ #include "Function1D.h" #ifdef USE_CUDA +#include "CudaMatrix.h" #include #endif diff --git a/src/Matrix.h b/src/Matrix.h index b0e4a52..b828fe3 100644 --- a/src/Matrix.h +++ b/src/Matrix.h @@ -10,7 +10,10 @@ namespace Aurora { const int $ = -1; + + #if USE_CUDA class CudaMatrix; + #endif class Matrix { public: @@ -285,9 +288,10 @@ namespace Aurora { } void forceReshape(int rows, int columns, int slices); - + + #if USE_CUDA CudaMatrix toDeviceMatrix() const; - + #endif private: ValueType mValueType = Normal; diff --git a/src/main.cxx b/src/main.cxx index 7f602a3..9687545 100644 --- a/src/main.cxx +++ b/src/main.cxx @@ -7,15 +7,19 @@ #include #include "Matrix.h" -#include "CudaMatrix.h" #include "Function.h" #include "Function1D.h" #include "Function2D.h" #include "Function3D.h" #include "MatlabReader.h" +#if USE_CUDA +#include "CudaMatrix.h" +#endif //USE_CUDA + int main() { + #if USE_CUDA auto A = Aurora::zeros(1000,1,1); auto B = Aurora::zeros(1000,1,1); for (size_t i = 0; i < 1000; i++) @@ -115,5 +119,6 @@ int main() } } } + #endif //USE_CUDA return 0; }