Add CudaMatrix.

This commit is contained in:
sunwen
2023-10-30 10:28:24 +08:00
parent 15c0654c5c
commit e3abe9fabe
6 changed files with 510 additions and 20 deletions

CMakeLists.txt

@@ -3,6 +3,11 @@ project(Aurora)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_INCLUDE_CURRENT_DIR ON)
set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc)
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
find_package (OpenMP REQUIRED)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")
@@ -25,6 +30,14 @@ target_include_directories(Aurora PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_IN
target_link_libraries(Aurora PUBLIC $<LINK_ONLY:MKL::MKL>)
target_link_libraries(Aurora PUBLIC OpenMP::OpenMP_CXX)
target_link_libraries(Aurora PUBLIC matio)
target_include_directories(Aurora PRIVATE ./src /usr/local/cuda/include)
set_target_properties(Aurora PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_compile_options(Aurora PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-arch=sm_75
>)
target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
find_package(GTest REQUIRED)
INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
include_directories(./src/util)
@@ -41,5 +54,12 @@ target_link_libraries(Aurora_Test PUBLIC $<LINK_ONLY:MKL::MKL>)
target_link_libraries(Aurora_Test PUBLIC OpenMP::OpenMP_CXX)
target_link_libraries(Aurora_Test PUBLIC matio)
target_link_libraries(Aurora_Test PUBLIC ${GTEST_BOTH_LIBRARIES} )
target_include_directories(Aurora_Test PRIVATE ./src /usr/local/cuda/include)
set_target_properties(Aurora_Test PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
target_compile_options(Aurora_Test PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
-arch=sm_75
>)
target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart)
gtest_discover_tests(Aurora_Test )
#target_link_libraries(CreateMatchedFilter PRIVATE TBB::tbb)

src/CudaMatrix.cpp (new file)

@@ -0,0 +1,238 @@
#include "CudaMatrix.h"
#include "Function.h"
#include "Matrix.h"
#include <iostream>
#include <cstddef>
#include <cuda_runtime.h>
using namespace Aurora;
CudaMatrix::CudaMatrix(std::shared_ptr<float> aData, std::vector<int> aInfo, ValueType aValueType)
: mValueType(aValueType)
, mData(aData)
, mInfo(aInfo)
{
size_t infoSize = mInfo.size();
for(; infoSize<3; ++infoSize)
{
mInfo.push_back(1);
}
}
bool CudaMatrix::isNull() const
{
return !mData || mInfo.empty();
}
bool CudaMatrix::isNan() const
{
if (!mData) return true;
// mData lives in device memory, so copy it to the host before inspecting it.
size_t count = getDataSize();
std::vector<float> host(count);
cudaMemcpy(host.data(), mData.get(), sizeof(float) * count, cudaMemcpyDeviceToHost);
for(size_t i=0; i<count; ++i)
{
// NaN is the only value that compares unequal to itself,
// so any ordinary value is enough to answer false.
if(host[i] == host[i])
{
return false;
}
}
return true;
}
bool CudaMatrix::isScalar() const
{
return (getDimSize(0) == 1 &&
getDimSize(1) == 1 &&
getDimSize(2) < 2);
}
float CudaMatrix::getScalar() const
{
if (isNull()) return 0.0f;
// The data lives on the device, so copy the single value back to the host.
float value = 0.0f;
cudaMemcpy(&value, mData.get(), sizeof(float), cudaMemcpyDeviceToHost);
return value;
}
bool CudaMatrix::isVector() const
{
if (getDimSize(2)>1) return false;
if (isScalar()) return false;
return getDimSize(0) == 1 ||
getDimSize(1) == 1;
}
int CudaMatrix::getDims() const
{
if(mInfo[2] > 1)
{
return 3;
}
return 2;
}
float *CudaMatrix::getData() const
{
return mData.get();
}
int CudaMatrix::getDimSize(int aIndex) const
{
if (aIndex >= 0 && aIndex < 3) {
return mInfo.at(aIndex);
}
return 0;
}
size_t CudaMatrix::getDataSize() const
{
if (!mData.get())return 0;
size_t ret = 1;
for (auto v: mInfo) {
ret *= v;
}
return ret;
}
void CudaMatrix::forceReshape(int rows, int columns, int slices)
{
mInfo = {rows,columns,slices};
}
bool CudaMatrix::compareShape(const CudaMatrix &other) const
{
if (mInfo[2] == 1 && other.mInfo[2] == 1) {
if (mInfo[0]==1 && other.mInfo[1] == 1 && mInfo[1] == other.mInfo[0]) return true;
if (mInfo[1]==1 && other.mInfo[0] == 1 && mInfo[0] == other.mInfo[1]) return true;
}
for (int i = 0; i < mInfo.size(); ++i) {
if (mInfo[i] != other.mInfo[i]) return false;
}
return true;
}
CudaMatrix CudaMatrix::fromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
{
if (!aData)
{
return CudaMatrix();
}
std::vector<int> vector{aRows, aCols, aSlices};
CudaMatrix ret({aData, gpuFree}, vector, aType);
return ret;
}
CudaMatrix CudaMatrix::copyFromRawData(float *aData, int aRows, int aCols, int aSlices, ValueType aType)
{
if (!aData)
{
return CudaMatrix();
}
float* data = nullptr;
unsigned long long size = aRows * aCols * aSlices * aType;
cudaMalloc((void**)&data, sizeof(float) * size);
cudaMemcpy(data, aData, sizeof(float) * size, cudaMemcpyDeviceToDevice);
std::vector<int> vector{aRows, aCols, aSlices};
return CudaMatrix({data, gpuFree}, vector, aType);
}
CudaMatrix CudaMatrix::deepCopy() const
{
float* data = nullptr;
unsigned long long size = getDataSize() * getValueType();
cudaMalloc((void**)&data, sizeof(float) * size);
cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToDevice);
return CudaMatrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
}
Matrix CudaMatrix::toHostMatrix() const
{
unsigned long long size = getDataSize() * getValueType();
float* data = new float[size];
cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost);
return Matrix::fromRawData(data, getDimSize(0), getDimSize(1), getDimSize(2), getValueType());
}
CudaMatrix CudaMatrix::block(int aDim,int aBeginIndex, int aEndIndex) const
{
if(aDim > 2)
{
std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
return CudaMatrix();
}
if (isVector() && aDim == 0 && getDimSize(1)>1)
{
aDim = 1;
}
if (aBeginIndex>=getDimSize(aDim) || aBeginIndex<0)
{
std::cerr<<"CudaMatrix block BeginIndx error!BeginIndx:"<<aBeginIndex<<std::endl;
return CudaMatrix();
}
if (aEndIndex>=getDimSize(aDim) || aEndIndex<0)
{
std::cerr<<"CudaMatrix block EndIndex error!EndIndex:"<<aEndIndex<<std::endl;
return CudaMatrix();
}
if (aEndIndex < aBeginIndex)
{
std::cerr<<"CudaMatrix block EndIndex can not less than BeginIndex ! BeginIndex:"<<aBeginIndex <<", EndIndex:"<<aEndIndex<<std::endl;
return CudaMatrix();
}
int dimLength = aEndIndex - aBeginIndex + 1;
int dataSize = getDataSize()/getDimSize(aDim)*dimLength;
float * dataOutput = nullptr;
cudaMalloc((void**)&dataOutput, sizeof(float) * dataSize * getValueType());
int colStride = getDimSize(0);
int sliceStride = getDimSize(0)*getDimSize(1);
switch (aDim)
{
case 0:
{
int colStride2 = dimLength;
int sliceStride2 = dimLength*getDimSize(1);
for (size_t i = 0; i < getDimSize(2); i++)
{
for (size_t j = 0; j < getDimSize(1); j++)
{
cudaMemcpy(dataOutput + (colStride2 * j + i * sliceStride2)*getValueType(),
mData.get()+ (aBeginIndex + j * colStride + i * sliceStride)*getValueType(),
sizeof(float) * colStride2*getValueType(), cudaMemcpyDeviceToDevice);
}
}
return CudaMatrix::fromRawData(dataOutput,dimLength,getDimSize(1),getDimSize(2),getValueType());
}
case 1:
{
int colStride2 = getDimSize(0);
int copySize = dimLength*getDimSize(0);
for (size_t i = 0; i < getDimSize(2); i++)
{
cudaMemcpy(dataOutput + getValueType()*(i * copySize),
mData.get() + getValueType()*(aBeginIndex * colStride + i * sliceStride),
sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);
}
return CudaMatrix::fromRawData(dataOutput,getDimSize(0),dimLength,getDimSize(2),getValueType());
}
case 2:
{
int copySize = dimLength*sliceStride;
cudaMemcpy(dataOutput,
mData.get() + aBeginIndex * sliceStride*getValueType(),
sizeof(float) * copySize*getValueType(), cudaMemcpyDeviceToDevice);
return CudaMatrix::fromRawData(dataOutput,getDimSize(0),getDimSize(1),dimLength,getValueType());
}
}
// Reached only for invalid (negative) aDim; return an empty matrix instead of falling off the end.
return CudaMatrix();
}
bool CudaMatrix::setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value)
{
if(aDim>2 )
{
std::cerr<<"CudaMatrix block only support 1D-3D data!"<<std::endl;
return false;
}
}
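The stride arithmetic in block() implies column-major storage: the elements of one column are contiguous, colStride equals the row count and sliceStride equals rows*cols. A minimal host-side sketch of that indexing, useful for checking which elements a given block copies; linearIndex is a hypothetical helper, not part of this commit:

#include <cstddef>

// Column-major layout assumed by CudaMatrix::block():
// element (r, c, s) of a rows x cols x slices matrix lives at
// r + c * rows + s * rows * cols.
static std::size_t linearIndex(std::size_t r, std::size_t c, std::size_t s,
                               std::size_t rows, std::size_t cols)
{
    const std::size_t colStride = rows;          // step to the next column
    const std::size_t sliceStride = rows * cols; // step to the next slice
    return r + c * colStride + s * sliceStride;
}

// Example: block(0, 1, 2) on a 4 x 3 x 1 matrix copies, for each column j,
// the contiguous run of 2 floats starting at linearIndex(1, j, 0, 4, 3),
// which is exactly what the case 0 branch does with one cudaMemcpy per column.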

src/CudaMatrix.h (new file)

@@ -0,0 +1,239 @@
#ifndef CUDAMATRIX_H
#define CUDAMATRIX_H
#include "Matrix.h"
namespace Aurora
{
class CudaMatrix
{
public:
explicit CudaMatrix(std::shared_ptr<float> aData = std::shared_ptr<float>(),
std::vector<int> aInfo = std::vector<int>(),
ValueType aValueType = Normal);
/**
* Create from raw data (a float array).
* The pointer is adopted as the data source; for CudaMatrix it must point to
* device memory, and the shared_ptr's deleter will be gpuFree.
* @param data
* @param rows
* @param cols
* @param slices
* @param type
* @return
*/
static CudaMatrix fromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
/**
* Create from raw data (a float array), copying it into newly allocated device (CUDA) memory.
* @param data
* @param rows
* @param cols
* @param slices
* @param type
* @return
*/
static CudaMatrix copyFromRawData(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
/**
* Deep copy.
* @return the deep-copied CudaMatrix object
*/
CudaMatrix deepCopy() const;
// add
CudaMatrix operator+(float aScalar) const;
friend CudaMatrix operator+(float aScalar, const CudaMatrix &aMatrix);
friend CudaMatrix& operator+(float aScalar, CudaMatrix &&aMatrix);
friend CudaMatrix& operator+(CudaMatrix &&aMatrix,float aScalar);
CudaMatrix operator+(const CudaMatrix &aMatrix) const;
CudaMatrix operator+(CudaMatrix &&aMatrix) const;
friend CudaMatrix operator+(CudaMatrix &&aMatrix,CudaMatrix &aOther);
// sub
CudaMatrix operator-(float aScalar) const;
friend CudaMatrix operator-(float aScalar, const CudaMatrix &aMatrix);
friend CudaMatrix& operator-(float aScalar, CudaMatrix &&aMatrix);
friend CudaMatrix& operator-(CudaMatrix &&aMatrix,float aScalar);
CudaMatrix operator-(const CudaMatrix &aMatrix) const;
CudaMatrix operator-(CudaMatrix &&aMatrix) const;
friend CudaMatrix operator-(CudaMatrix &&aMatrix,CudaMatrix &aOther);
// negative
friend CudaMatrix operator-(CudaMatrix &&aMatrix);
friend CudaMatrix operator-(const CudaMatrix &aMatrix);
// mul
CudaMatrix operator*(float aScalar) const;
friend CudaMatrix operator*(float aScalar, const CudaMatrix &aMatrix);
friend CudaMatrix& operator*(float aScalar, CudaMatrix &&aMatrix);
friend CudaMatrix& operator*(CudaMatrix &&aMatrix,float aScalar);
CudaMatrix operator*(const CudaMatrix &aMatrix) const;
CudaMatrix operator*(CudaMatrix &&aMatrix) const;
friend CudaMatrix operator*(CudaMatrix &&aMatrix,CudaMatrix &aOther);
// div
CudaMatrix operator/(float aScalar) const;
friend CudaMatrix operator/(float aScalar, const CudaMatrix &aMatrix);
friend CudaMatrix& operator/(float aScalar, CudaMatrix &&aMatrix);
friend CudaMatrix& operator/(CudaMatrix &&aMatrix,float aScalar);
CudaMatrix operator/(const CudaMatrix &aMatrix) const;
CudaMatrix operator/(CudaMatrix &&aMatrix) const;
friend CudaMatrix operator/(CudaMatrix &&aMatrix, CudaMatrix &aOther);
// pow
CudaMatrix operator^(int times) const;
friend CudaMatrix operator^(CudaMatrix &&aMatrix,int times);
//compare
CudaMatrix operator>(float aScalar) const;
friend CudaMatrix operator>(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator>(const CudaMatrix &aMatrix) const;
CudaMatrix operator<(float aScalar) const;
friend CudaMatrix operator<(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator<(const CudaMatrix &aMatrix) const;
CudaMatrix operator>=(float aScalar) const;
friend CudaMatrix operator>=(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator>=(const CudaMatrix &aMatrix) const;
CudaMatrix operator<=(float aScalar) const;
friend CudaMatrix operator<=(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator<=(const CudaMatrix &aMatrix) const;
CudaMatrix operator==(float aScalar) const;
friend CudaMatrix operator==(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator==(const CudaMatrix &aMatrix) const;
CudaMatrix operator!=(float aScalar) const;
friend CudaMatrix operator!=(float aScalar, const CudaMatrix &aMatrix);
CudaMatrix operator!=(const CudaMatrix &aMatrix) const;
// subscript
float& operator[](size_t index);
float operator[](size_t index) const;
/**
* Extract a sub-block of the matrix.
*
* @param aDim the dimension to slice along
* @param aBeginIndx begin index, inclusive
* @param aEndIndex end index, inclusive
* @return the resulting CudaMatrix
*/
CudaMatrix block(int aDim,int aBeginIndx, int aEndIndex) const;
bool setBlockValue(int aDim,int aBeginIndx, int aEndIndex,float value);
bool setBlockComplexValue(int aDim,int aBeginIndx, int aEndIndex,std::complex<float> value);
bool setBlock(int aDim,int aBeginIndx, int aEndIndex,const CudaMatrix& src);
/**
* Matrix multiplication.
* @attention currently only matrix-times-vector is supported
* @param aOther
* @return CudaMatrix
*/
CudaMatrix mul(const CudaMatrix& aOther) const;
/**
* Matrix multiplication.
* @attention currently only matrix-times-vector is supported
* @param aOther
* @return CudaMatrix
*/
CudaMatrix mul(CudaMatrix&& aOther) const;
/**
* Print the matrix; only 2-D matrices are supported for now.
*/
void printf();
void printfShape();
bool isScalar() const;
float getScalar() const;
/**
* Returns true if the matrix's data is empty or its size is zero.
* @return
*/
bool isNull() const;
bool isNan() const;
bool isVector() const;
bool isMKLAllocated() const;
/**
* Get dimension count of the matrix
* @return dimension count
*/
int getDims() const;
float *getData() const;
int getDimSize(int index) const;
/**
* Compare matrix shapes.
* @param other the matrix to compare against
* @return true if the shapes match (a row vector and a column vector of the same length also match)
*/
bool compareShape(const CudaMatrix& other) const;
/**
* Get the data size
* @return the number of elements in the matrix
*/
size_t getDataSize() const;
/**
* Get the value type, Normal or Complex.
* Complex uses std::complex<float>, which holds two float values.
* @return
*/
ValueType getValueType() const {
return mValueType;
}
/**
* Set the value type, Normal or Complex.
*/
void setValueType(ValueType aValueType) {
mValueType = aValueType;
}
/**
* Returns true if the value type is Complex.
*/
bool isComplex() const {
return mValueType == Complex;
}
void forceReshape(int rows, int columns, int slices);
Matrix toHostMatrix() const;
private:
ValueType mValueType = Normal;
std::shared_ptr<float> mData;
std::vector<int> mInfo;
};
}
#endif
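Together with the Matrix changes below, the intended workflow appears to be: build a Matrix on the host, move it to the GPU as a CudaMatrix, operate on it there, and copy the result back. A minimal usage sketch under that assumption; the sizes and variable names are illustrative only:

#include "Matrix.h"
#include "CudaMatrix.h"

using namespace Aurora;

void example()
{
    // Hypothetical 4 x 3 host matrix; Matrix::fromRawData adopts new[]-allocated memory.
    float* hostData = new float[12]();
    Matrix host = Matrix::fromRawData(hostData, 4, 3, 1);

    // Upload to the GPU (Matrix::toDeviceMatrix is added by this commit).
    CudaMatrix device = host.toDeviceMatrix();

    // Slice columns 0..1 on the device, then bring the result back to the host.
    CudaMatrix sub = device.block(1, 0, 1);
    Matrix result = sub.toHostMatrix();
}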

MatlabReader.cpp

@@ -92,6 +92,11 @@ Aurora::Matrix MatlabReader::read(const std::string& aFieldName)
std::copy((float*)matvar->data, (float*)matvar->data + matrixSize, matrixData);
break;
}
case MAT_T_DOUBLE:
{
std::copy((double*)matvar->data, (double*)matvar->data + matrixSize, matrixData);
break;
}
}
result = Aurora::Matrix::fromRawData(matrixData,rows,columns,slices);
}

Matrix.cpp

@@ -1,4 +1,5 @@
#include "Matrix.h"
#include "CudaMatrix.h"
#include <cmath>
#include <cstddef>
@@ -393,24 +394,12 @@ namespace Aurora {
return ret;
}
Matrix Matrix::NewCuda(float *data, int rows, int cols, int slices, ValueType type)
CudaMatrix Matrix::toDeviceMatrix() const
{
if (!data) return Matrix();
std::vector<int> vector(3);
vector[0]=rows;
vector[1] = (cols > 0?cols:1);
vector[2] = (slices > 0?slices:1);
Matrix ret({data, gpuFree}, vector);
if (type != Normal)ret.setValueType(type);
ret.mCuda_Allocated = true;
return ret;
}
Matrix Matrix::toHostMatrix() const
{
float* data = new float[getDataSize()];
cudaMemcpy(data, mData.get(), sizeof(float) * getDataSize() * (mValueType == Normal ? 1 : 2), cudaMemcpyDeviceToHost);
return fromRawData(data, mInfo[0], mInfo[1], mInfo[2], getValueType());
float* deviceData = nullptr;
cudaMalloc((void**)&deviceData, sizeof(float) * getDataSize() * getValueType() );
cudaMemcpy(deviceData, mData.get(), sizeof(float) * getDataSize() * getValueType(), cudaMemcpyHostToDevice);
return CudaMatrix::fromRawData(deviceData, mInfo[0], mInfo[1], mInfo[2], getValueType());
}
Matrix Matrix::New(float *data, const Matrix &shapeMatrix) {
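Throughout the new code, buffer sizes are computed as element count times getValueType(); the replaced toHostMatrix spelled the same factor out as (mValueType == Normal ? 1 : 2). That only works if the ValueType enumerators encode floats-per-element, i.e. Normal = 1 and Complex = 2, which is an assumption of this sketch rather than something visible in the diff:

#include "Matrix.h"
#include <cstddef>

// Assumed convention: ValueType doubles as the number of floats per element
// (Normal -> 1 float, Complex -> 2 floats backing a std::complex<float>).
inline std::size_t bufferFloats(std::size_t elementCount, Aurora::ValueType type)
{
    return elementCount * static_cast<std::size_t>(type);
}

// e.g. Matrix::toDeviceMatrix above allocates
// sizeof(float) * bufferFloats(getDataSize(), getValueType()) bytes on the device.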

Matrix.h

@@ -12,6 +12,7 @@ namespace Aurora {
Complex
};
const int $ = -1;
class CudaMatrix;
class Matrix {
public:
@@ -87,7 +88,6 @@ namespace Aurora {
*/
static Matrix New(float *data, const Matrix &shapeMatrix);
static Matrix NewCuda(float *data, int rows, int cols = 1, int slices = 1, ValueType type = Normal);
/**
* New a mkl calculate based Matrix
* @attention Memory are allocate by Aurora:malloc function
@@ -288,7 +288,7 @@ namespace Aurora {
void forceReshape(int rows, int columns, int slices);
Matrix toHostMatrix() const;
CudaMatrix toDeviceMatrix() const;
private:
@@ -296,7 +296,6 @@ namespace Aurora {
std::shared_ptr<float> mData;
std::vector<int> mInfo;
bool mMKL_Allocated = false;
bool mCuda_Allocated = false;
};
}