Merge branch 'dtof' of http://192.168.1.9:3000/Bug/Aurora into dtof

This commit is contained in:
kradchen
2023-11-27 10:19:52 +08:00
4 changed files with 161 additions and 2 deletions

View File

@@ -147,6 +147,10 @@ CudaMatrix CudaMatrix::deepCopy() const
Matrix CudaMatrix::toHostMatrix() const
{
if(!mData.get())
{
return Matrix();
}
unsigned long long size = getDataSize() * getValueType();
float* data = new float[size];
cudaMemcpy(data, mData.get(), sizeof(float) * size, cudaMemcpyDeviceToHost);

View File

@@ -195,6 +195,11 @@ __global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize
CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
{
if(aMatrix.getValueType() == Aurora::Complex)
{
std::cerr<<"sqrt not support complex"<<std::endl;
return CudaMatrix();
}
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
@@ -206,6 +211,11 @@ CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
{
if(aMatrix.getValueType() == Aurora::Complex)
{
std::cerr<<"sqrt not support complex"<<std::endl;
return CudaMatrix();
}
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
@@ -302,3 +312,94 @@ CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
/**
 * Copies one input element per thread into a tiled output buffer.
 *
 * Launch contract (as used by the repmat host functions in this file):
 *   - blockDim = (input dim 0, input dim 1, 1): one thread per input element,
 *     so the whole input must fit in a single thread block.
 *   - gridDim  = (repeats along dim 0, repeats along dim 1, repeats along dim 2):
 *     one block per copy of the input.
 * Both buffers are indexed with dimension 0 varying fastest. No bounds check is
 * performed; the output buffer must hold blockDim * gridDim elements.
 *
 * @param aInputData  device pointer to the input elements
 * @param aOutput     device pointer to the output buffer
 * @param aInputSize  currently unused by the kernel
 * @param aIsComplex  when true every element occupies two consecutive floats
 */
__global__ void repMatKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
{
    // Widen to size_t before multiplying: the flat output index is a product of
    // up to five dimensions and wraps around in 32-bit arithmetic for large outputs.
    const size_t outDim0 = (size_t)blockDim.x * gridDim.x;
    const size_t outDim1 = (size_t)blockDim.y * gridDim.y;

    const size_t idX = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t idY = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t idZ = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    const size_t outIndex = idZ * outDim0 * outDim1 + idY * outDim0 + idX;
    // The position inside the block is the position inside the input matrix.
    const size_t inIndex = (size_t)threadIdx.y * blockDim.x + threadIdx.x;

    if(aIsComplex)
    {
        // Two floats per element, copied as a pair.
        aOutput[2 * outIndex] = aInputData[2 * inIndex];
        aOutput[2 * outIndex + 1] = aInputData[2 * inIndex + 1];
    }
    else
    {
        aOutput[outIndex] = aInputData[inIndex];
    }
}
/**
 * Tiles a matrix of at most two dimensions aRowTimes times along dimension 0
 * and aColumnTimes times along dimension 1.
 *
 * @param aMatrix       input matrix (must not be null, getDims() <= 2)
 * @param aRowTimes     number of copies along dimension 0, must be >= 1
 * @param aColumnTimes  number of copies along dimension 1, must be >= 1
 * @return the tiled matrix, or an empty CudaMatrix when the arguments are
 *         invalid, the device allocation fails, or the kernel cannot be
 *         launched / executed.
 */
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes)
{
    if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
    {
        return CudaMatrix();
    }
    // One thread per input element, one block per copy. The launch is rejected
    // by the runtime when the matrix does not fit in a single thread block.
    dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
    dim3 gridSize(aRowTimes, aColumnTimes, 1);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the synchronisation point.
    status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat kernel failed: "<<cudaGetErrorString(status)<<std::endl;
        // Do not hand uninitialised device memory back to the caller.
        cudaFree(data);
        return CudaMatrix();
    }
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
}
/**
 * Tiles a matrix of at most two dimensions aRowTimes times along dimension 0,
 * aColumnTimes times along dimension 1 and aSliceTimes times along dimension 2.
 *
 * @param aMatrix       input matrix (must not be null, getDims() <= 2)
 * @param aRowTimes     number of copies along dimension 0, must be >= 1
 * @param aColumnTimes  number of copies along dimension 1, must be >= 1
 * @param aSliceTimes   number of copies along dimension 2, must be >= 1
 * @return the tiled matrix, or an empty CudaMatrix when the arguments are
 *         invalid, the device allocation fails, or the kernel cannot be
 *         launched / executed.
 */
CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
{
    // aSliceTimes is validated too: it becomes gridDim.z and scales the
    // allocation, so zero or a negative value yields an invalid launch.
    if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull())
    {
        return CudaMatrix();
    }
    // One thread per input element, one block per copy. The launch is rejected
    // by the runtime when the matrix does not fit in a single thread block.
    dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
    dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the synchronisation point.
    status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat kernel failed: "<<cudaGetErrorString(status)<<std::endl;
        // Do not hand uninitialised device memory back to the caller.
        cudaFree(data);
        return CudaMatrix();
    }
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
}
/**
 * Copies one element of a three-dimensional input per thread into a tiled
 * output buffer.
 *
 * Launch contract (as used by repmat3d in this file):
 *   - blockDim = (input dim 0, input dim 1, input dim 2): one thread per input
 *     element, so the whole input must fit in a single thread block.
 *   - gridDim  = (repeats along dim 0, repeats along dim 1, repeats along dim 2):
 *     one block per copy of the input.
 * Both buffers are indexed with dimension 0 varying fastest. No bounds check is
 * performed; the output buffer must hold blockDim * gridDim elements.
 *
 * @param aInputData  device pointer to the input elements
 * @param aOutput     device pointer to the output buffer
 * @param aInputSize  currently unused by the kernel
 * @param aIsComplex  when true every element occupies two consecutive floats
 */
__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
{
    // Widen to size_t before multiplying: the flat output index is a product of
    // up to five dimensions and wraps around in 32-bit arithmetic for large outputs.
    const size_t outDim0 = (size_t)blockDim.x * gridDim.x;
    const size_t outDim1 = (size_t)blockDim.y * gridDim.y;

    const size_t idX = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t idY = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
    const size_t idZ = (size_t)blockIdx.z * blockDim.z + threadIdx.z;

    const size_t outIndex = idZ * outDim0 * outDim1 + idY * outDim0 + idX;
    // The position inside the block is the position inside the input volume.
    const size_t inIndex = (size_t)threadIdx.z * blockDim.x * blockDim.y
                         + (size_t)threadIdx.y * blockDim.x
                         + threadIdx.x;

    if(aIsComplex)
    {
        // Two floats per element, copied as a pair.
        aOutput[2 * outIndex] = aInputData[2 * inIndex];
        aOutput[2 * outIndex + 1] = aInputData[2 * inIndex + 1];
    }
    else
    {
        aOutput[outIndex] = aInputData[inIndex];
    }
}
/**
 * Tiles a three-dimensional matrix aRowTimes times along dimension 0,
 * aColumnTimes times along dimension 1 and aSliceTimes times along dimension 2.
 *
 * @param aMatrix       input matrix (must not be null, getDims() >= 3)
 * @param aRowTimes     number of copies along dimension 0, must be >= 1
 * @param aColumnTimes  number of copies along dimension 1, must be >= 1
 * @param aSliceTimes   number of copies along dimension 2, must be >= 1
 * @return the tiled matrix, or an empty CudaMatrix when the arguments are
 *         invalid, the device allocation fails, or the kernel cannot be
 *         launched / executed.
 */
CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes)
{
    // aSliceTimes is validated too: it becomes gridDim.z and scales the
    // allocation, so zero or a negative value yields an invalid launch.
    if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull())
    {
        return CudaMatrix();
    }
    // One thread per input element, one block per copy. The launch is rejected
    // by the runtime when the volume does not fit in a single thread block.
    dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
    dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
    float* data = nullptr;
    cudaError_t status = cudaMalloc((void**)&data, sizeof(float) * size);
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat3d cudaMalloc failed: "<<cudaGetErrorString(status)<<std::endl;
        return CudaMatrix();
    }
    repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    // Launch-configuration errors are only visible through cudaGetLastError();
    // execution errors surface at the synchronisation point.
    status = cudaGetLastError();
    if(status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if(status != cudaSuccess)
    {
        std::cerr<<"repmat3d kernel failed: "<<cudaGetErrorString(status)<<std::endl;
        // Do not hand uninitialised device memory back to the caller.
        cudaFree(data);
        return CudaMatrix();
    }
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
}

View File

@@ -39,6 +39,12 @@ namespace Aurora
CudaMatrix sign(const CudaMatrix& aMatrix);
CudaMatrix sign(const CudaMatrix&& aMatrix);
// Tiles aMatrix aRowTimes times along dimension 0 and aColumnTimes times along
// dimension 1. Returns an empty CudaMatrix when a repeat count is < 1, when
// aMatrix.getDims() > 2, or when aMatrix is null.
CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes);
// Same as above, additionally repeating the result aSliceTimes times along
// dimension 2. Returns an empty CudaMatrix when aRowTimes or aColumnTimes is
// < 1, when aMatrix.getDims() > 2, or when aMatrix is null.
CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes);
// Tiling for matrices with three dimensions. Returns an empty CudaMatrix when
// aRowTimes or aColumnTimes is < 1, when aMatrix.getDims() < 3, or when
// aMatrix is null.
CudaMatrix repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes);
}
#endif //AURORA_CUDA_FUNCTION1D_H