#include "CudaMatrix.h" #include "AuroraDefs.h" #include "Function1D.cuh" #include "Function1D.h" #include "Matrix.h" #include #include #include #include #include #include #include #include #include #include using namespace Aurora; using namespace thrust::placeholders; namespace { const int THREADS_PER_BLOCK = 256; const int THREADS_PER_BLOCK_DIM2_X = 16; const int THREADS_PER_BLOCK_DIM2_Y = 16; const int THREADS_PER_BLOCK_DIM3_X = 8; const int THREADS_PER_BLOCK_DIM3_Y = 8; const int THREADS_PER_BLOCK_DIM3_Z = 8; } __global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[2*idx] = aInputData[idx]; aOutput[2*idx + 1] = 0; } } CudaMatrix Aurora::complex(const CudaMatrix& aMatrix) { if(aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; complexKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex); } __global__ void realKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = aInputData[idx*2]; } } CudaMatrix Aurora::real(const CudaMatrix& aMatrix) { if(!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; realKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } __global__ void imageKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = aInputData[idx*2 + 1]; } } CudaMatrix Aurora::imag(const CudaMatrix& aMatrix) { if(!aMatrix.isComplex()) { return CudaMatrix(); } size_t size = aMatrix.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize()); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; imageKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal); } __global__ void ceilKernel(float* aInputData, float* aOutput, unsigned int aSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aSize) { aOutput[idx] = std::ceil(aInputData[idx]); } } CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; ceilKernel<<>>(aMatrix.getData(), data, size); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * 
CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void roundKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = roundf(aInputData[idx]);
    }
}

CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void floorKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = floorf(aInputData[idx]);
    }
}

CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void sqrtKernel(float* aInputData, float* aOutput, unsigned int aSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        aOutput[idx] = sqrtf(aInputData[idx]);
    }
}

CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
{
    if(aMatrix.getValueType() == Aurora::Complex) {
        std::cerr << "sqrt does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
{
    if(aMatrix.getValueType() == Aurora::Complex) {
        std::cerr << "sqrt does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
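// None of the cudaMalloc calls or kernel launches in this file check for
// errors. A minimal helper sketch (hypothetical; checkCudaError is not part of
// the original API) that could wrap these calls:
namespace {
inline void checkCudaError(cudaError_t aStatus, const char* aWhat)
{
    if (aStatus != cudaSuccess) {
        // cudaGetErrorString turns the status code into a readable message.
        std::cerr << aWhat << " failed: " << cudaGetErrorString(aStatus) << std::endl;
    }
}
} // namespace
// Usage sketch: checkCudaError(cudaMalloc((void**)&data, bytes), "cudaMalloc");
// and checkCudaError(cudaGetLastError(), "kernel launch") after a <<<...>>> launch.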
__global__ void absKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        if(aIsComplex) {
            // |a + bi| = sqrt(a^2 + b^2)
            aOutput[idx] = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
        } else {
            aOutput[idx] = fabsf(aInputData[idx]);
        }
    }
}

CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}

CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}

__global__ void signKernel(float* aInputData, float* aOutput, unsigned int aSize, bool aIsComplex)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aSize) {
        if(aIsComplex) {
            // sign(z) = z / |z|, the unit complex number with the same phase.
            float absValue = sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx + 1] * aInputData[2*idx + 1]);
            aOutput[2*idx] = aInputData[2*idx] / absValue;
            aOutput[2*idx + 1] = aInputData[2*idx + 1] / absValue;
            return;
        }
        if(aInputData[idx] < 0) {
            aOutput[idx] = -1;
        } else if(aInputData[idx] > 0) {
            aOutput[idx] = 1;
        } else {
            aOutput[idx] = 0;
        }
    }
}

CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

void Aurora::nantoval(CudaMatrix& aMatrix, float val)
{
    auto lambda = [=] __host__ __device__ (const float& x) {
        return ::isnan(x) ? val : x;
    };
    // Walk every stored float so both components of a complex element are covered.
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + size, aMatrix.getData(), lambda);
}

CudaMatrix Aurora::isnan(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__ (const float& x) {
        return ::isnan(x) ? 1.0f : 0.0f;
    };
    // Walk every stored float so the full output allocation is initialized.
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + size, data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
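// The [=] __host__ __device__ lambdas passed to thrust::transform in this file
// only compile when nvcc is invoked with --extended-lambda (a build-system
// fact, not visible here). Without that flag, the equivalent is a hand-written
// functor; a sketch of what isnan's lambda would look like in that form:
namespace {
struct IsNanToFlag
{
    __host__ __device__ float operator()(const float& x) const
    {
        // ::isnan is the C math predicate; 1.0f marks a NaN element.
        return ::isnan(x) ? 1.0f : 0.0f;
    }
};
} // namespace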
CudaMatrix Aurora::isfinite(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__ (const float& x) {
        return ::isfinite(x) ? 1.0f : 0.0f;
    };
    // Walk every stored float so the full output allocation is initialized.
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + size, data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

void Aurora::padding(CudaMatrix& aMatrix, int aIndex, float aValue)
{
    if(aMatrix.isNull() || !aMatrix.isVector() || aMatrix.isComplex()) {
        std::cerr << "padding only supports real vectors" << std::endl;
        return;
    }
    if(aMatrix.getDataSize() > aIndex) {
        aMatrix.setValue(aIndex, aValue);
        return;
    }
    // The vector is too short: grow it to aIndex + 1 and fill the gap.
    size_t size = (aIndex + 1);
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    cudaMemcpy(data, aMatrix.getData(), sizeof(float) * aMatrix.getDataSize(), cudaMemcpyDeviceToDevice);
    thrust::fill_n(thrust::device, data + aMatrix.getDataSize(), size - aMatrix.getDataSize(), aValue);
    aMatrix = CudaMatrix::fromRawData(data, size, 1, 1, aMatrix.getValueType());
}

CudaMatrix Aurora::auroraNot(const CudaMatrix& aMatrix)
{
    return auroraNot(aMatrix.deepCopy());
}

CudaMatrix Aurora::auroraNot(CudaMatrix&& aMatrix)
{
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__ (const float& x) {
        return x <= 0 ? 1.0f : 0.0f;
    };
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + size, data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

void Aurora::compareSet(CudaMatrix& aValueMatrix, float compareValue, float newValue, CompareOp op)
{
    switch (op) {
    case GT: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x > compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    case NG: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x <= compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    case EQ: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x == compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    case NE: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x != compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    case NL: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x >= compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    case LT: {
        auto lambda = [=] __host__ __device__ (const float& x) { return x < compareValue ? newValue : x; };
        thrust::transform(thrust::device, aValueMatrix.getData(), aValueMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), lambda);
        break;
    }
    }
}
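// compareSet rewrites its destination matrix in place: thrust::transform
// permits the output iterator to alias an input, just like std::transform.
// A usage sketch (values illustrative, not from the original source):
// clamping every value above 1.0f down to 1.0f is
//     compareSet(m, 1.0f, 1.0f, GT);
// and replacing exact zeros with a small epsilon is
//     compareSet(m, 0.0f, 1e-6f, EQ);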
void Aurora::compareSet(CudaMatrix& aValueMatrix, const CudaMatrix& aCompareMatrix, float compareValue, float newValue, CompareOp op)
{
    switch (op) {
    case GT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x > compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    case NG: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x <= compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    case EQ: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x == compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    case NE: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x != compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    case NL: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x >= compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    case LT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x < compareValue ? newValue : y; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aValueMatrix.getDataSize(), aValueMatrix.getData(), aValueMatrix.getData(), lambda);
        break;
    }
    }
}

void Aurora::compareSet(CudaMatrix& aDesAndCompareMatrix, const CudaMatrix& aOtherCompareMatrix, float newValue, CompareOp op)
{
    switch (op) {
    case GT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x > y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    case NG: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x <= y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    case EQ: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x == y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    case NE: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x != y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    case NL: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x >= y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    case LT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x < y ? newValue : x; };
        thrust::transform(thrust::device, aDesAndCompareMatrix.getData(), aDesAndCompareMatrix.getData() + aDesAndCompareMatrix.getDataSize(), aOtherCompareMatrix.getData(), aDesAndCompareMatrix.getData(), lambda);
        break;
    }
    }
}
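// Three conditional-assignment shapes live side by side here: a scalar
// threshold with a scalar replacement, an elementwise comparison between two
// matrices, and, in the overload below, a scalar threshold whose replacement
// values come from a second matrix. In MATLAB terms the overload below is
//     m(mask) = newValues(mask), with mask = (m OP scalar).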
void Aurora::compareSet(CudaMatrix& aCompareMatrix, const CudaMatrix& aNewValueMatrix, float compareValue, CompareOp op)
{
    switch (op) {
    case GT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x > compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    case NG: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x <= compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    case EQ: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x == compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    case NE: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x != compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    case NL: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x >= compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    case LT: {
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) { return x < compareValue ? y : x; };
        thrust::transform(thrust::device, aCompareMatrix.getData(), aCompareMatrix.getData() + aCompareMatrix.getDataSize(), aNewValueMatrix.getData(), aCompareMatrix.getData(), lambda);
        break;
    }
    }
}

__global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
{
    unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
    if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) {
        return;
    }
    if(aIsComplex) {
        unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
        unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows);
        aOutput[outPutIndex] = aInputData[inPutIndex];
        aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
    } else {
        aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idY % aInputColumns * aInputRows + idX % aInputRows];
    }
}

CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix, int aRowTimes, int aColumnTimes)
{
    if(aRowTimes < 1 || aColumnTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) {
        return CudaMatrix();
    }
    size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
    size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
    dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
    dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X,
                  (columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix, int aRowTimes, int aColumnTimes, int aSliceTimes)
{
    if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() > 2 || aMatrix.isNull()) {
        return CudaMatrix();
    }
    size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
    size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
    size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
    dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
    dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
                  (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
                  (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
}
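// repmat follows MATLAB's tiling semantics: the input block is repeated whole.
// A worked example (illustrative): a 2x3 matrix tiled with repmat(A, 2, 2)
// yields a 4x6 result, and in the kernels the source element for output cell
// (x, y) is found with the modulo pair (x % aInputRows, y % aInputColumns).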
__global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices, float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
{
    unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
    if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) {
        return;
    }
    if(aIsComplex) {
        unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
        unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows);
        aOutput[outPutIndex] = aInputData[inPutIndex];
        aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
    } else {
        aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
    }
}

CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix, int aRowTimes, int aColumnTimes, int aSliceTimes)
{
    if(aRowTimes < 1 || aColumnTimes < 1 || aSliceTimes < 1 || aMatrix.getDims() < 3 || aMatrix.isNull()) {
        return CudaMatrix();
    }
    size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
    size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
    size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
    dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
    dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
                  (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
                  (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
}

__global__ void logKernel(float* aInputData, float* aOutput, unsigned int aInputSize, int aBaseNum)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        if(aBaseNum == -1) {
            // -1 selects the natural logarithm.
            aOutput[idx] = logf(aInputData[idx]);
        } else {
            // Change of base: log_b(x) = ln(x) / ln(b).
            float value = logf(aBaseNum);
            aOutput[idx] = logf(aInputData[idx]) / value;
        }
    }
}

CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum)
{
    if(aMatrix.getValueType() == Aurora::Complex) {
        std::cerr << "log does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    logKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aBaseNum);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void expKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        if(aIsComplex) {
            unsigned int index = 2 * idx;
            float expReal = expf(aInputData[index]);
            aOutput[index] = expReal * cosf(aInputData[index + 1]);
            aOutput[index + 1] = expReal * sinf(aInputData[index + 1]);
        } else {
            aOutput[idx] = expf(aInputData[idx]);
        }
    }
}
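// expKernel applies Euler's formula for the complex case:
//     exp(a + bi) = exp(a) * (cos(b) + i*sin(b)),
// so the real part of each output element is exp(a)*cos(b) and the imaginary
// part is exp(a)*sin(b), with the interleaved layout putting them at 2*idx and
// 2*idx + 1.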
CudaMatrix Aurora::exp(const CudaMatrix& aMatrix)
{
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    expKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

__global__ void modKernel(float* aInputData, float* aOutput, unsigned int aInputSize, float aModValue)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        aOutput[idx] = fmodf(aInputData[idx], aModValue);
    }
}

CudaMatrix Aurora::mod(const CudaMatrix& aMatrix, float aValue)
{
    if(aMatrix.isComplex()) {
        std::cerr << "mod does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    modKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aValue);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}

__global__ void acosKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        aOutput[idx] = acosf(aInputData[idx]);
    }
}

CudaMatrix Aurora::acos(const CudaMatrix& aMatrix)
{
    if(aMatrix.isComplex()) {
        std::cerr << "acos does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    acosKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}

__global__ void acosdKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        // acos in degrees rather than radians.
        aOutput[idx] = acosf(aInputData[idx]) * 180 / PI;
    }
}

CudaMatrix Aurora::acosd(const CudaMatrix& aMatrix)
{
    if(aMatrix.isComplex()) {
        std::cerr << "acosd does not support complex matrices" << std::endl;
        return CudaMatrix();
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    acosdKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
}

__global__ void conjKernel(float* aInputData, float* aOutput, unsigned int aInputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize) {
        unsigned int index = idx * 2;
        aOutput[index] = aInputData[index];
        aOutput[index + 1] = -aInputData[index + 1];
    }
}

CudaMatrix Aurora::conj(const CudaMatrix& aMatrix)
{
    if(!aMatrix.isComplex()) {
        return CudaMatrix::copyFromRawData(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
    }
    size_t size = aMatrix.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    conjKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
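// Aurora::norm below appears to mirror MATLAB's norm: for vectors, Norm1 is the
// sum of absolute values and Norm2 the Euclidean length; for 2-D matrices,
// NormF is the Frobenius norm sqrt(sum(A(:).^2)), Norm1 the maximum absolute
// column sum, and Norm2 the spectral norm, i.e. the largest singular value,
// hence the cuSOLVER SVD.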
float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
{
    float resultValue = 0;
    if(aMatrix.getDims() > 2) {
        std::cerr << "norm does not support 3-D matrices" << std::endl;
        return resultValue;
    }
    if(aMatrix.isVector()) {
        if(aNormMethod == Aurora::Norm1) {
            // Sum of absolute values. (The exact original call was lost; a
            // transform_reduce over |x| is assumed here.)
            resultValue = thrust::transform_reduce(thrust::device, aMatrix.getData(),
                aMatrix.getData() + aMatrix.getDataSize(),
                [] __host__ __device__ (const float& x) { return fabsf(x); },
                0.0f, thrust::plus<float>());
            return resultValue;
        } else {
            CudaMatrix result = aMatrix.deepCopy();
            thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square<float>());
            resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0f, thrust::plus<float>());
            return std::sqrt(resultValue);
        }
    }
    //for 2 dims
    if(aNormMethod == Aurora::NormF) {
        CudaMatrix result = aMatrix.deepCopy();
        thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square<float>());
        resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0f, thrust::plus<float>());
        return std::sqrt(resultValue);
    } else if(aNormMethod == Aurora::Norm1) {
        // Maximum absolute column sum.
        for(int i = 0; i < aMatrix.getDimSize(1); ++i) {
            float* columnBegin = aMatrix.getData() + i * aMatrix.getDimSize(0);
            float tempValue = thrust::transform_reduce(thrust::device, columnBegin,
                columnBegin + aMatrix.getDimSize(0),
                [] __host__ __device__ (const float& x) { return fabsf(x); },
                0.0f, thrust::plus<float>());
            if(resultValue < tempValue) {
                resultValue = tempValue;
            }
        }
        return resultValue;
    } else {
        if(aMatrix.isComplex()) {
            std::cerr << "norm2 does not support 2-D complex matrices" << std::endl;
            return resultValue;
        }
        // Spectral norm: the largest singular value. The cuSOLVER setup below is
        // a reconstruction around the surviving cleanup code; cusolverDnSgesvd
        // requires rows >= columns.
        cusolverDnHandle_t cusolverH = nullptr;
        cudaStream_t stream = nullptr;
        float* d_S = nullptr;
        float* d_U = nullptr;
        float* d_VT = nullptr;
        int* devInfo = nullptr;
        float* d_work = nullptr;
        int m = aMatrix.getDimSize(0);
        int n = aMatrix.getDimSize(1);
        cusolverDnCreate(&cusolverH);
        cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        cusolverDnSetStream(cusolverH, stream);
        CudaMatrix input = aMatrix.deepCopy(); // gesvd overwrites its input
        cudaMalloc((void**)&d_S, sizeof(float) * n);
        cudaMalloc((void**)&d_U, sizeof(float) * m * m);
        cudaMalloc((void**)&d_VT, sizeof(float) * n * n);
        cudaMalloc((void**)&devInfo, sizeof(int));
        int lwork = 0;
        cusolverDnSgesvd_bufferSize(cusolverH, m, n, &lwork);
        cudaMalloc((void**)&d_work, sizeof(float) * lwork);
        // 'N', 'N': only the singular values are needed.
        cusolverDnSgesvd(cusolverH, 'N', 'N', m, n, input.getData(), m, d_S, d_U, m, d_VT, n, d_work, lwork, nullptr, devInfo);
        cudaDeviceSynchronize();
        std::vector<float> S(n, 0.0f);
        cudaMemcpy(S.data(), d_S, sizeof(float) * n, cudaMemcpyDeviceToHost);
        for(int i = 0; i < n; ++i) {
            if(S[i] > resultValue) {
                resultValue = S[i];
            }
        }
        if (d_S ) cudaFree(d_S);
        if (d_U ) cudaFree(d_U);
        if (d_VT) cudaFree(d_VT);
        if (devInfo) cudaFree(devInfo);
        if (d_work) cudaFree(d_work);
        if (cusolverH) cusolverDnDestroy(cusolverH);
        if (stream) cudaStreamDestroy(stream);
        return resultValue;
    }
}

__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
    if (idx < aRowSize && idy < aColumnSize) {
        unsigned int inputIndex = idy * aRowSize + idx;
        unsigned int outputIndex = idx * aColumnSize + idy;
        if(aIsComplex) {
            aOutput[2*outputIndex] = aInputData[2*inputIndex];
            aOutput[2*outputIndex + 1] = aInputData[2*inputIndex + 1];
        } else {
            aOutput[outputIndex] = aInputData[inputIndex];
        }
    }
}

CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
{
    //not supported for 3 dims.
    if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1) {
        std::cerr << "transpose does not support 3-D matrices" << std::endl;
        return CudaMatrix();
    }
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * aMatrix.getValueType());
    dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
    dim3 gridSize((aMatrix.getDimSize(0) + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X,
                  (aMatrix.getDimSize(1) + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1);
    transposeKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType());
}

CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
    if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
       aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType()) {
        std::cerr << "horzcat requires matching rows, slices and value types" << std::endl;
        return CudaMatrix();
    }
    // Reconstruction sketch (the original body was lost): in column-major
    // storage the columns of each slice are contiguous, so horizontal
    // concatenation is a pair of device-to-device copies per slice.
    size_t outputRows = aMatrix1.getDimSize(0);
    size_t outputColumns = aMatrix1.getDimSize(1) + aMatrix2.getDimSize(1);
    size_t outputSlices = aMatrix1.getDimSize(2);
    size_t valueType = aMatrix1.getValueType();
    size_t sliceFloats1 = outputRows * aMatrix1.getDimSize(1) * valueType;
    size_t sliceFloats2 = outputRows * aMatrix2.getDimSize(1) * valueType;
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * (sliceFloats1 + sliceFloats2) * outputSlices);
    for(size_t z = 0; z < outputSlices; ++z) {
        cudaMemcpy(data + z * (sliceFloats1 + sliceFloats2), aMatrix1.getData() + z * sliceFloats1,
                   sizeof(float) * sliceFloats1, cudaMemcpyDeviceToDevice);
        cudaMemcpy(data + z * (sliceFloats1 + sliceFloats2) + sliceFloats1, aMatrix2.getData() + z * sliceFloats2,
                   sizeof(float) * sliceFloats2, cudaMemcpyDeviceToDevice);
    }
    return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
}

__global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSize, float* aInputData2, unsigned int aInputData2RowSize, float* aOutput, unsigned int aOutputRowSize, unsigned int aOutputColumnSize, unsigned int aOutputSliceSize, bool aIsComplex)
{
    unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
    if(idX >= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize) {
        return;
    }
    if(aIsComplex) {
        if(idX < aInputData1RowSize) {
            unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX;
            unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
            aOutput[2*outputIndex] = aInputData1[2*inputIndex];
            aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1];
        } else {
            unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize;
            unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
            aOutput[2*outputIndex] = aInputData2[2*inputIndex];
            aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1];
        }
    } else {
        if(idX < aInputData1RowSize) {
            aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX];
        } else {
            aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
        }
    }
}
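// All kernels in this file assume column-major storage: element (x, y, z) of an
// R x C x S matrix lives at z*R*C + y*R + x, and a complex element occupies the
// float pair at 2*index and 2*index + 1. A worked example (illustrative): in a
// 4x3x2 matrix, element (1, 2, 1) sits at 1*12 + 2*4 + 1 = 21.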
CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
    if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
       aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType()) {
        return CudaMatrix();
    }
    size_t outputRows = aMatrix1.getDimSize(0) + aMatrix2.getDimSize(0);
    size_t outputColumns = aMatrix1.getDimSize(1);
    size_t outputSlices = aMatrix1.getDimSize(2);
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType());
    dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
    dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
                  (outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
                  (outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
    vertcatKernel<<<gridSize, blockSize>>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0), data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex());
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
}

__global__ void vecnorm1Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];
    sharedValue[threadIdx.x] = 0;
    if(aIsComplex) {
        for(unsigned int i = 0; i <= aInputRowSize/blockDim.x; ++i) {
            unsigned int indexByRows = i*blockDim.x + threadIdx.x;
            if(indexByRows < aInputRowSize) {
                unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
                sharedValue[threadIdx.x] += sqrtf(aInputData[2*idx] * aInputData[2*idx] + aInputData[2*idx+1] * aInputData[2*idx+1]);
            }
        }
    } else {
        for(unsigned int i = 0; i <= aInputRowSize/blockDim.x; ++i) {
            unsigned int indexByRows = i*blockDim.x + threadIdx.x;
            if(indexByRows < aInputRowSize) {
                sharedValue[threadIdx.x] += fabsf(aInputData[blockIdx.x*aInputRowSize + indexByRows]);
            }
        }
    }
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1) {
        if(threadIdx.x < i) {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
        }
        __syncthreads();
    }
    if(threadIdx.x == 0) {
        aOutput[blockIdx.x] = sharedValue[0];
    }
}

__global__ void vecnorm2Kernel(float* aInputData, unsigned int aInputRowSize, float* aOutput, bool aIsComplex)
{
    __shared__ float sharedValue[THREADS_PER_BLOCK];
    sharedValue[threadIdx.x] = 0;
    for(unsigned int i = 0; i <= aInputRowSize/blockDim.x; ++i) {
        unsigned int indexByRows = i*blockDim.x + threadIdx.x;
        if(indexByRows < aInputRowSize) {
            if(aIsComplex) {
                unsigned int idx = blockIdx.x*aInputRowSize + indexByRows;
                sharedValue[threadIdx.x] += aInputData[2 * idx] * aInputData[2 * idx];
                sharedValue[threadIdx.x] += aInputData[2 * idx + 1] * aInputData[2 * idx + 1];
            } else {
                sharedValue[threadIdx.x] += aInputData[blockIdx.x*aInputRowSize + indexByRows] * aInputData[blockIdx.x*aInputRowSize + indexByRows];
            }
        }
    }
    __syncthreads();
    for(unsigned int i = blockDim.x/2; i > 0; i >>= 1) {
        if(threadIdx.x < i) {
            sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i];
        }
        __syncthreads();
    }
    if(threadIdx.x == 0) {
        aOutput[blockIdx.x] = sqrtf(sharedValue[0]);
    }
}
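// The two kernels above reduce one column per block: each thread accumulates a
// strided partial sum into shared memory, then the `i >>= 1` loop folds the 256
// partials pairwise in log2(256) = 8 steps. This halving scheme only covers
// every slot because THREADS_PER_BLOCK is a power of two.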
CudaMatrix Aurora::vecnorm(const CudaMatrix& aMatrix, NormMethod aNormMethod, int aDim)
{
    //only supports aDim = 1 for now.
    if(aDim != 1 || aNormMethod == NormMethod::NormF || aMatrix.isNull()) {
        return CudaMatrix();
    }
    unsigned int column = aMatrix.getDimSize(1);
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * column);
    // One block per column; each block reduces one column of the matrix.
    if(aNormMethod == Aurora::Norm1) {
        vecnorm1Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
    } else if(aNormMethod == Aurora::Norm2) {
        vecnorm2Kernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), aMatrix.getDimSize(0), data, aMatrix.isComplex());
    }
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, column);
}

__global__ void linspaceKernel(float* aOutput, unsigned int aOutputSize, float aStartNum, float aStepNum)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aOutputSize) {
        aOutput[idx] = aStartNum + idx * aStepNum;
    }
}

CudaMatrix Aurora::linspaceCuda(float aStart, float aEnd, int aNum)
{
    if(aNum < 1) {
        return CudaMatrix();
    }
    // Guard against division by zero when a single point is requested.
    float step = aNum > 1 ? (aEnd - aStart) / (aNum - 1) : 0.0f;
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * aNum);
    int blocksPerGrid = (aNum + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    linspaceKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(data, aNum, aStart, step);
    cudaDeviceSynchronize();
    return Aurora::CudaMatrix::fromRawData(data, aNum);
}

// Hypothetical helper: the kernel that originally computed the index output of
// auroraUnion was lost. For each union element it records the index of the
// first match in the first input, or -1 when the value came only from the
// second input.
__global__ void unionIaKernel(float* aInputData, float* aUnionData, unsigned int aInputSize, float* aIa, unsigned int aUnionSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aUnionSize) {
        return;
    }
    aIa[idx] = -1;
    for (unsigned int i = 0; i < aInputSize; ++i) {
        if (aInputData[i] == aUnionData[idx]) {
            aIa[idx] = i;
            break;
        }
    }
}

CudaMatrix Aurora::auroraUnion(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
    if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex()) {
        std::cerr << "auroraUnion does not support complex CudaMatrix" << std::endl;
        return CudaMatrix();
    }
    // Reconstruction sketch (the original body was lost): concatenate both
    // inputs, sort, and drop duplicates, mirroring MATLAB's union().
    size_t totalSize = aMatrix1.getDataSize() + aMatrix2.getDataSize();
    float* merged = nullptr;
    cudaMalloc((void**)&merged, sizeof(float) * totalSize);
    cudaMemcpy(merged, aMatrix1.getData(), sizeof(float) * aMatrix1.getDataSize(), cudaMemcpyDeviceToDevice);
    cudaMemcpy(merged + aMatrix1.getDataSize(), aMatrix2.getData(), sizeof(float) * aMatrix2.getDataSize(), cudaMemcpyDeviceToDevice);
    thrust::sort(thrust::device, merged, merged + totalSize);
    float* newEnd = thrust::unique(thrust::device, merged, merged + totalSize);
    size_t size = newEnd - merged;
    CudaMatrix result = CudaMatrix::copyFromRawData(merged, size, 1, 1);
    cudaFree(merged);
    return result;
}

CudaMatrix Aurora::auroraUnion(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2, CudaMatrix& aIa)
{
    // Assumed overload, mirroring MATLAB's [C, ia] = union(A, B); only its tail
    // (the kernel launch onward) survives in the original.
    CudaMatrix result = auroraUnion(aMatrix1, aMatrix2);
    if(result.isNull()) {
        return result;
    }
    size_t size = result.getDataSize();
    float* iaResult = nullptr;
    cudaMalloc((void**)&iaResult, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    unionIaKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix1.getData(), result.getData(), aMatrix1.getDataSize(), iaResult, size);
    cudaDeviceSynchronize();
    aIa = CudaMatrix::fromRawData(iaResult, size);
    return result;
}

CudaMatrix Aurora::reshape(const CudaMatrix& aMatrix, int aRows, int aColumns, int aSlices)
{
    if(aMatrix.isNull() || (aMatrix.getDataSize() != aRows * aColumns * aSlices)) {
        std::cerr << "reshape: size differs from the CudaMatrix" << std::endl;
        return CudaMatrix();
    }
    // Column-major data is contiguous, so reshape is a copy with new dimensions.
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    cudaMemcpy(data, aMatrix.getData(), sizeof(float) * size, cudaMemcpyDeviceToDevice);
    return CudaMatrix::fromRawData(data, aRows, aColumns, aSlices, aMatrix.getValueType());
}

// Hypothetical helper kernel: flags whether each element of the second input
// occurs anywhere in the first.
__global__ void ismemberKernel(float* aInputData1, float* aInputData2, unsigned int aInputData1Size, float* aOutput, unsigned int aOutputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= aOutputSize) {
        return;
    }
    aOutput[idx] = 0.0f;
    for (unsigned int i = 0; i < aInputData1Size; ++i) {
        if (aInputData1[i] == aInputData2[idx]) {
            aOutput[idx] = 1.0f;
            break;
        }
    }
}

// The function that originally followed reshape was lost except for its tail: a
// kernel launch over (aMatrix1, aMatrix2) producing a vector `data` of length
// `size`, returned as a flat CudaMatrix. The membership test below is an
// assumption that fits that tail; the name ismember follows MATLAB.
CudaMatrix Aurora::ismember(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
    if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.isComplex() || aMatrix2.isComplex()) {
        return CudaMatrix();
    }
    size_t size = aMatrix2.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    ismemberKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix1.getData(), aMatrix2.getData(), aMatrix1.getDataSize(), data, size);
    cudaDeviceSynchronize();
    return CudaMatrix::fromRawData(data, size);
}
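// A minimal end-to-end usage sketch (disabled; the example values are
// illustrative only and not taken from the original source):
#if 0
void example()
{
    CudaMatrix v = Aurora::linspaceCuda(0.0f, 1.0f, 5); // [0, 0.25, 0.5, 0.75, 1]
    CudaMatrix r = Aurora::sqrt(v);                     // elementwise square root
    Aurora::compareSet(r, 0.9f, 1.0f, GT);              // clamp values above 0.9
    float n = Aurora::norm(r, Aurora::Norm2);           // Euclidean length
}
#endif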