diff --git a/src/Function1D.cu b/src/Function1D.cu index d95d289..4e3271e 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -20,6 +20,11 @@ using namespace thrust::placeholders; namespace { const int THREADS_PER_BLOCK = 256; + const int THREADS_PER_BLOCK_DIM2_X = 16; + const int THREADS_PER_BLOCK_DIM2_Y = 16; + const int THREADS_PER_BLOCK_DIM3_X = 8; + const int THREADS_PER_BLOCK_DIM3_Y = 8; + const int THREADS_PER_BLOCK_DIM3_Z = 8; } __global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize) @@ -582,21 +587,28 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri } } -__global__ void repMatKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) +__global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, + float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex) { unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; + + if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) + { + return; + } + if(aIsComplex) { - unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX); - unsigned int inPutIndex = 2 * (threadIdx.y * blockDim.x + threadIdx.x); + unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX); + unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows); aOutput[outPutIndex] = aInputData[inPutIndex]; aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1]; } else { - aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.y * blockDim.x + threadIdx.x]; + aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idY % aInputColumns * aInputRows + idX % aInputRows]; } } @@ -606,14 +618,16 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi { return CudaMatrix(); } - - dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); - dim3 gridSize(aRowTimes, aColumnTimes, 1); + size_t rowSize = aMatrix.getDimSize(0) * aRowTimes; + size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; + dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X, + (columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes; float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); - repMatKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + repMatKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType()); } @@ -625,32 +639,44 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi return CudaMatrix(); } - dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); - dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); + size_t rowSize = aMatrix.getDimSize(0) * aRowTimes; + size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; + size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes; + dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, + (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, + (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); - repMatKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + repMatKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); } -__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) +__global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices, + float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex) { unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; + + if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices) + { + return; + } + if(aIsComplex) { - unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX); - unsigned int inPutIndex = 2 * (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x); + unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX); + unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows); aOutput[outPutIndex] = aInputData[inPutIndex]; aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1]; } else { - aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x]; + aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows]; } } @@ -662,13 +688,18 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn return CudaMatrix(); } - dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); - dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); + size_t rowSize = aMatrix.getDimSize(0) * aRowTimes; + size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes; + size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes; + dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); + dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, + (columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, + (sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); - repMat3DKernel<<>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); + repMat3DKernel<<>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex()); cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); } @@ -953,3 +984,137 @@ float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod) return resultValue; } } + +__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y; + if (idx < aRowSize && idy < aColumnSize) + { + unsigned int inputindex = idy * aRowSize + idx; + unsigned int outputIndex = idx * aColumnSize + idy; + if(aIsComplex) + { + aOutput[2*outputIndex] = aInputData[2*inputindex]; + aOutput[2*outputIndex + 1] = aInputData[2*inputindex + 1]; + } + else + { + aOutput[outputIndex] = aInputData[inputindex]; + } + } +} + + +CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix) +{ + //not surpport for 3 dims. + if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1) + { + std::cerr<<"transpose not support 3d complex matrix"<>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType()); +} + +CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) +{ + if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || + aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType()) + { + std::cerr<<"horzcat must have same rows and slices"<= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize) + { + return; + } + + if(aIsComplex) + { + if(idX < aInputData1RowSize) + { + unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX; + unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX; + aOutput[2*outputIndex] = aInputData1[2*inputIndex]; + aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1]; + } + else + { + unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize; + unsigned int outputIndex =idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX; + aOutput[2*outputIndex] = aInputData2[2*inputIndex]; + aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1]; + } + } + else + { + if(idX < aInputData1RowSize) + { + aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX]; + } + else + { + aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize]; + } + } + +} + +CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2) +{ + if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) || + aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType()) + { + return CudaMatrix(); + } + size_t outputRows = aMatrix1.getDimSize(0) + aMatrix2.getDimSize(0); + size_t outputColumns = aMatrix1.getDimSize(1); + size_t outputSlices = aMatrix1.getDimSize(2); + + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType()); + + dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z); + dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X, + (outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y, + (outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z); + vertcatKernel<<>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0), + data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType()); +} + diff --git a/src/Function1D.cuh b/src/Function1D.cuh index 8d5c8f4..62d7b16 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -63,6 +63,12 @@ namespace Aurora float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod); + CudaMatrix transpose(const CudaMatrix& aMatrix); + + CudaMatrix horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2); + + CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2); + // ------compareSet---------------------------------------------------- diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp index 03c8c90..6fd397a 100644 --- a/test/Function1D_Cuda_Test.cpp +++ b/test/Function1D_Cuda_Test.cpp @@ -280,6 +280,21 @@ TEST_F(Function1D_Cuda_Test, repmat) } } +TEST_F(Function1D_Cuda_Test, repmat3d) +{ + Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[12]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8,9,10,11,12}, 2,3,2); + Aurora::CudaMatrix deviceMatrix = hostMatrix.toDeviceMatrix(); + + auto result1 = Aurora::repmat3d(hostMatrix,3,6,4); + auto result2 = Aurora::repmat3d(deviceMatrix,3,6,4).toHostMatrix(); + EXPECT_EQ(result2.getDataSize(), result1.getDataSize()); + EXPECT_EQ(result2.getValueType(), Aurora::Normal); + for(size_t i=0; i