Add transpose, horzcat, vertcat and Fix repmat threadPerBlock problem.
This commit is contained in:
@@ -20,6 +20,11 @@ using namespace thrust::placeholders;
|
|||||||
namespace
|
namespace
|
||||||
{
|
{
|
||||||
const int THREADS_PER_BLOCK = 256;
|
const int THREADS_PER_BLOCK = 256;
|
||||||
|
const int THREADS_PER_BLOCK_DIM2_X = 16;
|
||||||
|
const int THREADS_PER_BLOCK_DIM2_Y = 16;
|
||||||
|
const int THREADS_PER_BLOCK_DIM3_X = 8;
|
||||||
|
const int THREADS_PER_BLOCK_DIM3_Y = 8;
|
||||||
|
const int THREADS_PER_BLOCK_DIM3_Z = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
|
||||||
@@ -582,21 +587,28 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void repMatKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
|
__global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns,
|
||||||
|
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
|
||||||
{
|
{
|
||||||
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
||||||
|
|
||||||
|
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if(aIsComplex)
|
if(aIsComplex)
|
||||||
{
|
{
|
||||||
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX);
|
unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
|
||||||
unsigned int inPutIndex = 2 * (threadIdx.y * blockDim.x + threadIdx.x);
|
unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows);
|
||||||
aOutput[outPutIndex] = aInputData[inPutIndex];
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
||||||
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.y * blockDim.x + threadIdx.x];
|
aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idY % aInputColumns * aInputRows + idX % aInputRows];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -606,14 +618,16 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi
|
|||||||
{
|
{
|
||||||
return CudaMatrix();
|
return CudaMatrix();
|
||||||
}
|
}
|
||||||
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
||||||
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
||||||
dim3 gridSize(aRowTimes, aColumnTimes, 1);
|
dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
|
||||||
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X,
|
||||||
|
(columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1);
|
||||||
|
|
||||||
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
|
||||||
float* data = nullptr;
|
float* data = nullptr;
|
||||||
cudaMalloc((void**)&data, sizeof(float) * size);
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
||||||
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex());
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
|
||||||
}
|
}
|
||||||
@@ -625,32 +639,44 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi
|
|||||||
return CudaMatrix();
|
return CudaMatrix();
|
||||||
}
|
}
|
||||||
|
|
||||||
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1);
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
||||||
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
||||||
|
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
|
||||||
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
||||||
|
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
||||||
|
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
|
||||||
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
||||||
float* data = nullptr;
|
float* data = nullptr;
|
||||||
cudaMalloc((void**)&data, sizeof(float) * size);
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
||||||
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
||||||
}
|
}
|
||||||
|
|
||||||
__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex)
|
__global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices,
|
||||||
|
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
|
||||||
{
|
{
|
||||||
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
||||||
|
|
||||||
|
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if(aIsComplex)
|
if(aIsComplex)
|
||||||
{
|
{
|
||||||
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX);
|
unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
|
||||||
unsigned int inPutIndex = 2 * (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x);
|
unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows);
|
||||||
aOutput[outPutIndex] = aInputData[inPutIndex];
|
aOutput[outPutIndex] = aInputData[inPutIndex];
|
||||||
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x];
|
aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@@ -662,13 +688,18 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn
|
|||||||
return CudaMatrix();
|
return CudaMatrix();
|
||||||
}
|
}
|
||||||
|
|
||||||
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
|
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
|
||||||
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes);
|
size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
|
||||||
|
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
|
||||||
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
||||||
|
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
||||||
|
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
|
||||||
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
|
||||||
float* data = nullptr;
|
float* data = nullptr;
|
||||||
cudaMalloc((void**)&data, sizeof(float) * size);
|
cudaMalloc((void**)&data, sizeof(float) * size);
|
||||||
repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
|
repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
|
||||||
cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
|
||||||
}
|
}
|
||||||
@@ -953,3 +984,137 @@ float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
|
|||||||
return resultValue;
|
return resultValue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex)
|
||||||
|
{
|
||||||
|
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
if (idx < aRowSize && idy < aColumnSize)
|
||||||
|
{
|
||||||
|
unsigned int inputindex = idy * aRowSize + idx;
|
||||||
|
unsigned int outputIndex = idx * aColumnSize + idy;
|
||||||
|
if(aIsComplex)
|
||||||
|
{
|
||||||
|
aOutput[2*outputIndex] = aInputData[2*inputindex];
|
||||||
|
aOutput[2*outputIndex + 1] = aInputData[2*inputindex + 1];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
aOutput[outputIndex] = aInputData[inputindex];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
|
||||||
|
{
|
||||||
|
//not surpport for 3 dims.
|
||||||
|
if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1)
|
||||||
|
{
|
||||||
|
std::cerr<<"transpose not support 3d complex matrix"<<std::endl;
|
||||||
|
return CudaMatrix();
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size = aMatrix.getDataSize();
|
||||||
|
float* data = nullptr;
|
||||||
|
cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
|
||||||
|
dim3 blocksPerGrid((aMatrix.getDimSize(0) + THREADS_PER_BLOCK_DIM2_X - 1) / THREADS_PER_BLOCK_DIM2_X, (aMatrix.getDimSize(1) + THREADS_PER_BLOCK_DIM2_Y - 1) / THREADS_PER_BLOCK_DIM2_Y, 1);
|
||||||
|
dim3 threadPerBlock(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_X, 1);
|
||||||
|
transposeKernel<<<blocksPerGrid, threadPerBlock>>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex());
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType());
|
||||||
|
}
|
||||||
|
|
||||||
|
CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
||||||
|
{
|
||||||
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
|
||||||
|
aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType())
|
||||||
|
{
|
||||||
|
std::cerr<<"horzcat must have same rows and slices"<<std::endl;
|
||||||
|
return CudaMatrix();
|
||||||
|
}
|
||||||
|
|
||||||
|
int column1 = aMatrix1.getDimSize(1);
|
||||||
|
int column2 = aMatrix2.getDimSize(1);
|
||||||
|
int slice = aMatrix1.getDimSize(2);
|
||||||
|
int row = aMatrix1.getDimSize(0);
|
||||||
|
size_t size1= row*column1*aMatrix1.getValueType();
|
||||||
|
size_t size2= row*column2*aMatrix2.getValueType();
|
||||||
|
float* data = nullptr;
|
||||||
|
cudaMalloc((void**)&data, sizeof(float) * (aMatrix1.getDataSize() + aMatrix2.getDataSize()) * aMatrix1.getValueType());
|
||||||
|
size_t sliceStride = row*(column1+column2) * aMatrix1.getValueType();
|
||||||
|
for (size_t i = 0; i < slice; i++)
|
||||||
|
{
|
||||||
|
cudaMemcpy(data + i*sliceStride, aMatrix1.getData() + i*size1, sizeof(float) * size1, cudaMemcpyDeviceToDevice);
|
||||||
|
cudaMemcpy(data + i*sliceStride + size1, aMatrix2.getData() + i*size2, sizeof(float) * size2, cudaMemcpyDeviceToDevice);
|
||||||
|
}
|
||||||
|
return CudaMatrix::fromRawData(data, row, column1+column2, slice, aMatrix1.getValueType());
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSize, float* aInputData2, unsigned int aInputData2RowSize,
|
||||||
|
float* aOutput, unsigned int aOutputRowSize, unsigned int aOutputColumnSize, unsigned int aOutputSliceSize, bool aIsComplex)
|
||||||
|
{
|
||||||
|
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
|
||||||
|
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
|
||||||
|
|
||||||
|
if(idX >= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(aIsComplex)
|
||||||
|
{
|
||||||
|
if(idX < aInputData1RowSize)
|
||||||
|
{
|
||||||
|
unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX;
|
||||||
|
unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
|
||||||
|
aOutput[2*outputIndex] = aInputData1[2*inputIndex];
|
||||||
|
aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize;
|
||||||
|
unsigned int outputIndex =idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
|
||||||
|
aOutput[2*outputIndex] = aInputData2[2*inputIndex];
|
||||||
|
aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if(idX < aInputData1RowSize)
|
||||||
|
{
|
||||||
|
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
|
||||||
|
{
|
||||||
|
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
|
||||||
|
aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType())
|
||||||
|
{
|
||||||
|
return CudaMatrix();
|
||||||
|
}
|
||||||
|
size_t outputRows = aMatrix1.getDimSize(0) + aMatrix2.getDimSize(0);
|
||||||
|
size_t outputColumns = aMatrix1.getDimSize(1);
|
||||||
|
size_t outputSlices = aMatrix1.getDimSize(2);
|
||||||
|
|
||||||
|
float* data = nullptr;
|
||||||
|
cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType());
|
||||||
|
|
||||||
|
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
|
||||||
|
(outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
|
||||||
|
(outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
|
||||||
|
vertcatKernel<<<gridSize, blockSize>>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0),
|
||||||
|
data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex());
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
|
||||||
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -63,6 +63,12 @@ namespace Aurora
|
|||||||
|
|
||||||
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
|
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
|
||||||
|
|
||||||
|
CudaMatrix transpose(const CudaMatrix& aMatrix);
|
||||||
|
|
||||||
|
CudaMatrix horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
|
||||||
|
|
||||||
|
CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
|
||||||
|
|
||||||
// ------compareSet----------------------------------------------------
|
// ------compareSet----------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -280,6 +280,21 @@ TEST_F(Function1D_Cuda_Test, repmat)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(Function1D_Cuda_Test, repmat3d)
|
||||||
|
{
|
||||||
|
Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[12]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8,9,10,11,12}, 2,3,2);
|
||||||
|
Aurora::CudaMatrix deviceMatrix = hostMatrix.toDeviceMatrix();
|
||||||
|
|
||||||
|
auto result1 = Aurora::repmat3d(hostMatrix,3,6,4);
|
||||||
|
auto result2 = Aurora::repmat3d(deviceMatrix,3,6,4).toHostMatrix();
|
||||||
|
EXPECT_EQ(result2.getDataSize(), result1.getDataSize());
|
||||||
|
EXPECT_EQ(result2.getValueType(), Aurora::Normal);
|
||||||
|
for(size_t i=0; i<result1.getDataSize(); ++i)
|
||||||
|
{
|
||||||
|
EXPECT_EQ(result1[i], result2[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
TEST_F(Function1D_Cuda_Test, log)
|
TEST_F(Function1D_Cuda_Test, log)
|
||||||
{
|
{
|
||||||
Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8}, 2,4);
|
Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8}, 2,4);
|
||||||
@@ -800,3 +815,90 @@ TEST_F(Function1D_Cuda_Test, norm) {
|
|||||||
result = Aurora::norm(deviceMatrix,Aurora::NormMethod::NormF);
|
result = Aurora::norm(deviceMatrix,Aurora::NormMethod::NormF);
|
||||||
EXPECT_FLOAT_AE(result,44.3847);
|
EXPECT_FLOAT_AE(result,44.3847);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_F(Function1D_Cuda_Test, transpose) {
|
||||||
|
float *data = new float[6]{1,2,3,4,5,6};
|
||||||
|
auto matrix = Aurora::Matrix::fromRawData(data, 3,2).toDeviceMatrix();
|
||||||
|
auto result = Aurora::transpose(matrix).toHostMatrix();
|
||||||
|
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],4);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[2],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[3],5);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[4],3);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[5],6);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
|
||||||
|
|
||||||
|
data = new float[12]{1,2,3,4,5,6,7,8,9,10,11,12};
|
||||||
|
matrix = Aurora::Matrix::fromRawData(data, 3,2,1,Aurora::Complex).toDeviceMatrix();
|
||||||
|
result = Aurora::transpose(matrix).toHostMatrix();
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[2],7);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[3],8);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[4],3);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[5],4);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[6],9);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[7],10);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[8],5);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[9],6);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[10],11);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[11],12);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(Function1D_Cuda_Test, horzcat) {
|
||||||
|
float *data1 = new float[6]{1,2,3,4,5,6};
|
||||||
|
auto matrix1 = Aurora::Matrix::fromRawData(data1, 3,2).toDeviceMatrix();
|
||||||
|
float *data2 = new float[9]{7,8,9,10,11,12,13,14,15};
|
||||||
|
auto matrix2 = Aurora::Matrix::fromRawData(data2, 3,3).toDeviceMatrix();
|
||||||
|
auto result = Aurora::horzcat(matrix1,matrix2).toHostMatrix();
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[10],11);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[14],15);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),3);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),5);
|
||||||
|
|
||||||
|
data1 = new float[6]{1,2,3,4,5,6};
|
||||||
|
matrix1 = Aurora::Matrix::fromRawData(data1, 3,1,1,Aurora::Complex).toDeviceMatrix();
|
||||||
|
data2 = new float[6]{7,8,9,10,11,12};
|
||||||
|
matrix2 = Aurora::Matrix::fromRawData(data2, 3,1,1,Aurora::Complex).toDeviceMatrix();
|
||||||
|
result = Aurora::horzcat(matrix1,matrix2).toHostMatrix();
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[8],9);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[9],10);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),3);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),2);
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_F(Function1D_Cuda_Test, vertcat) {
|
||||||
|
float *data1 = new float[6]{1,2,3,4,5,6};
|
||||||
|
auto matrix1 = Aurora::Matrix::fromRawData(data1, 2,3).toDeviceMatrix();
|
||||||
|
float *data2 = new float[9]{7,8,9,10,11,12,13,14,15};
|
||||||
|
auto matrix2 = Aurora::Matrix::fromRawData(data2, 3,3).toDeviceMatrix();
|
||||||
|
auto result = Aurora::vertcat(matrix1,matrix2).toHostMatrix();
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[10],5);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[14],15);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),5);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
|
||||||
|
|
||||||
|
data1 = new float[6]{1,2,3,4,5,6};
|
||||||
|
matrix1 = Aurora::Matrix::fromRawData(data1, 3,1,1,Aurora::Complex).toDeviceMatrix();
|
||||||
|
data2 = new float[6]{7,8,9,10,11,12};
|
||||||
|
matrix2 = Aurora::Matrix::fromRawData(data2, 3,1,1,Aurora::Complex).toDeviceMatrix();
|
||||||
|
result = Aurora::vertcat(matrix1,matrix2).toHostMatrix();
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[0],1);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[1],2);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[8],9);
|
||||||
|
EXPECT_FLOAT_EQ(result.getData()[9],10);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(0),6);
|
||||||
|
EXPECT_FLOAT_EQ(result.getDimSize(1),1);
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user