Add transpose, horzcat, vertcat and Fix repmat threadPerBlock problem.

This commit is contained in:
sunwen
2023-11-30 10:22:06 +08:00
parent 87dd34095c
commit a8c65f21b1
3 changed files with 291 additions and 18 deletions

View File

@@ -20,6 +20,11 @@ using namespace thrust::placeholders;
namespace namespace
{ {
const int THREADS_PER_BLOCK = 256; const int THREADS_PER_BLOCK = 256;
const int THREADS_PER_BLOCK_DIM2_X = 16;
const int THREADS_PER_BLOCK_DIM2_Y = 16;
const int THREADS_PER_BLOCK_DIM3_X = 8;
const int THREADS_PER_BLOCK_DIM3_Y = 8;
const int THREADS_PER_BLOCK_DIM3_Z = 8;
} }
__global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize) __global__ void complexKernel(float* aInputData, float* aOutput, unsigned int aSize)
@@ -582,21 +587,28 @@ void Aurora::compareSet(CudaMatrix& aCompareMatrix,float compareValue, CudaMatri
} }
} }
__global__ void repMatKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) __global__ void repMatKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns,
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
{ {
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
{
return;
}
if(aIsComplex) if(aIsComplex)
{ {
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX); unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
unsigned int inPutIndex = 2 * (threadIdx.y * blockDim.x + threadIdx.x); unsigned int inPutIndex = 2 * (idY % aInputColumns * aInputRows + idX % aInputRows);
aOutput[outPutIndex] = aInputData[inPutIndex]; aOutput[outPutIndex] = aInputData[inPutIndex];
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1]; aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
} }
else else
{ {
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.y * blockDim.x + threadIdx.x]; aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idY % aInputColumns * aInputRows + idX % aInputRows];
} }
} }
@@ -606,14 +618,16 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi
{ {
return CudaMatrix(); return CudaMatrix();
} }
size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
dim3 gridSize(aRowTimes, aColumnTimes, 1); dim3 blockSize(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_Y, 1);
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM2_X - 1)/THREADS_PER_BLOCK_DIM2_X,
(columnSize + THREADS_PER_BLOCK_DIM2_Y - 1)/THREADS_PER_BLOCK_DIM2_Y, 1);
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes; size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes;
float* data = nullptr; float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size); cudaMalloc((void**)&data, sizeof(float) * size);
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, 1, aMatrix.isComplex());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType()); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2), aMatrix.getValueType());
} }
@@ -625,32 +639,44 @@ CudaMatrix Aurora::repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTi
return CudaMatrix(); return CudaMatrix();
} }
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), 1); size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
float* data = nullptr; float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size); cudaMalloc((void**)&data, sizeof(float) * size);
repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); repMatKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
} }
__global__ void repMat3DKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) __global__ void repMat3DKernel(float* aInputData, unsigned int aInputRows, unsigned int aInputColumns, unsigned int aInputSlices,
float* aOutput, unsigned int aOutputRows, unsigned int aOutputColumns, unsigned int aOutputSlices, bool aIsComplex)
{ {
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x; unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y; unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z; unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
if(idX >= aOutputRows || idY >= aOutputColumns || idZ >= aOutputSlices)
{
return;
}
if(aIsComplex) if(aIsComplex)
{ {
unsigned int outPutIndex = 2 * (idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX); unsigned int outPutIndex = 2 * (idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX);
unsigned int inPutIndex = 2 * (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x); unsigned int inPutIndex = 2 * (idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows);
aOutput[outPutIndex] = aInputData[inPutIndex]; aOutput[outPutIndex] = aInputData[inPutIndex];
aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1]; aOutput[outPutIndex + 1] = aInputData[inPutIndex + 1];
} }
else else
{ {
aOutput[idZ * blockDim.x * blockDim.y * gridDim.x * gridDim.y + idY * blockDim.x * gridDim.x + idX] = aInputData[threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x]; aOutput[idZ * aOutputRows * aOutputColumns + idY * aOutputRows + idX] = aInputData[idZ % aInputSlices * aInputRows * aInputColumns + idY % aInputColumns * aInputRows + idX % aInputRows];
} }
} }
@@ -662,13 +688,18 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn
return CudaMatrix(); return CudaMatrix();
} }
dim3 blockSize(aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); size_t rowSize = aMatrix.getDimSize(0) * aRowTimes;
dim3 gridSize(aRowTimes, aColumnTimes, aSliceTimes); size_t columnSize = aMatrix.getDimSize(1) * aColumnTimes;
size_t sliceSize = aMatrix.getDimSize(2) * aSliceTimes;
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
dim3 gridSize((rowSize + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
(columnSize + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
(sliceSize + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes; size_t size = aMatrix.getDataSize() * aMatrix.getValueType() * aRowTimes * aColumnTimes * aSliceTimes;
float* data = nullptr; float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size); cudaMalloc((void**)&data, sizeof(float) * size);
repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex()); repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), data, rowSize, columnSize, sliceSize, aMatrix.isComplex());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType()); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
} }
@@ -953,3 +984,137 @@ float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod)
return resultValue; return resultValue;
} }
} }
__global__ void transposeKernel(float* aInputData, float* aOutput, unsigned int aRowSize, unsigned int aColumnSize, bool aIsComplex)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < aRowSize && idy < aColumnSize)
{
unsigned int inputindex = idy * aRowSize + idx;
unsigned int outputIndex = idx * aColumnSize + idy;
if(aIsComplex)
{
aOutput[2*outputIndex] = aInputData[2*inputindex];
aOutput[2*outputIndex + 1] = aInputData[2*inputindex + 1];
}
else
{
aOutput[outputIndex] = aInputData[inputindex];
}
}
}
CudaMatrix Aurora::transpose(const CudaMatrix& aMatrix)
{
//not surpport for 3 dims.
if(aMatrix.isNull() || aMatrix.getDimSize(2) > 1)
{
std::cerr<<"transpose not support 3d complex matrix"<<std::endl;
return CudaMatrix();
}
size_t size = aMatrix.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType());
dim3 blocksPerGrid((aMatrix.getDimSize(0) + THREADS_PER_BLOCK_DIM2_X - 1) / THREADS_PER_BLOCK_DIM2_X, (aMatrix.getDimSize(1) + THREADS_PER_BLOCK_DIM2_Y - 1) / THREADS_PER_BLOCK_DIM2_Y, 1);
dim3 threadPerBlock(THREADS_PER_BLOCK_DIM2_X, THREADS_PER_BLOCK_DIM2_X, 1);
transposeKernel<<<blocksPerGrid, threadPerBlock>>>(aMatrix.getData(), data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(1), aMatrix.getDimSize(0), aMatrix.getDimSize(2), aMatrix.getValueType());
}
CudaMatrix Aurora::horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
aMatrix1.getDimSize(0) != aMatrix2.getDimSize(0) || aMatrix1.getValueType() != aMatrix2.getValueType())
{
std::cerr<<"horzcat must have same rows and slices"<<std::endl;
return CudaMatrix();
}
int column1 = aMatrix1.getDimSize(1);
int column2 = aMatrix2.getDimSize(1);
int slice = aMatrix1.getDimSize(2);
int row = aMatrix1.getDimSize(0);
size_t size1= row*column1*aMatrix1.getValueType();
size_t size2= row*column2*aMatrix2.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * (aMatrix1.getDataSize() + aMatrix2.getDataSize()) * aMatrix1.getValueType());
size_t sliceStride = row*(column1+column2) * aMatrix1.getValueType();
for (size_t i = 0; i < slice; i++)
{
cudaMemcpy(data + i*sliceStride, aMatrix1.getData() + i*size1, sizeof(float) * size1, cudaMemcpyDeviceToDevice);
cudaMemcpy(data + i*sliceStride + size1, aMatrix2.getData() + i*size2, sizeof(float) * size2, cudaMemcpyDeviceToDevice);
}
return CudaMatrix::fromRawData(data, row, column1+column2, slice, aMatrix1.getValueType());
}
__global__ void vertcatKernel(float* aInputData1, unsigned int aInputData1RowSize, float* aInputData2, unsigned int aInputData2RowSize,
float* aOutput, unsigned int aOutputRowSize, unsigned int aOutputColumnSize, unsigned int aOutputSliceSize, bool aIsComplex)
{
unsigned int idX = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idY = blockIdx.y * blockDim.y + threadIdx.y;
unsigned int idZ = blockIdx.z * blockDim.z + threadIdx.z;
if(idX >= aOutputRowSize || idY >= aOutputColumnSize || idZ >= aOutputSliceSize)
{
return;
}
if(aIsComplex)
{
if(idX < aInputData1RowSize)
{
unsigned int inputIndex = idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX;
unsigned int outputIndex = idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
aOutput[2*outputIndex] = aInputData1[2*inputIndex];
aOutput[2*outputIndex + 1] = aInputData1[2*inputIndex + 1];
}
else
{
unsigned int inputIndex = idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize;
unsigned int outputIndex =idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX;
aOutput[2*outputIndex] = aInputData2[2*inputIndex];
aOutput[2*outputIndex + 1] = aInputData2[2*inputIndex + 1];
}
}
else
{
if(idX < aInputData1RowSize)
{
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData1[idZ * aInputData1RowSize * aOutputColumnSize + idY * aInputData1RowSize + idX];
}
else
{
aOutput[idZ * aOutputRowSize * aOutputColumnSize + idY * aOutputRowSize + idX] = aInputData2[idZ * aInputData2RowSize * aOutputColumnSize + idY * aInputData2RowSize + idX - aInputData1RowSize];
}
}
}
CudaMatrix Aurora::vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2)
{
if(aMatrix1.isNull() || aMatrix2.isNull() || aMatrix1.getDimSize(2) != aMatrix2.getDimSize(2) ||
aMatrix1.getDimSize(1) != aMatrix2.getDimSize(1) || aMatrix1.getValueType() != aMatrix2.getValueType())
{
return CudaMatrix();
}
size_t outputRows = aMatrix1.getDimSize(0) + aMatrix2.getDimSize(0);
size_t outputColumns = aMatrix1.getDimSize(1);
size_t outputSlices = aMatrix1.getDimSize(2);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * outputRows * outputColumns * outputSlices * aMatrix1.getValueType());
dim3 blockSize(THREADS_PER_BLOCK_DIM3_X, THREADS_PER_BLOCK_DIM3_Y, THREADS_PER_BLOCK_DIM3_Z);
dim3 gridSize((outputRows + THREADS_PER_BLOCK_DIM3_X - 1)/THREADS_PER_BLOCK_DIM3_X,
(outputColumns + THREADS_PER_BLOCK_DIM3_Y - 1)/THREADS_PER_BLOCK_DIM3_Y,
(outputSlices + THREADS_PER_BLOCK_DIM3_Z - 1)/THREADS_PER_BLOCK_DIM3_Z);
vertcatKernel<<<gridSize, blockSize>>>(aMatrix1.getData(), aMatrix1.getDimSize(0), aMatrix2.getData(), aMatrix2.getDimSize(0),
data, outputRows, outputColumns, outputSlices, aMatrix1.isComplex());
cudaDeviceSynchronize();
return Aurora::CudaMatrix::fromRawData(data, outputRows, outputColumns, outputSlices, aMatrix1.getValueType());
}

View File

@@ -63,6 +63,12 @@ namespace Aurora
float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod); float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod);
CudaMatrix transpose(const CudaMatrix& aMatrix);
CudaMatrix horzcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
CudaMatrix vertcat(const CudaMatrix& aMatrix1, const CudaMatrix& aMatrix2);
// ------compareSet---------------------------------------------------- // ------compareSet----------------------------------------------------

View File

@@ -280,6 +280,21 @@ TEST_F(Function1D_Cuda_Test, repmat)
} }
} }
TEST_F(Function1D_Cuda_Test, repmat3d)
{
Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[12]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8,9,10,11,12}, 2,3,2);
Aurora::CudaMatrix deviceMatrix = hostMatrix.toDeviceMatrix();
auto result1 = Aurora::repmat3d(hostMatrix,3,6,4);
auto result2 = Aurora::repmat3d(deviceMatrix,3,6,4).toHostMatrix();
EXPECT_EQ(result2.getDataSize(), result1.getDataSize());
EXPECT_EQ(result2.getValueType(), Aurora::Normal);
for(size_t i=0; i<result1.getDataSize(); ++i)
{
EXPECT_EQ(result1[i], result2[i]);
}
}
TEST_F(Function1D_Cuda_Test, log) TEST_F(Function1D_Cuda_Test, log)
{ {
Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8}, 2,4); Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8}, 2,4);
@@ -800,3 +815,90 @@ TEST_F(Function1D_Cuda_Test, norm) {
result = Aurora::norm(deviceMatrix,Aurora::NormMethod::NormF); result = Aurora::norm(deviceMatrix,Aurora::NormMethod::NormF);
EXPECT_FLOAT_AE(result,44.3847); EXPECT_FLOAT_AE(result,44.3847);
} }
TEST_F(Function1D_Cuda_Test, transpose) {
float *data = new float[6]{1,2,3,4,5,6};
auto matrix = Aurora::Matrix::fromRawData(data, 3,2).toDeviceMatrix();
auto result = Aurora::transpose(matrix).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],4);
EXPECT_FLOAT_EQ(result.getData()[2],2);
EXPECT_FLOAT_EQ(result.getData()[3],5);
EXPECT_FLOAT_EQ(result.getData()[4],3);
EXPECT_FLOAT_EQ(result.getData()[5],6);
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
data = new float[12]{1,2,3,4,5,6,7,8,9,10,11,12};
matrix = Aurora::Matrix::fromRawData(data, 3,2,1,Aurora::Complex).toDeviceMatrix();
result = Aurora::transpose(matrix).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],2);
EXPECT_FLOAT_EQ(result.getData()[2],7);
EXPECT_FLOAT_EQ(result.getData()[3],8);
EXPECT_FLOAT_EQ(result.getData()[4],3);
EXPECT_FLOAT_EQ(result.getData()[5],4);
EXPECT_FLOAT_EQ(result.getData()[6],9);
EXPECT_FLOAT_EQ(result.getData()[7],10);
EXPECT_FLOAT_EQ(result.getData()[8],5);
EXPECT_FLOAT_EQ(result.getData()[9],6);
EXPECT_FLOAT_EQ(result.getData()[10],11);
EXPECT_FLOAT_EQ(result.getData()[11],12);
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
EXPECT_FLOAT_EQ(result.getDimSize(0),2);
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
}
TEST_F(Function1D_Cuda_Test, horzcat) {
float *data1 = new float[6]{1,2,3,4,5,6};
auto matrix1 = Aurora::Matrix::fromRawData(data1, 3,2).toDeviceMatrix();
float *data2 = new float[9]{7,8,9,10,11,12,13,14,15};
auto matrix2 = Aurora::Matrix::fromRawData(data2, 3,3).toDeviceMatrix();
auto result = Aurora::horzcat(matrix1,matrix2).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],2);
EXPECT_FLOAT_EQ(result.getData()[10],11);
EXPECT_FLOAT_EQ(result.getData()[14],15);
EXPECT_FLOAT_EQ(result.getDimSize(0),3);
EXPECT_FLOAT_EQ(result.getDimSize(1),5);
data1 = new float[6]{1,2,3,4,5,6};
matrix1 = Aurora::Matrix::fromRawData(data1, 3,1,1,Aurora::Complex).toDeviceMatrix();
data2 = new float[6]{7,8,9,10,11,12};
matrix2 = Aurora::Matrix::fromRawData(data2, 3,1,1,Aurora::Complex).toDeviceMatrix();
result = Aurora::horzcat(matrix1,matrix2).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],2);
EXPECT_FLOAT_EQ(result.getData()[8],9);
EXPECT_FLOAT_EQ(result.getData()[9],10);
EXPECT_FLOAT_EQ(result.getDimSize(0),3);
EXPECT_FLOAT_EQ(result.getDimSize(1),2);
}
TEST_F(Function1D_Cuda_Test, vertcat) {
float *data1 = new float[6]{1,2,3,4,5,6};
auto matrix1 = Aurora::Matrix::fromRawData(data1, 2,3).toDeviceMatrix();
float *data2 = new float[9]{7,8,9,10,11,12,13,14,15};
auto matrix2 = Aurora::Matrix::fromRawData(data2, 3,3).toDeviceMatrix();
auto result = Aurora::vertcat(matrix1,matrix2).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],2);
EXPECT_FLOAT_EQ(result.getData()[10],5);
EXPECT_FLOAT_EQ(result.getData()[14],15);
EXPECT_FLOAT_EQ(result.getDimSize(0),5);
EXPECT_FLOAT_EQ(result.getDimSize(1),3);
data1 = new float[6]{1,2,3,4,5,6};
matrix1 = Aurora::Matrix::fromRawData(data1, 3,1,1,Aurora::Complex).toDeviceMatrix();
data2 = new float[6]{7,8,9,10,11,12};
matrix2 = Aurora::Matrix::fromRawData(data2, 3,1,1,Aurora::Complex).toDeviceMatrix();
result = Aurora::vertcat(matrix1,matrix2).toHostMatrix();
EXPECT_FLOAT_EQ(result.getData()[0],1);
EXPECT_FLOAT_EQ(result.getData()[1],2);
EXPECT_FLOAT_EQ(result.getData()[8],9);
EXPECT_FLOAT_EQ(result.getData()[9],10);
EXPECT_FLOAT_EQ(result.getDimSize(0),6);
EXPECT_FLOAT_EQ(result.getDimSize(1),1);
}