Add log function and fix swapped block/thread arguments in kernel launches.
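The kernels below were launched as <<<THREADS_PER_BLOCK, blocksPerGrid>>>, i.e. with the two launch-configuration arguments swapped; CUDA expects the grid dimension (blocks per grid) first and the block dimension (threads per block) second. A minimal sketch of the intended launch pattern, assuming a 1-D launch, a placeholder kernel, and THREADS_PER_BLOCK = 256 (the actual constant is defined elsewhere in the library):

    #include <cuda_runtime.h>

    // Placeholder element-wise kernel, used only to illustrate the launch convention.
    __global__ void exampleKernel(const float* aInput, float* aOutput, unsigned int aSize)
    {
        unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if (idx < aSize)
        {
            aOutput[idx] = aInput[idx];
        }
    }

    void launchExample(const float* aInput, float* aOutput, unsigned int aSize)
    {
        const int THREADS_PER_BLOCK = 256;                                           // assumed value
        int blocksPerGrid = (aSize + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;     // round up so every element gets a thread
        exampleKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aInput, aOutput, aSize); // grid size first, block size second
        cudaDeviceSynchronize();
    }

The new logKernel follows the same pattern: it computes the natural logarithm when aBaseNum is -1 and otherwise applies the change-of-base identity log_b(x) = ln(x) / ln(b), evaluated as logf(x) / logf(b).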
@@ -36,7 +36,7 @@ CudaMatrix Aurora::complex(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize() * Aurora::Complex);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    complexKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    complexKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Complex);
 }
@@ -61,7 +61,7 @@ CudaMatrix Aurora::real(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    realKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    realKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
 }
@@ -86,7 +86,7 @@ CudaMatrix Aurora::imag(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * aMatrix.getDataSize());
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    imageKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    imageKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), Aurora::Normal);
 }
@@ -106,7 +106,7 @@ CudaMatrix Aurora::ceil(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    ceilKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -117,7 +117,7 @@ CudaMatrix Aurora::ceil(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    ceilKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    ceilKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -137,7 +137,7 @@ CudaMatrix Aurora::round(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    roundKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -148,7 +148,7 @@ CudaMatrix Aurora::round(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    roundKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    roundKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -168,7 +168,7 @@ CudaMatrix Aurora::floor(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    floorKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -179,7 +179,7 @@ CudaMatrix Aurora::floor(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    floorKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    floorKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -204,7 +204,7 @@ CudaMatrix Aurora::sqrt(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    sqrtKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -220,7 +220,7 @@ CudaMatrix Aurora::sqrt(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    sqrtKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size);
+    sqrtKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size);
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -247,7 +247,7 @@ CudaMatrix Aurora::abs(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    absKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
+    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
 }
@@ -258,7 +258,7 @@ CudaMatrix Aurora::abs(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    absKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
+    absKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aMatrix.isComplex());
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2));
 }
@@ -297,7 +297,7 @@ CudaMatrix Aurora::sign(const CudaMatrix& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    signKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
+    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -308,7 +308,7 @@ CudaMatrix Aurora::sign(const CudaMatrix&& aMatrix)
     float* data = nullptr;
     cudaMalloc((void**)&data, sizeof(float) * size);
     int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    signKernel<<<THREADS_PER_BLOCK, blocksPerGrid>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
+    signKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
 }
@@ -402,4 +402,37 @@ CudaMatrix Aurora::repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumn
     repMat3DKernel<<<gridSize, blockSize>>>(aMatrix.getData(), data, aMatrix.getDataSize(), aMatrix.isComplex());
     cudaDeviceSynchronize();
     return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0) * aRowTimes, aMatrix.getDimSize(1) * aColumnTimes, aMatrix.getDimSize(2) * aSliceTimes, aMatrix.getValueType());
 }
+
+__global__ void logKernel(float* aInputData, float* aOutput, unsigned int aInputSize, int aBaseNum)
+{
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx < aInputSize)
+    {
+        if(aBaseNum == -1)
+        {
+            aOutput[idx] = logf(aInputData[idx]);
+        }
+        else
+        {
+            float value = logf(aBaseNum);
+            aOutput[idx] = logf(aInputData[idx]) / value;
+        }
+    }
+}
+
+CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum)
+{
+    if(aMatrix.getValueType() == Aurora::Complex)
+    {
+        std::cerr<<"log not support complex"<<std::endl;
+        return CudaMatrix();
+    }
+    size_t size = aMatrix.getDataSize();
+    float* data = nullptr;
+    cudaMalloc((void**)&data, sizeof(float) * size);
+    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    logKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, size, aBaseNum);
+    cudaDeviceSynchronize();
+    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
+}

@@ -45,6 +45,8 @@ namespace Aurora
     CudaMatrix repmat(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes);
 
     CudaMatrix repmat3d(const CudaMatrix& aMatrix,int aRowTimes, int aColumnTimes, int aSliceTimes);
+
+    CudaMatrix log(const CudaMatrix& aMatrix, int aBaseNum = -1);
 }
 
 #endif //AURORA_CUDA_FUNCTION1D_H
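For reference, a hedged usage sketch of the new API, assuming an already-constructed, real-valued Aurora::CudaMatrix named m (only the declaration added above is used):

    Aurora::CudaMatrix naturalLog = Aurora::log(m);    // aBaseNum defaults to -1, i.e. natural logarithm
    Aurora::CudaMatrix logBase2   = Aurora::log(m, 2); // base 2 via the change-of-base path in logKernel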