diff --git a/CMakeLists.txt b/CMakeLists.txt index fa5b445..93cdf13 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,8 @@ target_compile_options(Aurora PRIVATE $<$: -arch=sm_75 --expt-extended-lambda >) target_link_libraries(Aurora PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) +target_link_libraries(Aurora PRIVATE ${CUDA_cublas_LIBRARY}) +target_link_libraries(Aurora PRIVATE ${CUDA_cusolver_LIBRARY}) endif(Aurora_USE_CUDA) find_package(GTest REQUIRED) @@ -75,6 +77,8 @@ target_compile_options(Aurora_Test PRIVATE $<$: -arch=sm_75 --expt-extended-lambda >) target_link_libraries(Aurora_Test PRIVATE ${CUDA_RUNTIME_LIBRARY} CUDA::cufft CUDA::cudart) +target_link_libraries(Aurora_Test PRIVATE ${CUDA_cublas_LIBRARY}) +target_link_libraries(Aurora_Test PRIVATE ${CUDA_cusolver_LIBRARY}) endif(Aurora_USE_CUDA) gtest_discover_tests(Aurora_Test ) #target_link_libraries(CreateMatchedFilter PRIVATE TBB::tbb) \ No newline at end of file diff --git a/src/Function1D.cu b/src/Function1D.cu index fbfd7e0..d95d289 100644 --- a/src/Function1D.cu +++ b/src/Function1D.cu @@ -1,14 +1,18 @@ #include "CudaMatrix.h" +#include "AuroraDefs.h" #include "Function1D.cuh" +#include "Function1D.h" #include "Matrix.h" #include +#include #include #include #include #include #include #include +#include using namespace Aurora; using namespace thrust::placeholders; @@ -701,3 +705,251 @@ CudaMatrix Aurora::log(const CudaMatrix& aMatrix, int aBaseNum) cudaDeviceSynchronize(); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } + +__global__ void expKernel(float* aInputData, float* aOutput, unsigned int aInputSize, bool aIsComplex) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + if(aIsComplex) + { + unsigned int index = 2 * idx; + float expReal = expf(aInputData[index]); + aOutput[index] = expReal * cosf(aInputData[index + 1]); + aOutput[index + 1] = expReal * sinf(aInputData[index + 1]); + } + else + { + aOutput[idx] = expf(aInputData[idx]); + } + } +} + +CudaMatrix Aurora::exp(const CudaMatrix& aMatrix) +{ + size_t size = aMatrix.getDataSize(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType()); + int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + expKernel<<>>(aMatrix.getData(), data, size, aMatrix.isComplex()); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} + +__global__ void modKernel(float* aInputData, float* aOutput, unsigned int aInputSize, float aModValue) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + aOutput[idx] = fmodf(aInputData[idx], aModValue); + } +} + +CudaMatrix Aurora::mod(const CudaMatrix& aMatrix, float aValue) +{ + if(aMatrix.isComplex()) + { + std::cerr<<"mod not support complex"<>>(aMatrix.getData(), data, size, aValue); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); +} + +__global__ void acosKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + aOutput[idx] = acosf(aInputData[idx]); + } +} + +CudaMatrix Aurora::acos(const CudaMatrix& aMatrix) +{ + if(aMatrix.isComplex()) + { + std::cerr<<"acos not support complex"<>>(aMatrix.getData(), data, size); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); +} + +__global__ void acosdKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + aOutput[idx] = acosf(aInputData[idx]) * 180 / PI; + } +} + +CudaMatrix Aurora::acosd(const CudaMatrix& aMatrix) +{ + if(aMatrix.isComplex()) + { + std::cerr<<"acos not support complex"<>>(aMatrix.getData(), data, size); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2)); +} + +__global__ void conjKernel(float* aInputData, float* aOutput, unsigned int aInputSize) +{ + unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < aInputSize) + { + unsigned int index = idx * 2; + aOutput[index] = aInputData[index]; + aOutput[index + 1] = -aInputData[index + 1]; + } +} + +CudaMatrix Aurora::conj(const CudaMatrix& aMatrix) +{ + if(!aMatrix.isComplex()) + { + return CudaMatrix::copyFromRawData(aMatrix.getData(),aMatrix.getDimSize(0),aMatrix.getDimSize(1),aMatrix.getDimSize(2)); + } + size_t size = aMatrix.getDataSize(); + float* data = nullptr; + cudaMalloc((void**)&data, sizeof(float) * size * aMatrix.getValueType()); + int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + conjKernel<<>>(aMatrix.getData(), data, size); + cudaDeviceSynchronize(); + return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); +} + + +float Aurora::norm(const CudaMatrix& aMatrix, NormMethod aNormMethod) +{ + float resultValue = 0; + if(aMatrix.getDims() > 2) + { + std::cerr<<"norm not support 3d matrix"<()); + return resultValue; + } + else + { + CudaMatrix result = aMatrix.deepCopy(); + thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); + resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); + return std::sqrt(resultValue); + } + } + //for 2 dims + if(aNormMethod == Aurora::NormF) + { + CudaMatrix result = aMatrix.deepCopy(); + thrust::transform(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), result.getData(), thrust::square()); + resultValue = thrust::reduce(thrust::device, result.getData(), result.getData() + result.getDataSize() * result.getValueType(), 0.0, thrust::plus()); + return std::sqrt(resultValue); + } + else if(aNormMethod == Aurora::Norm1) + { + for(int i=0; i()); + if(resultValue < tempValue) + { + resultValue = tempValue; + } + } + return resultValue; + } + else + { + if(aMatrix.isComplex()) + { + std::cerr<<"norm2 not support 2d complex matrix"< resultValue) + { + resultValue = S[i]; + } + } + + if (d_S ) cudaFree(d_S); + if (d_U ) cudaFree(d_U); + if (d_VT) cudaFree(d_VT); + if (devInfo) cudaFree(devInfo); + if (d_work) cudaFree(d_work); + if (cusolverH) cusolverDnDestroy(cusolverH); + if (stream) cudaStreamDestroy(stream); + return resultValue; + } +} diff --git a/src/Function1D.cuh b/src/Function1D.cuh index fbe6387..8d5c8f4 100644 --- a/src/Function1D.cuh +++ b/src/Function1D.cuh @@ -51,7 +51,17 @@ namespace Aurora CudaMatrix log(const CudaMatrix& aMatrix, int aBaseNum = -1); + CudaMatrix exp(const CudaMatrix& aMatrix); + CudaMatrix mod(const CudaMatrix& aMatrix, float aValue); + + CudaMatrix acos(const CudaMatrix& aMatrix); + + CudaMatrix acosd(const CudaMatrix& aMatrix); + + CudaMatrix conj(const CudaMatrix& aMatrix); + + float norm(const CudaMatrix& aMatrix, NormMethod aNormMethod); // ------compareSet---------------------------------------------------- diff --git a/test/Function1D_Cuda_Test.cpp b/test/Function1D_Cuda_Test.cpp index c1784e8..03c8c90 100644 --- a/test/Function1D_Cuda_Test.cpp +++ b/test/Function1D_Cuda_Test.cpp @@ -647,3 +647,156 @@ TEST_F(Function1D_Cuda_Test, compareSet) } } } + +TEST_F(Function1D_Cuda_Test, exp) +{ + Aurora::Matrix hostMatrix = Aurora::Matrix::fromRawData(new float[8]{1.1,2.2,3.3,4.4,5.5,6.6,7.7,8.8}, 2,4); + Aurora::CudaMatrix deviceMatrix = hostMatrix.toDeviceMatrix(); + + auto result1 = Aurora::exp(hostMatrix); + auto result2 = Aurora::exp(deviceMatrix).toHostMatrix(); + EXPECT_EQ(result2.getDataSize(), result1.getDataSize()); + EXPECT_EQ(result2.getValueType(), result1.getValueType()); + for(size_t i=0; i