#include "AuroraDefs.h" #include "CudaMatrix.h" #include "Function1D.h" #include "Function1D.cuh" #include "Function3D.h" #include "Matrix.h" #include #include #include #include #include #include //this 2 header, only need by cuda version >= 12 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "Matrix.h" #include "cufft.h" #include using namespace Aurora; namespace { const int THREADS_PER_BLOCK = 256; } __global__ void maxColKernel(float *aInputData, float *aOutput, unsigned int aColSize) { // 确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x) { if (threadIdx.x + offset < aColSize) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset]); } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } __global__ void maxRowKernel(float *aInputData, float *aOutput, unsigned int aColSize, unsigned int aRowSize) { // 确定每个thread的基础index unsigned int idx = threadIdx.x * aColSize + blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x) { if (threadIdx.x + offset < aRowSize) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]); } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) { long a, b; return max(aMatrix, direction, a, b); } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx) { if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" 
: "max() not support complex value type!") << std::endl; return CudaMatrix(); } // 针对向量行等于列 if (aMatrix.isVector()) { direction = All; } switch (direction) { case All: { thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); auto max_iter = thrust::max_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize()); int index = max_iter - d_ptr; rowIdx = index % aMatrix.getDimSize(0); colIdx = index / aMatrix.getDimSize(0); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float)); auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1); ret.setValue(0, *max_iter); return ret; } case Row: { float *matData = aMatrix.getData(); float *retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0)); maxRowKernel<<>>(matData, retData, aMatrix.getDimSize(0), rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1); return ret; } case Column: default: { float *matData = aMatrix.getData(); float *retData = nullptr; int colCount = aMatrix.getDimSize(0); cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1)); maxColKernel<<>>(matData, retData, colCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1)); return ret; } } } CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) { // col-vec x mat if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { size_t size = aMat.getDataSize(); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fmaxf(x, y); }; for (int i = 0; i < aMat.getDimSize(1); ++i) { thrust::transform(thrust::device, aVec.getData(), aVec.getData() + aVec.getDataSize(), aMat.getData() + aMat.getDimSize(0) * i, data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } // row-vec x mat else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1)) { size_t size = aMat.getDataSize(); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * size); for (int i = 0; i < aMat.getDimSize(1); ++i) { float v = aVec.getValue(i); auto lambda = [=] __host__ __device__(const float &x) { return fmaxf(x, v); }; thrust::transform(thrust::device, aMat.getData() + aMat.getDimSize(0) * i, aMat.getData() + aMat.getDimSize(0) * (i + 1), data + aMat.getDimSize(0) * i, lambda); } return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType()); } std::cerr << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl; return CudaMatrix(); } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) { if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") << std::endl; return CudaMatrix(); } if (aOther.getDimSize(2) > 1 || aOther.isComplex()) { std::cerr << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" 
: "max() not support complex value type!") << std::endl; return CudaMatrix(); } // same shape if (aMatrix.compareShape(aOther)) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fmaxf(x, y); }; thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), aOther.getData(), data, lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } // one is scalar else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) { float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0); auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix; return max(matrix, scalar); } else if (aMatrix.isVector()) { return ::vxmMax(aMatrix, aOther); } else if (aOther.isVector()) { return ::vxmMax(aOther, aMatrix); } std::cerr << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl; return CudaMatrix(); } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue) { size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__(const float &x) { return fmaxf(x, aValue); }; thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), data, lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } __global__ void minColKernel(float *aInputData, float *aOutput, unsigned int aColSize) { // 确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x) { if (threadIdx.x + offset < aColSize) { shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset]); } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x / 2; i > 0; i >>= 1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } __global__ void minRowKernel(float *aInputData, float *aOutput, unsigned int aColSize, unsigned int aRowSize) { // 确定每个thread的基础index unsigned int idx = threadIdx.x * aColSize + blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? 
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) {
    long a, b;
    return min(aMatrix, direction, a, b);
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long &rowIdx, long &colIdx) {
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!"
                                                : "min() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    // For a vector, row and column reductions coincide; reduce over all elements.
    if (aMatrix.isVector()) {
        direction = All;
    }
    switch (direction) {
    case All: {
        thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
        auto min_iter = thrust::min_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize());
        int index = min_iter - d_ptr;
        rowIdx = index % aMatrix.getDimSize(0);
        colIdx = index / aMatrix.getDimSize(0);
        float *data = nullptr;
        cudaMalloc((void **)&data, sizeof(float));
        auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1);
        ret.setValue(0, *min_iter);
        return ret;
    }
    case Row: {
        float *matData = aMatrix.getData();
        float *retData = nullptr;
        int rowCount = aMatrix.getDimSize(1);
        cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0));
        // One block per row.
        minRowKernel<<<aMatrix.getDimSize(0), THREADS_PER_BLOCK>>>(matData, retData, aMatrix.getDimSize(0), rowCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1);
        return ret;
    }
    case Column:
    default: {
        float *matData = aMatrix.getData();
        float *retData = nullptr;
        int colCount = aMatrix.getDimSize(0);
        cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1));
        // One block per column.
        minColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(matData, retData, colCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1));
        return ret;
    }
    }
}

CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
    // col-vec x mat: compare the column vector against every column of the matrix.
    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
        size_t size = aMat.getDataSize();
        float *data = nullptr;
        cudaMalloc((void **)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fminf(x, y); };
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            thrust::transform(thrust::device, aVec.getData(), aVec.getData() + aVec.getDataSize(),
                              aMat.getData() + aMat.getDimSize(0) * i, data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), aMat.getDimSize(1), aMat.getDimSize(2),
                                               aMat.getValueType());
    }
    // row-vec x mat: compare each column against the matching scalar of the row vector.
    else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1)) {
        size_t size = aMat.getDataSize();
        float *data = nullptr;
        cudaMalloc((void **)&data, sizeof(float) * size);
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            float v = aVec.getValue(i);
            auto lambda = [=] __host__ __device__(const float &x) { return fminf(x, v); };
            thrust::transform(thrust::device, aMat.getData() + aMat.getDimSize(0) * i,
                              aMat.getData() + aMat.getDimSize(0) * (i + 1), data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0), aMat.getDimSize(1), aMat.getDimSize(2),
                                               aMat.getValueType());
    }
    std::cerr << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl;
    return CudaMatrix();
}
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!"
                                                : "min() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    if (aOther.getDimSize(2) > 1 || aOther.isComplex()) {
        std::cerr << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!"
                                               : "min() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    // same shape
    if (aMatrix.compareShape(aOther)) {
        size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
        float *data = nullptr;
        cudaMalloc((void **)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__(const float &x, const float &y) { return fminf(x, y); };
        thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(),
                          aOther.getData(), data, lambda);
        return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1),
                                               aMatrix.getDimSize(2), aMatrix.getValueType());
    }
    // one operand is a scalar
    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) {
        float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0);
        auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix;
        return min(matrix, scalar);
    } else if (aMatrix.isVector()) {
        return ::vxmMin(aMatrix, aOther);
    } else if (aOther.isVector()) {
        return ::vxmMin(aOther, aMatrix);
    }
    std::cerr << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl;
    return CudaMatrix();
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__(const float &x) { return fminf(x, aValue); };
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2),
                                           aMatrix.getValueType());
}

__global__ void sumColKernel(float *aInputData, float *aOutput, int aColEleCount) {
    // One block per column; compute this thread's index within the column.
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    // Accumulate in double to limit rounding drift on long columns.
    __shared__ double shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? aInputData[idx] : 0.0;
    __syncthreads();
    // Each thread folds its strided segment into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aColEleCount) {
            shared_data[threadIdx.x] += (double)aInputData[idx + offset];
        }
        __syncthreads();
    }
    // Tree-reduce the first blockDim.x slots.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[i + threadIdx.x];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = (float)shared_data[0];
    }
}

__global__ void sumRowKernel(float *aInputData, float *aOutput, unsigned int aColEleCount,
                             unsigned int aRowEleCount) {
    // One block per row; compute this thread's base index within the row.
    unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aRowEleCount) ? aInputData[idx] : 0.0;
    __syncthreads();
    // Each thread folds its strided segment into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowEleCount) {
            shared_data[threadIdx.x] += aInputData[idx + offset * aColEleCount];
        }
        __syncthreads();
    }
    // Tree-reduce the first blockDim.x slots.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[threadIdx.x + i];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}
__global__ void sumZAllColKernel(float *aInputData, float *aOutput, int aTotalSize) {
    // Each block sums one 4096-element chunk of the interleaved complex array.
    unsigned int idx = blockIdx.x * 4096 + threadIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element (real, imag) into shared memory.
    bool flag = threadIdx.x < 4096 && idx < aTotalSize;
    shared_data[threadIdx.x][0] = flag ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = flag ? aInputData[idx * 2 + 1] : 0.0;
    __syncthreads();
    // Each thread folds its strided segment into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < 4096; offset += blockDim.x) {
        if (threadIdx.x + offset < 4096 && idx + offset < aTotalSize) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
        }
        __syncthreads();
    }
    // Tree-reduce the first blockDim.x slots.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
            shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's sums in planar layout: all real parts
    // first, then all imaginary parts (offset by gridDim.x).
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0][0];
        aOutput[blockIdx.x + gridDim.x] = shared_data[0][1];
    }
}

__global__ void sumZColKernel(float *aInputData, float *aOutput, int aColEleCount) {
    // One block per column of interleaved complex values.
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element into shared memory.
    shared_data[threadIdx.x][0] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2 + 1] : 0.0;
    __syncthreads();
    // Each thread folds its strided segment into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aColEleCount) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
        }
        __syncthreads();
    }
    // Tree-reduce the first blockDim.x slots.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
            shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's sum as one interleaved complex value.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x * 2] = shared_data[0][0];
        aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
    }
}

__global__ void sumZRowKernel(float *aInputData, float *aOutput, unsigned int aColEleCount,
                              unsigned int aRowEleCount) {
    // One block per row of interleaved complex values.
    unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element into shared memory.
    shared_data[threadIdx.x][0] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2 + 1] : 0.0;
    __syncthreads();
    // Each thread folds its strided segment into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowEleCount) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * aColEleCount * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * aColEleCount * 2 + 1];
        }
        __syncthreads();
    }
    // Tree-reduce the first blockDim.x slots.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
            shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's sum as one interleaved complex value.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x * 2] = shared_data[0][0];
        aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
    }
}
CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction) {
    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "sum() not support 3D data!" << std::endl;
        return CudaMatrix();
    }
    // A single-row matrix has no column extent; treat it as a row reduction.
    if (direction == Column && aMatrix.getDimSize(0) == 1) {
        direction = Row;
    }
    if (!aMatrix.isComplex()) {
        switch (direction) {
        case All: {
            float *data = nullptr;
            cudaMalloc((void **)&data, sizeof(float));
            auto ret = CudaMatrix::fromRawData(data, 1, 1, 1);
            float result = thrust::reduce(thrust::device, aMatrix.getData(),
                                          aMatrix.getData() + aMatrix.getDataSize(), 0.0f, thrust::plus<float>());
            ret.setValue(0, result);
            return ret;
        }
        case Row: {
            float *matData = aMatrix.getData();
            float *retData = nullptr;
            int rowCount = aMatrix.getDimSize(1);
            cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0));
            // One block per row.
            sumRowKernel<<<aMatrix.getDimSize(0), THREADS_PER_BLOCK>>>(matData, retData, aMatrix.getDimSize(0),
                                                                       rowCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1);
            return ret;
        }
        case Column:
        default: {
            float *matData = aMatrix.getData();
            float *retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            if (colElementCount == 1)
                return aMatrix;
            cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1));
            // One block per column.
            sumColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(matData, retData, colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1));
            return ret;
        }
        }
    } else {
        switch (direction) {
        case All: {
            float *matData = aMatrix.getData();
            float *retData = nullptr;
            // Divide the whole array into 4096-element chunks, then reduce
            // each chunk like a column sum.
            int fakeCol = (int)ceilf((float)aMatrix.getDataSize() / 4096.0f);
            cudaMalloc((void **)&retData, sizeof(float) * 2 * fakeCol);
            auto ret = CudaMatrix::fromRawData(retData, 1, fakeCol, 1, Complex);
            sumZAllColKernel<<<fakeCol, THREADS_PER_BLOCK>>>(matData, retData, aMatrix.getDataSize());
            cudaDeviceSynchronize();
            float *result_data = nullptr;
            cudaMalloc((void **)&result_data, sizeof(float) * 2);
            auto ret2 = CudaMatrix::fromRawData(result_data, 1, 1, 1, Complex);
            // sumZAllColKernel wrote planar output: fakeCol real parts, then
            // fakeCol imaginary parts.
            float result = thrust::reduce(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), 0.0f,
                                          thrust::plus<float>());
            ret2.setValue(0, result);
            result = thrust::reduce(thrust::device, ret.getData() + ret.getDataSize(),
                                    ret.getData() + ret.getDataSize() * 2, 0.0f, thrust::plus<float>());
            ret2.setValue(1, result);
            return ret2;
        }
        case Row: {
            float *matData = aMatrix.getData();
            float *retData = nullptr;
            int rowElementCount = aMatrix.getDimSize(1);
            cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(0) * 2);
            sumZRowKernel<<<aMatrix.getDimSize(0), THREADS_PER_BLOCK>>>(matData, retData, aMatrix.getDimSize(0),
                                                                        rowElementCount);
            cudaDeviceSynchronize();
            // The row sums are interleaved complex values, so tag the result
            // as Complex (matching the Column case below).
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1, 1, Complex);
            return ret;
        }
        case Column:
        default: {
            float *matData = aMatrix.getData();
            float *retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            if (colElementCount == 1)
                return aMatrix;
            cudaMalloc((void **)&retData, sizeof(float) * aMatrix.getDimSize(1) * 2);
            sumZColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(matData, retData, colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1), 1, Complex);
            return ret;
        }
        }
    }
}
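// Usage sketch (illustrative, not part of the library): sum() reduces along
// columns by default and keeps complex data complex. Column sums accumulate
// in double inside sumColKernel before narrowing back to float, which limits
// rounding drift on long columns.
//
//   CudaMatrix colSums = Aurora::sum(m, Aurora::Column); // 1xN
//   CudaMatrix rowSums = Aurora::sum(m, Aurora::Row);    // Mx1
//   CudaMatrix total   = Aurora::sum(m, Aurora::All);    // 1x1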
CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction) {
    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "mean() not support 3D data!" << std::endl;
        return CudaMatrix();
    }
    // A single-row matrix has no column extent; treat it as a row reduction.
    if (direction == Column && aMatrix.getDimSize(0) == 1) {
        direction = Row;
    }
    if (!aMatrix.isComplex()) {
        switch (direction) {
        case All: {
            auto ret = sum(aMatrix, All);
            ret.setValue(0, ret.getValue(0) / ((float)aMatrix.getDataSize()));
            return ret;
        }
        case Row: {
            auto ret = sum(aMatrix, Row);
            float count = (float)aMatrix.getDimSize(1);
            auto lambda = [=] __device__(const float &v) { return v / count; };
            thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(),
                              lambda);
            return ret;
        }
        case Column:
        default: {
            auto ret = sum(aMatrix, Column);
            float count = (float)aMatrix.getDimSize(0);
            auto lambda = [=] __device__(const float &v) { return v / count; };
            thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(),
                              lambda);
            return ret;
        }
        }
    } else {
        std::cerr << "mean() not support complex data!" << std::endl;
        return CudaMatrix();
    }
}

CudaMatrix Aurora::std(const CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr << (aMatrix.getDimSize(2) > 1 ? "std() not support 3D data!"
                                                : "std() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    auto src = aMatrix.isComplex() ? Aurora::abs(aMatrix) : aMatrix.deepCopy();
    int calc_size = src.getDimSize(0) == 1 ? src.getDimSize(1) : src.getDimSize(0);
    auto meanM = Aurora::mean(src);
    // Sample standard deviation: sqrt(sum((x - mean)^2) / (N - 1)).
    return sqrt(Aurora::sum((src - meanM) ^ 2.0) / ((float)calc_size - 1.0f));
}
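// Note (illustrative check): std() above is the sample standard deviation,
// sqrt(sum((x - mean(x))^2) / (N - 1)), i.e. MATLAB's default normalization.
// For x = {2, 4, 4, 4, 5, 5, 7, 9}: mean = 5, the squared deviations sum to
// 32, so std = sqrt(32 / 7) ≈ 2.138.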
: "sort() not support complex value type!") << std::endl; return CudaMatrix(); } return sort(std::forward(aMatrix.deepCopy()), direction); } CudaMatrix Aurora::sort(CudaMatrix &&aMatrix, FunctionDirection direction) { if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") << std::endl; return CudaMatrix(); } float *data = aMatrix.getData(); int colElementCount = aMatrix.getDimSize(0); switch (direction) { case Row: { for (size_t i = 0; i < colElementCount; i++) { thrust::sort(thrust::device, RowElementIterator(data + i, colElementCount), RowElementIterator(data + aMatrix.getDataSize() + i, colElementCount)); } return aMatrix; } case Column: { int rowElementCount = aMatrix.getDimSize(1); for (size_t i = 0; i < rowElementCount; i++) { thrust::sort(thrust::device, data + i * colElementCount, data + (i + 1) * colElementCount); } return aMatrix; } default: { std::cerr << "Unsupported direction for sort!" << std::endl; return CudaMatrix(); } } } __global__ void immseKernel(float *aInputData1, float *aInputData2, float *aOutputData, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { aOutputData[idx] = powf(aInputData1[idx] - aInputData2[idx], 2); } } float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) { if (aImageA.getDims() != 2 || aImageB.getDims() != 2) { std::cerr << "Fail! cuda immse args must all 2d matrix!"; return 0.0; } if (!aImageB.compareShape(aImageA)) { std::cerr << "Fail! cuda immse args must be same shape!"; return 0.0; } if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal) { std::cerr << "Fail! cuda immse args must be normal value type!"; return 0.0; } unsigned int size = aImageA.getDataSize(); float *data = nullptr; cudaMalloc((void **)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; immseKernel<<>>(aImageA.getData(), aImageB.getData(), data, size); cudaDeviceSynchronize(); float result = thrust::reduce(thrust::device, data, data + size, 0.0, thrust::plus()) / size; cudaFree(data); return result; } struct compareMatrixByRows { compareMatrixByRows(unsigned int aSize) : mSize(aSize) { }; unsigned int mSize; __host__ __device__ bool operator()(const float *aVector1, const float *aVector2) const { for (unsigned int i = 0; i < mSize; ++i) { if (aVector1[i] < aVector2[i]) { return true; } else if (aVector1[i] > aVector2[i]) { return false; } } return false; } }; CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix &indexMatrix) { CudaMatrix transposeMatrix = transpose(aMatrix); size_t rows = transposeMatrix.getDimSize(0); size_t columns = transposeMatrix.getDimSize(1); thrust::device_vector vector(columns); for (unsigned int i = 0; i < columns; ++i) { vector[i] = transposeMatrix.getData() + i * rows; } thrust::device_vector vectorBack = vector; thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows)); float *data = nullptr; float *indexResult = new float[columns]; cudaMalloc((void **)&data, sizeof(float) * rows * columns); for (unsigned int i = 0; i < columns; ++i) { cudaMemcpy(data + i * rows, vector[i], sizeof(float) * rows, cudaMemcpyDeviceToDevice); } for (unsigned int i = 0; i < columns; ++i) { auto index = thrust::find(thrust::device, vectorBack.begin(), vectorBack.end(), vector[i]); indexResult[i] = index - vectorBack.begin(); } indexMatrix = 
__global__ void invKernel(float **aInputPointer, float *aInputData, float **aOutputPointer, float *aOutputData) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Store the two device pointers into device arrays, since the batched
    // cuBLAS API expects an array of matrix pointers (here, a batch of one).
    if (idx == 0) {
        aInputPointer[0] = aInputData;
        aOutputPointer[0] = aOutputData;
    }
}

CudaMatrix Aurora::inv(const CudaMatrix &aMatrix) {
    if (aMatrix.getDims() != 2) {
        std::cerr << "Fail! cuda inv args must be 2d matrix!";
        return aMatrix;
    }
    if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1)) {
        std::cerr << "Fail! cuda inv args must be square matrix!";
        return aMatrix;
    }
    if (aMatrix.getValueType() != Normal) {
        std::cerr << "Fail! cuda inv args must be normal value type!";
        return aMatrix;
    }
    unsigned int n = aMatrix.getDimSize(0);
    unsigned int size = aMatrix.getDataSize();
    cublasHandle_t handle;
    cublasCreate(&handle);
    float *data;
    cudaMalloc((void **)&data, sizeof(float) * size);
    float **deviceInputPointer;
    cudaMalloc((void **)&deviceInputPointer, sizeof(float *));
    float **deviceOutputPointer;
    cudaMalloc((void **)&deviceOutputPointer, sizeof(float *));
    invKernel<<<1, 1>>>(deviceInputPointer, aMatrix.getData(), deviceOutputPointer, data);
    cudaDeviceSynchronize();
    int *devicePivotArray;
    cudaMalloc((void **)&devicePivotArray, n * sizeof(int));
    int *deviceInfoArray;
    cudaMalloc((void **)&deviceInfoArray, sizeof(int));
    // LU-factorize in place, then write the inverse into data.
    cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
    cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray,
                        1);
    cudaFree(devicePivotArray);
    cudaFree(deviceInfoArray);
    cudaFree(deviceOutputPointer);
    cudaFree(deviceInputPointer);
    cublasDestroy(handle);
    return CudaMatrix::fromRawData(data, n, n);
}

CudaMatrix Aurora::inv(CudaMatrix &&aMatrix) {
    if (aMatrix.getDims() != 2) {
        std::cerr << "Fail! cuda inv args must be 2d matrix!";
        return aMatrix;
    }
    if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1)) {
        std::cerr << "Fail! cuda inv args must be square matrix!";
        return aMatrix;
    }
    if (aMatrix.getValueType() != Normal) {
        std::cerr << "Fail! cuda inv args must be normal value type!";
        return aMatrix;
    }
    unsigned int n = aMatrix.getDimSize(0);
    unsigned int size = aMatrix.getDataSize();
    cublasHandle_t handle;
    cublasCreate(&handle);
    float *data;
    cudaMalloc((void **)&data, sizeof(float) * size);
    float **deviceInputPointer;
    cudaMalloc((void **)&deviceInputPointer, sizeof(float *));
    float **deviceOutputPointer;
    cudaMalloc((void **)&deviceOutputPointer, sizeof(float *));
    invKernel<<<1, 1>>>(deviceInputPointer, aMatrix.getData(), deviceOutputPointer, data);
    cudaDeviceSynchronize();
    int *devicePivotArray;
    cudaMalloc((void **)&devicePivotArray, n * sizeof(int));
    int *deviceInfoArray;
    cudaMalloc((void **)&deviceInfoArray, sizeof(int));
    cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
    cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray,
                        1);
    cudaFree(devicePivotArray);
    cudaFree(deviceInfoArray);
    cudaFree(deviceOutputPointer);
    cudaFree(deviceInputPointer);
    cublasDestroy(handle);
    return CudaMatrix::fromRawData(data, n, n);
}
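// Note (assumption, based on documented cuBLAS behavior): cublasSgetrfBatched
// factorizes the input buffer in place before cublasSgetriBatched writes the
// inverse to a separate buffer, so the source matrix holds its LU factors
// afterwards. Callers of the const& overload that still need the original
// values should pass a copy:
//
//   CudaMatrix Ainv = Aurora::inv(A.deepCopy()); // keeps A intact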
cuda inv args must be normal value type!"; return aMatrix; } unsigned int n = aMatrix.getDimSize(0); unsigned int size = aMatrix.getDataSize(); cublasHandle_t handle; cublasCreate(&handle); float *data; cudaMalloc((void **)&data, sizeof(float) * size); float **deviceInputPinter; cudaMalloc((void **)&deviceInputPinter, sizeof(float *)); float **deviceOutputPointer; cudaMalloc((void **)&deviceOutputPointer, sizeof(float *)); invKernel<<<1, 1>>>(deviceInputPinter, aMatrix.getData(), deviceOutputPointer, data); cudaDeviceSynchronize(); int *devicePivotArray; cudaMalloc((void **)&devicePivotArray, n * sizeof(int)); int *deviceInfoArray; cudaMalloc((void **)&deviceInfoArray, sizeof(int)); cublasSgetrfBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceInfoArray, 1); cublasSgetriBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1); cudaFree(devicePivotArray); cudaFree(deviceInfoArray); cudaFree(deviceOutputPointer); cudaFree(deviceInputPinter); cublasDestroy(handle); return CudaMatrix::fromRawData(data, n, n); } __global__ void dotColumnKernel(float *aInputData1, float *aInputData2, float *aOutputData, unsigned int aInputRowSize) { __shared__ float sharedValue[THREADS_PER_BLOCK]; sharedValue[threadIdx.x] = 0; for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) { unsigned int indexByRows = i * blockDim.x + threadIdx.x; if (indexByRows < aInputRowSize) { sharedValue[threadIdx.x] += aInputData1[blockIdx.x * aInputRowSize + indexByRows] * aInputData2[blockIdx.x * aInputRowSize + indexByRows]; } } __syncthreads(); for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) { if (threadIdx.x < i) { sharedValue[threadIdx.x] += sharedValue[threadIdx.x + i]; } __syncthreads(); } aOutputData[blockIdx.x] = sharedValue[0]; } CudaMatrix Aurora::dot(const CudaMatrix &aMatrix, const CudaMatrix &aOther, FunctionDirection direction) { if (direction != Aurora::Column) { std::cerr << "cuda dot() only support column data!" << std::endl; return CudaMatrix(); } if (!aMatrix.compareShape(aOther)) { std::cerr << "cuda dot() matrix must be same shape!" << std::endl; return CudaMatrix(); } if (aMatrix.getValueType() == Aurora::Complex || aOther.getValueType() == Aurora::Complex) { std::cerr << "cuda dot() do not support complex data!" 
__global__ void prodKernel(float *aInputData, float *aOutputData, unsigned int aInputRowSize) {
    __shared__ float sharedValue[THREADS_PER_BLOCK];
    sharedValue[threadIdx.x] = 1;
    // Each thread accumulates a strided partial product over its column.
    for (unsigned int i = 0; i <= aInputRowSize / blockDim.x; ++i) {
        unsigned int indexByRows = i * blockDim.x + threadIdx.x;
        if (indexByRows < aInputRowSize) {
            sharedValue[threadIdx.x] *= aInputData[blockIdx.x * aInputRowSize + indexByRows];
        }
    }
    __syncthreads();
    // Tree-reduce the partial products.
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            sharedValue[threadIdx.x] *= sharedValue[threadIdx.x + i];
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        aOutputData[blockIdx.x] = sharedValue[0];
    }
}

__global__ void prodComplexKernel(float *aInputData, float *aOutputData, unsigned int aInputRowSize) {
    __shared__ float sharedValue[THREADS_PER_BLOCK * 2];
    unsigned int complexIdx = threadIdx.x * 2;
    // Each thread starts from the multiplicative identity 1 + 0i.
    sharedValue[complexIdx] = 1;
    sharedValue[complexIdx + 1] = 0;
    // Serial complex product over this thread's stride, using
    // (a + bi)(c + di) = (ac - bd) + (ad + bc)i.
    for (unsigned int i = 0; i <= (aInputRowSize / blockDim.x); ++i) {
        unsigned int indexByRows = i * blockDim.x + threadIdx.x;
        if (indexByRows < aInputRowSize) {
            unsigned int index = 2 * (blockIdx.x * aInputRowSize + indexByRows);
            float real = sharedValue[complexIdx] * aInputData[index] -
                         sharedValue[complexIdx + 1] * aInputData[index + 1];
            float imag = sharedValue[complexIdx] * aInputData[index + 1] +
                         sharedValue[complexIdx + 1] * aInputData[index];
            sharedValue[complexIdx] = real;
            sharedValue[complexIdx + 1] = imag;
        }
    }
    __syncthreads();
    // Tree-reduce the partial complex products.
    for (unsigned int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            unsigned int index = 2 * (threadIdx.x + i);
            float real = sharedValue[complexIdx] * sharedValue[index] -
                         sharedValue[complexIdx + 1] * sharedValue[index + 1];
            float imag = sharedValue[complexIdx] * sharedValue[index + 1] +
                         sharedValue[complexIdx + 1] * sharedValue[index];
            sharedValue[complexIdx] = real;
            sharedValue[complexIdx + 1] = imag;
        }
        __syncthreads();
    }
    if (threadIdx.x == 0) {
        aOutputData[2 * blockIdx.x] = sharedValue[0];
        aOutputData[2 * blockIdx.x + 1] = sharedValue[1];
    }
}

CudaMatrix Aurora::prod(const CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "cuda prod() not support 3D data!" << std::endl;
        return CudaMatrix();
    }
    unsigned int row = aMatrix.getDimSize(0);
    unsigned int column = aMatrix.getDimSize(1);
    // A vector collapses to a single column.
    if (aMatrix.getDimSize(0) == 1 || aMatrix.getDimSize(1) == 1) {
        column = 1;
        row = aMatrix.getDataSize();
    }
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * column * aMatrix.getValueType());
    // One block per column.
    if (aMatrix.isComplex()) {
        prodComplexKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
    } else {
        prodKernel<<<column, THREADS_PER_BLOCK>>>(aMatrix.getData(), data, row);
    }
    cudaDeviceSynchronize();
    return CudaMatrix::fromRawData(data, 1, column, 1, aMatrix.getValueType());
}
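// Usage sketch (illustrative, not part of the library): prod() multiplies
// along columns, and a vector input collapses to one scalar. Complex inputs
// accumulate with (a + bi)(c + di) = (ac - bd) + (ad + bc)i, as in
// prodComplexKernel above.
//
//   CudaMatrix p = Aurora::prod(m); // m: MxN -> p: 1xN per-column products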
/**
 * @brief Runs an in-place batched 1D FFT over the columns of aMatrix.
 *
 * @param aMatrix    interleaved-complex matrix; each column is one transform
 *                   of length getDimSize(0)
 * @param aDirection 0 FORWARD, 1 BACKWARD (inverse, unscaled)
 */
void ExecFFT(CudaMatrix &aMatrix, int aDirection) {
    cufftHandle plan;
    cudaStream_t stream = NULL;
    int batch_size = aMatrix.getDimSize(1);
    cufftComplex *d_data = (cufftComplex *)aMatrix.getData();
    cufftCreate(&plan);
    // int gpus[1] = {0};
    // cufftXtSetGPUs(plan, 1, gpus);
    cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cufftSetStream(plan, stream);
    cufftExecC2C(plan, d_data, d_data, aDirection == 0 ? CUFFT_FORWARD : CUFFT_INVERSE);
    cudaStreamSynchronize(stream);
    cufftDestroy(plan);
    cudaStreamDestroy(stream);
}

__global__ void complexFillKernel(float *aInputData, float *aOutput, unsigned int aCopySize,
                                  unsigned int aSrcColEleCount, unsigned int aDesColEleCount) {
    unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
    unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
    // Promote a real column to interleaved complex, zero-padding past aCopySize.
    for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aCopySize) {
            aOutput[2 * idx_d + offset * 2] = aInputData[idx_s + offset];
            aOutput[2 * idx_d + offset * 2 + 1] = 0;
        } else if (threadIdx.x + offset < aDesColEleCount) {
            aOutput[2 * idx_d + offset * 2] = 0;
            aOutput[2 * idx_d + offset * 2 + 1] = 0;
        } else {
            return;
        }
    }
}

__global__ void complexCopyKernel(float *aInputData, float *aOutput, unsigned int aCopySize,
                                  unsigned int aSrcColEleCount, unsigned int aDesColEleCount) {
    unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
    unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
    // Copy an interleaved complex column, zero-padding past aCopySize.
    for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aCopySize) {
            aOutput[2 * idx_d + offset * 2] = aInputData[idx_s * 2 + offset * 2];
            aOutput[2 * idx_d + offset * 2 + 1] = aInputData[idx_s * 2 + offset * 2 + 1];
        } else if (threadIdx.x + offset < aDesColEleCount) {
            aOutput[2 * idx_d + offset * 2] = 0;
            aOutput[2 * idx_d + offset * 2 + 1] = 0;
        } else {
            return;
        }
    }
}

CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize) {
    size_t ColEleCount = (aFFTSize > 0) ? aFFTSize : aMatrix.getDimSize(0);
    // Number of source elements to copy; anything beyond is zero-padding.
    size_t needCopySize = (ColEleCount < aMatrix.getDimSize(0)) ? ColEleCount : aMatrix.getDimSize(0);
    size_t bufferSize = ColEleCount * aMatrix.getDimSize(1);
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * 2 * bufferSize);
    // One block per column.
    if (aMatrix.isComplex()) {
        complexCopyKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(aMatrix.getData(), data, needCopySize,
                                                                        aMatrix.getDimSize(0), ColEleCount);
    } else {
        complexFillKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(aMatrix.getData(), data, needCopySize,
                                                                        aMatrix.getDimSize(0), ColEleCount);
    }
    cudaDeviceSynchronize();
    auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
    ExecFFT(ret, 0);
    return ret;
}
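// Usage sketch (illustrative, not part of the library): fft() zero-pads or
// truncates each column to aFFTSize points (when aFFTSize <= 0 the column
// length is used), promotes real input to interleaved complex, and runs one
// batched cuFFT transform per column.
//
//   CudaMatrix spectrum = Aurora::fft(signal);       // N-point, per column
//   CudaMatrix padded   = Aurora::fft(signal, 1024); // zero-padded to 1024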
CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize) {
    if (!aMatrix.isComplex()) {
        std::cerr << "ifft input must be complex value" << std::endl;
        return CudaMatrix();
    }
    size_t ColEleCount = (aFFTSize > 0) ? aFFTSize : aMatrix.getDimSize(0);
    // Number of source elements to copy; anything beyond is zero-padding.
    size_t needCopySize = (ColEleCount < aMatrix.getDimSize(0)) ? ColEleCount : aMatrix.getDimSize(0);
    size_t bufferSize = ColEleCount * aMatrix.getDimSize(1);
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * 2 * bufferSize);
    complexCopyKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(aMatrix.getData(), data, needCopySize,
                                                                    aMatrix.getDimSize(0), ColEleCount);
    cudaDeviceSynchronize();
    auto ret = Aurora::CudaMatrix::fromRawData(data, ColEleCount, aMatrix.getDimSize(1), 1, Complex);
    ExecFFT(ret, 1);
    // cuFFT's inverse transform is unscaled; divide by the transform length.
    float colEleCountf = 1.f / ColEleCount;
    auto lambda = [=] __device__(const float &v) { return v * colEleCountf; };
    thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
    return ret;
}

__global__ void fftshiftSwapKernel(float *aData, unsigned int aColEleCount) {
    // Swap the two halves of each column in place (even lengths only).
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    unsigned int stride = aColEleCount / 2;
    for (int i = 0; i < stride; i += blockDim.x) {
        if (threadIdx.x + i < stride) {
            float temp = aData[idx];
            aData[idx] = aData[idx + stride];
            aData[idx + stride] = temp;
        }
    }
}

__global__ void memcpyColKernel(float *aInputData, float *aOutputData, unsigned int aCopySize,
                                unsigned int aSrcColEleCount, unsigned int aDesColEleCount) {
    // Copy aCopySize floats per column between buffers with different column strides.
    unsigned int idx = blockIdx.x * aSrcColEleCount + threadIdx.x;
    unsigned int idx2 = blockIdx.x * aDesColEleCount + threadIdx.x;
    for (int i = 0; i < aCopySize; i += blockDim.x) {
        if (threadIdx.x + i < aCopySize) {
            aOutputData[idx2 + i] = aInputData[idx + i];
        }
    }
}

void Aurora::fftshift(CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(0) % 2 == 0) {
        // Even length: swap the half-columns in place. Counts are in floats,
        // so getValueType() (2 for complex) scales the element count.
        fftshiftSwapKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaDeviceSynchronize();
    } else {
        // Odd length: rotate each column by floor(n/2), staging the first
        // half in a temporary buffer.
        int copySize = aMatrix.getDimSize(0) / 2 + 1;
        float *data = nullptr;
        cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData(), data, copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType());
        cudaDeviceSynchronize();
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData() + copySize * aMatrix.getValueType(), aMatrix.getData(),
            (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaDeviceSynchronize();
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), copySize * aMatrix.getValueType(),
            copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaFree(data);
    }
}

void Aurora::ifftshift(CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(0) % 2 == 0) {
        // Even length: the shift is its own inverse.
        fftshiftSwapKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaDeviceSynchronize();
    } else {
        // Odd length: rotate each column by ceil(n/2), staging the second
        // half in a temporary buffer.
        int copySize = aMatrix.getDimSize(0) / 2 + 1;
        float *data = nullptr;
        cudaMalloc((void **)&data, copySize * sizeof(float) * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), data, copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType());
        cudaDeviceSynchronize();
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            aMatrix.getData(), aMatrix.getData() + (copySize)*aMatrix.getValueType(),
            (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaDeviceSynchronize();
        memcpyColKernel<<<aMatrix.getDimSize(1), THREADS_PER_BLOCK>>>(
            data, aMatrix.getData(), copySize * aMatrix.getValueType(), copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaFree(data);
    }
}
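// Note: for even column lengths fftshift()/ifftshift() reduce to an in-place
// swap of the two half-columns; for odd lengths they are rotations by
// floor(n/2) and ceil(n/2) respectively, which is why the odd path stages one
// half through a temporary buffer. Example for n = 5:
//
//   fftshift([0 1 2 3 4])  -> [3 4 0 1 2]
//   ifftshift([3 4 0 1 2]) -> [0 1 2 3 4]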
__global__ void sub2indKernel(float *aVMatrixSize, float **aindexMatrix, float *aOutputData, unsigned int aRowSize,
                              unsigned int aColumnSize) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aRowSize) {
        // MATLAB-style 1-based, column-major linear index.
        aOutputData[idx] = 0;
        for (unsigned int i = aColumnSize; i > 0; --i) {
            unsigned int subSize = 1;
            for (unsigned int j = 0; j < i - 1; ++j) {
                subSize *= aVMatrixSize[j];
            }
            aOutputData[idx] += (aindexMatrix[i - 1][idx] - 1) * subSize;
        }
        aOutputData[idx] += 1;
    }
}

CudaMatrix Aurora::sub2ind(const CudaMatrix &aVMatrixSize, std::vector<CudaMatrix> aSliceIdxs) {
    if (aSliceIdxs.size() != aVMatrixSize.getDataSize()) {
        std::cerr << "cuda sub2ind size not match" << std::endl;
        return CudaMatrix();
    }
    if (aSliceIdxs.size() == 0) {
        std::cerr << "cuda sub2ind no index need calc!" << std::endl;
        return CudaMatrix();
    }
    unsigned int indexMatrixRows = aSliceIdxs.begin()->getDataSize();
    unsigned int indexMatrixColumns = aSliceIdxs.size();
    float **indexMatrixData = nullptr;
    float **tempPointer = new float *[indexMatrixColumns];
    cudaMalloc((void **)&indexMatrixData, sizeof(float *) * indexMatrixColumns);
    for (unsigned int i = 0; i < indexMatrixColumns; ++i) {
        tempPointer[i] = aSliceIdxs[i].getData();
    }
    cudaMemcpy(indexMatrixData, tempPointer, sizeof(float *) * indexMatrixColumns, cudaMemcpyHostToDevice);
    float *data = nullptr;
    cudaMalloc((void **)&data, sizeof(float) * indexMatrixRows);
    int blocksPerGrid = (indexMatrixRows + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    sub2indKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aVMatrixSize.getData(), indexMatrixData, data,
                                                        indexMatrixRows, indexMatrixColumns);
    cudaDeviceSynchronize();
    cudaFree(indexMatrixData);
    delete[] tempPointer;
    return CudaMatrix::fromRawData(data, indexMatrixRows);
}

__global__ void ifft_symmetricKernel(float *aMatrix, unsigned int aMatrixDataSize) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aMatrixDataSize) {
        // Mirror the lower-half bins into the upper half as complex conjugates.
        unsigned int indexOutput = (idx + aMatrixDataSize + 2) * 2;
        unsigned int indexInput = 2 * (aMatrixDataSize - idx);
        aMatrix[indexOutput] = aMatrix[indexInput];
        aMatrix[indexOutput + 1] = -aMatrix[indexInput + 1];
    }
}

CudaMatrix Aurora::ifft_symmetric(const CudaMatrix &aMatrix, long aLength) {
    if (!aMatrix.isVector()) {
        std::cerr << "cuda ifft_symmetric only support vector!" << std::endl;
        return CudaMatrix();
    }
    float *data = nullptr;
    unsigned int size = aLength * 2;
    cudaMalloc((void **)&data, sizeof(float) * size);
    cudaMemset(data, 0, sizeof(float) * size);
    // Copy the lower half-spectrum into the interleaved-complex work buffer
    // (assumes the input provides at least aLength / 2 complex bins).
    cudaMemcpy(data, aMatrix.getData(), sizeof(float) * aLength, cudaMemcpyDeviceToDevice);
    int blocksPerGrid = (aLength - 1 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    ifft_symmetricKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(data, aLength / 2 - 1);
    cudaDeviceSynchronize();
    return real(ifft(CudaMatrix::fromRawData(data, aLength, 1, 1, Complex)));
}
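// Note: sub2ind() follows MATLAB's 1-based, column-major convention. For a
// size vector [d1 d2 ... dk] and subscripts (s1, ..., sk), the linear index
// computed by sub2indKernel is
//
//   ind = 1 + (s1 - 1) + (s2 - 1) * d1 + (s3 - 1) * d1 * d2 + ...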
CudaMatrix Aurora::hilbert(const CudaMatrix &aMatrix) {
    auto x = fft(aMatrix);
    // MATLAB-style analytic-signal weights: 1 at DC, 2 for positive
    // frequencies, 1 at the Nyquist bin when the length is even.
    auto h = Aurora::zerosCuda(aMatrix.getDimSize(0), 1, 1);
    thrust::fill_n(thrust::device, h.getData(), aMatrix.getDimSize(0) / 2, 2.0f);
    h.setValue(aMatrix.getDimSize(0) / 2, (aMatrix.getDimSize(0) % 2) ? 2.0f : 1.0f);
    h.setValue(0, 1.0f);
    x = x * h;
    auto result = ifft(x);
    return result;
}

__global__ void validKernel(const float *aData, const float *aValid, float *aOutput, int aOutputRowCount,
                            int aOutputColumnCount) {
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIndex < aOutputColumnCount) {
        // aValid holds the source column index for each output column.
        int dataIndex = (int)aValid[threadIndex];
        for (int i = 0; i < aOutputRowCount; ++i) {
            aOutput[threadIndex * aOutputRowCount + i] = aData[dataIndex * aOutputRowCount + i];
        }
    }
}

Aurora::CudaMatrix Aurora::valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid) {
    int validSize = aValid.getDataSize();
    int rowCount = aData.getDimSize(0);
    float *hostValid = new float[validSize];
    float *validProcessed = new float[validSize];
    float *validProcessedDevice = nullptr;
    cudaMemcpy(hostValid, aValid.getData(), sizeof(float) * validSize, cudaMemcpyDeviceToHost);
    int validColumnCount = 0;
    // Collect the indices of non-zero entries (assumes aValid is a 0/1
    // validity mask over the columns of aData).
    for (int i = 0; i < validSize; ++i) {
        if (hostValid[i] != 0.0f) {
            validProcessed[validColumnCount++] = (float)i;
        }
    }
    cudaMalloc((void **)&validProcessedDevice, sizeof(float) * validColumnCount);
    cudaMemcpy(validProcessedDevice, validProcessed, sizeof(float) * validColumnCount, cudaMemcpyHostToDevice);
    float *result = nullptr;
    cudaMalloc((void **)&result, sizeof(float) * rowCount * validColumnCount);
    int blocksPerGrid = (validColumnCount + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    validKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aData.getData(), validProcessedDevice, result, rowCount,
                                                      validColumnCount);
    cudaDeviceSynchronize();
    cudaFree(validProcessedDevice);
    delete[] hostValid;
    delete[] validProcessed;
    return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount);
}
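// Usage sketch (illustrative; relies on the mask interpretation assumed in
// valid() above): valid() compacts columns, keeping column i of aData
// wherever aValid(i) is non-zero.
//
//   // aData: MxN, aValid: length-N mask -> result: MxK, K = #non-zeros
//   CudaMatrix kept = Aurora::valid(aData, aValid);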