#include "AuroraDefs.h" #include "CudaMatrix.h" #include "Function1D.h" #include "Function1D.cuh" #include "Matrix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "Function1D.cuh" #include "Matrix.h" #include "cufft.h" #include using namespace Aurora; namespace { const int THREADS_PER_BLOCK = 256; } __global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize) { //确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } __global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : -FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { if(threadIdx.x+offset < aRowSize){ shared_data[threadIdx.x]= fmaxf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x/2; i >0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) { long a,b; return max(aMatrix,direction,a,b); } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) { if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { std::cerr<< (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") << std::endl; return CudaMatrix(); } //针对向量行等于列 if (aMatrix.isVector()){ direction = All; } switch (direction) { case All: { thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); auto max_iter = thrust::max_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); int index = max_iter-d_ptr; rowIdx = index%aMatrix.getDimSize(0); colIdx = index/aMatrix.getDimSize(0); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float)); auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); ret.setValue(0, *max_iter); return ret; } case Row: { float* matData = aMatrix.getData(); float* retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); maxRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; } case Column: default: { float* matData = aMatrix.getData(); float* retData = nullptr; int colCount = aMatrix.getDimSize(0); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); maxColKernel<<>>(matData,retData,colCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); return ret; } } } CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) { //col-vec x mat if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { std::cout<<"max mat and col-vec "<1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") << std::endl; return CudaMatrix(); } if (aOther.getDimSize(2)>1 || aOther.isComplex()) { std::cerr << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!") << std::endl; return CudaMatrix(); } //same shape if (aMatrix.compareShape(aOther)){ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__ (const float& x, const float& y){ return fmaxf(x,y); }; thrust::transform(thrust::device,aMatrix.getData(), aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), data,lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } // one is scalar else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; return max(matrix, scalar); } else if (aMatrix.isVector()) { return ::vxmMax(aMatrix,aOther); } else if (aOther.isVector()) { return ::vxmMax(aOther,aMatrix); } std::cerr << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl; return CudaMatrix(); } CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue){ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__ (const float& x){ return fmaxf(x,aValue); }; thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), data,lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } __global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aColSize) { //确定每个thread的index unsigned int idx = blockIdx.x * aColSize + threadIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x] = (threadIdx.x< aColSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } __global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize) { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColSize+ blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aRowSize) ? aInputData[idx] : FLT_MAX; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) { if(threadIdx.x+offset < aRowSize){ shared_data[threadIdx.x]= fminf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]); } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x/2; i >0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]); } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) { long a,b; return min(aMatrix,direction,a,b); } CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx) { if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { std::cerr<< (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") << std::endl; return CudaMatrix(); } //针对向量行等于列 if (aMatrix.isVector()){ direction = All; } switch (direction) { case All: { thrust::device_ptr d_ptr = thrust::device_pointer_cast(aMatrix.getData()); auto max_iter = thrust::min_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize()); int index = max_iter-d_ptr; rowIdx = index%aMatrix.getDimSize(0); colIdx = index/aMatrix.getDimSize(0); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float)); auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1); ret.setValue(0, *max_iter); return ret; } case Row: { float* matData = aMatrix.getData(); float* retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); minRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; } case Column: default: { float* matData = aMatrix.getData(); float* retData = nullptr; int colCount = aMatrix.getDimSize(0); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)); minColKernel<<>>(matData,retData,colCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); return ret; } } } CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) { //col-vec x mat if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) { std::cout<<"min mat and col-vec "<1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") << std::endl; return CudaMatrix(); } if (aOther.getDimSize(2)>1 || aOther.isComplex()) { std::cerr << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!") << std::endl; return CudaMatrix(); } //same shape if (aMatrix.compareShape(aOther)){ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__ (const float& x, const float& y){ return fminf(x,y); }; thrust::transform(thrust::device,aMatrix.getData(), aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(), data,lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } // one is scalar else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){ float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0); auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix; return min(matrix, scalar); } else if (aMatrix.isVector()) { return ::vxmMin(aMatrix,aOther); } else if (aOther.isVector()) { return ::vxmMin(aOther,aMatrix); } std::cerr << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]" << std::endl; return CudaMatrix(); } CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){ size_t size = aMatrix.getDataSize() * aMatrix.getValueType(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); auto lambda = [=] __host__ __device__ (const float& x){ return fminf(x,aValue); }; thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(), data,lambda); return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0), aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType()); } __global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount) { //确定每个thread的index unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; __shared__ double shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aColEleCount) ? aInputData[idx] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] += (double)shared_data[i + threadIdx.x]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = (float)shared_data[0]; } } __global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount) { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; __shared__ float shared_data[256]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x]= (threadIdx.x< aRowEleCount) ? aInputData[idx] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { if(threadIdx.x+offset < aRowEleCount){ shared_data[threadIdx.x]+= aInputData[idx + offset*aColEleCount]; } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x/2; i >0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x] += shared_data[threadIdx.x+i]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0]; } } __global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize) { //确定每个thread的index unsigned int idx = blockIdx.x * 4096 + threadIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 bool flag = threadIdx.x< 4096 && idx0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x] = shared_data[0][0]; aOutput[blockIdx.x+gridDim.x] = shared_data[0][1]; } } __global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount) { //确定每个thread的index unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x][0]= (threadIdx.x< aColEleCount) ? aInputData[idx*2] : 0.0; shared_data[threadIdx.x][1]= (threadIdx.x< aColEleCount) ? aInputData[idx*2+1] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0]; shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x*2] = shared_data[0][0]; aOutput[blockIdx.x*2+1] = shared_data[0][1]; } } __global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount) { //确定每个thread的基础index unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x; __shared__ float shared_data[256][2]; // 每个线程加载一个元素到共享内存 shared_data[threadIdx.x][0]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2] : 0.0; shared_data[threadIdx.x][1]= (threadIdx.x< aRowEleCount) ? aInputData[idx*2+1] : 0.0; __syncthreads(); // 每个线程规约自己的分段,将每个blockDim.x的值规约到数组最前面一段 for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) { if(threadIdx.x+offset < aRowEleCount){ shared_data[threadIdx.x][0]+= aInputData[idx*2 + offset*aColEleCount*2]; shared_data[threadIdx.x][1]+= aInputData[idx*2 + offset*aColEleCount*2 + 1]; } __syncthreads(); } // 规约最前面一段 for (int i = blockDim.x/2; i >0; i>>=1) { if (threadIdx.x < i) { shared_data[threadIdx.x][0] += shared_data[threadIdx.x+i][0]; shared_data[threadIdx.x][1] += shared_data[threadIdx.x+i][1]; } __syncthreads(); } // 第一个线程存储每个分段的最大值到全局内存 if (threadIdx.x == 0) { aOutput[blockIdx.x*2] = shared_data[0][0]; aOutput[blockIdx.x*2+1] = shared_data[0][1]; } } CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){ if (aMatrix.getDimSize(2)>1 ) { std::cerr<< "sum() not support 3D data!" << std::endl; return CudaMatrix(); } //针对向量行等于列 if (direction == Column && aMatrix.getDimSize(0)==1){ direction = Row; } if (!aMatrix.isComplex()) { switch (direction) { case All: { float* data = nullptr; cudaMalloc((void**)&data, sizeof(float)); auto ret = CudaMatrix::fromRawData(data,1,1,1); float result = thrust::reduce(thrust::device, aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),0.0000000f,thrust::plus()); ret.setValue(0,result); return ret; } case Row: { float* matData = aMatrix.getData(); float* retData = nullptr; int rowCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)); sumRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; } case Column: default: { std::cout<<"Column sum"<>>(matData,retData,colElementCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1)); return ret; } } } else{ switch (direction) { case All: { float* matData = aMatrix.getData(); float* retData = nullptr; //divide the whole array to some 4096 blocks, then caculate as columns sum int fakeCol = (int)ceilf((float)aMatrix.getDataSize()/4096.0f); cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol); auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex); sumZAllColKernel<<>>(matData,retData, aMatrix.getDataSize()); float* result_data = nullptr; cudaMalloc((void**)&result_data, sizeof(float)*2); auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex); float result = thrust::reduce(thrust::device, ret.getData(),ret.getData()+ ret.getDataSize(),0,thrust::plus()); ret2.setValue(0,result); result = thrust::reduce(thrust::device, ret.getData()+ ret.getDataSize(),ret.getData()+ ret.getDataSize()*2,0,thrust::plus()); ret2.setValue(1,result); return ret2; } case Row: { float* matData = aMatrix.getData(); float* retData = nullptr; int rowElementCount = aMatrix.getDimSize(1); cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2); sumZRowKernel<<>>(matData,retData,aMatrix.getDimSize(0),rowElementCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1); return ret; } case Column: default: { float* matData = aMatrix.getData(); float* retData = nullptr; int colElementCount = aMatrix.getDimSize(0); if (colElementCount == 1) return aMatrix; cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2); sumZColKernel<<>>(matData,retData,colElementCount); cudaDeviceSynchronize(); CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex); return ret; } } } } CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){ if (aMatrix.getDimSize(2)>1 ) { std::cerr<< "sum() not support 3D data!" << std::endl; return CudaMatrix(); } //针对向量行等于列 if (direction == Column && aMatrix.getDimSize(0)==1){ direction = Row; } if (!aMatrix.isComplex()) { switch (direction) { case All: { auto ret = sum(aMatrix,All); ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize())); return ret; } case Row: { auto ret = sum(aMatrix, Row); float count = (float)aMatrix.getDimSize(1); auto lambda = [=] __device__ (const float& v){ return v/count; }; thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); return ret; } case Column: default: { auto ret = sum(aMatrix, Column); float count = (float)aMatrix.getDimSize(0); auto lambda = [=] __device__ (const float& v){ return v/count; }; thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda); return ret; } } } else{ std::cerr<< "mean() not support complex data!" << std::endl; return CudaMatrix(); } } template class RowElementIterator:public thrust::iterator_facade< RowElementIterator, ValueType, thrust::device_system_tag, thrust::random_access_traversal_tag, ValueType& >{ public: // 构造函数 __host__ __device__ RowElementIterator(ValueType* ptr, int aColElementCount=1) : ptr_(ptr),col_elements_(aColElementCount) {} __host__ __device__ ValueType& dereference() const{ return *ptr_; } // 实现递增操作符 __host__ __device__ void increment() { ptr_ = ptr_+col_elements_; } // 实现递减操作符 __host__ __device__ void decrement() { ptr_ = ptr_ - col_elements_; } // 实现加法操作符 __host__ __device__ void advance(typename RowElementIterator::difference_type n) { ptr_ += col_elements_*n; } // 实现减法操作符 __host__ __device__ typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const { return (other.ptr_ - ptr_)/col_elements_; } // 实现比较操作符 __host__ __device__ bool equal(const RowElementIterator& other) const { return ptr_ == other.ptr_; } private: ValueType* ptr_; int col_elements_; }; CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction) { if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") << std::endl; return CudaMatrix(); } return sort(std::forward(aMatrix.deepCopy()), direction); } CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction) { if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) { std::cerr << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!") << std::endl; return CudaMatrix(); } float * data = aMatrix.getData(); int colElementCount = aMatrix.getDimSize(0); switch (direction) { case Row: { for (size_t i = 0; i < colElementCount; i++) { thrust::sort(thrust::device, RowElementIterator(data+i,colElementCount), RowElementIterator(data+aMatrix.getDataSize()+i,colElementCount)); } return aMatrix; } case Column: { int rowElementCount = aMatrix.getDimSize(1); for (size_t i = 0; i < rowElementCount; i++) { thrust::sort(thrust::device, data+i*colElementCount, data+(i+1)*colElementCount); } return aMatrix; } default: { std::cerr << "Unsupported direction for sort!" << std::endl; return CudaMatrix(); } } } __global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < aInputSize) { aOutputData[idx] = powf(aInputData1[idx] - aInputData2[idx], 2); } } float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB) { if (aImageA.getDims()!=2|| aImageB.getDims()!=2) { std::cerr<<"Fail! cuda immse args must all 2d matrix!"; return 0.0; } if (!aImageB.compareShape(aImageA)) { std::cerr<<"Fail! cuda immse args must be same shape!"; return 0.0; } if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal) { std::cerr << "Fail! cuda immse args must be normal value type!"; return 0.0; } unsigned int size = aImageA.getDataSize(); float* data = nullptr; cudaMalloc((void**)&data, sizeof(float) * size); int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; immseKernel<<>>(aImageA.getData(), aImageB.getData(), data, size); cudaDeviceSynchronize(); float result = thrust::reduce(thrust::device, data, data+size, 0.0, thrust::plus()) / size; cudaFree(data); return result; } struct compareMatrixByRows { compareMatrixByRows(unsigned int aSize) : mSize(aSize) { }; unsigned int mSize; __host__ __device__ bool operator()(const float* aVector1, const float* aVector2) const { for(unsigned int i=0; i aVector2[i]) { return false; } } return false; } }; CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix) { CudaMatrix transposeMatrix = transpose(aMatrix); size_t rows = transposeMatrix.getDimSize(0); size_t columns = transposeMatrix.getDimSize(1); thrust::device_vector vector(columns); for(unsigned int i=0; i vectorBack = vector; thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows)); float* data = nullptr; float* indexResult = new float[columns]; cudaMalloc((void**)&data, sizeof(float) * rows * columns); for(unsigned int i=0; i>>(deviceInputPinter, aMatrix.getData(), deviceOutputPointer, data); cudaDeviceSynchronize(); int* devicePivotArray; cudaMalloc((void**)&devicePivotArray, n * sizeof(int)); int* deviceInfoArray; cudaMalloc((void**)&deviceInfoArray, sizeof(int)); cublasSgetrfBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceInfoArray, 1); cublasSgetriBatched(handle, n, deviceInputPinter, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1); cudaFree(devicePivotArray); cudaFree(deviceInfoArray); cudaFree(deviceOutputPointer); cudaFree(deviceInputPinter); cublasDestroy(handle); return CudaMatrix::fromRawData(data, n, n); } /** * @brief * * * @param aMatrix * @param aDirection 0 FORWARD, 1 BACKWARD */ void ExecFFT(CudaMatrix& aMatrix,int aDirection){ cufftHandle plan; cudaStream_t stream = NULL; int batch_size = aMatrix.getDimSize(1); cufftComplex *d_data = (cufftComplex *)aMatrix.getData(); cufftCreate(&plan); // int gpus[1] ={0}; // cufftXtSetGPUs(plan,1,gpus); cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size); cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); cufftSetStream(plan, stream); cufftExecC2C(plan, d_data, d_data, aDirection==0?CUFFT_FORWARD:CUFFT_INVERSE); cudaStreamSynchronize(stream); cufftDestroy(plan); cudaStreamDestroy(stream); } __global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int aCopySize, unsigned int aSrcColEleCount, unsigned int aDesColEleCount) { unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) { if(threadIdx.x + offset< aCopySize){ aOutput[2*idx_d] = aInputData[idx_s]; aOutput[2*idx_d + 1] = 0; } else if(threadIdx.x + offset< aDesColEleCount){ aOutput[2*idx_d] = 0; aOutput[2*idx_d + 1] = 0; } else{ return; } } } __global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int aCopySize, unsigned int aSrcColEleCount, unsigned int aDesColEleCount) { unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x; unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x; for (int offset = 0; offset < aDesColEleCount; offset+=blockDim.x) { if(threadIdx.x + offset< aCopySize){ aOutput[2*idx_d] = aInputData[idx_s*2]; aOutput[2*idx_d + 1] = aInputData[idx_s*2+1]; } else if(threadIdx.x + offset< aDesColEleCount){ aOutput[2*idx_d] = 0; aOutput[2*idx_d + 1] = 0; } else{ return; } } } CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){ size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0); //实际需要copy赋值的非0值 size_t needCopySize = (ColEleCount>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); ExecFFT(ret,0); return ret; } CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){ if (!aMatrix.isComplex()){ std::cerr<<"ifft input must be complex value"<0)?aFFTSize:aMatrix.getDimSize(0); //实际需要copy赋值的非0值 size_t needCopySize = (ColEleCount>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); ExecFFT(ret,1); float colEleCountf = 1.f/ColEleCount; auto lambda = [=]__device__(const float& v){ return v*colEleCountf; }; thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize()*2,ret.getData(),lambda); return ret; } __global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){ unsigned int idx = blockIdx.x *aColEleCount + threadIdx.x; unsigned int stride = aColEleCount/2; for (int i = 0; i < stride; i+=blockDim.x) { if (threadIdx.x + i < stride) { float temp = aData[idx]; aData[idx] = aData[idx+stride]; aData[idx+stride] = temp; } } } __global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned int aCopySize, unsigned int aSrcColEleCount,unsigned int aDesColEleCount){ unsigned int idx = blockIdx.x *aSrcColEleCount + threadIdx.x; unsigned int idx2 = blockIdx.x *aDesColEleCount + threadIdx.x; for (int i = 0; i < aCopySize; i+=blockDim.x) { if (threadIdx.x + i < aCopySize) { aOutputData[idx2+i] = aInputData[idx+i]; } } } void Aurora::fftshift(CudaMatrix &aMatrix){ if (aMatrix.getDimSize(0) % 2 == 0) { fftshiftSwapKernel<<>>( aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); } else { int copySize = aMatrix.getDimSize(0) / 2 + 1; float *data = nullptr; cudaMalloc((void **)&data, copySize * aMatrix.getValueType()); memcpyColKernel<<>>( aMatrix.getData(), data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); memcpyColKernel<<>>( aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(), (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); memcpyColKernel<<>>( data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), copySize * aMatrix.getValueType(), copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); cudaFree(data); } } void Aurora::ifftshift(CudaMatrix &aMatrix){ if (aMatrix.getDimSize(0) % 2 == 0) { fftshiftSwapKernel<<>>( aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); } else { int copySize = aMatrix.getDimSize(0) / 2 + 1; float *data = nullptr; cudaMalloc((void **)&data, copySize * aMatrix.getValueType()); memcpyColKernel<<>>( aMatrix.getData()+(copySize-1)* aMatrix.getValueType(), data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); memcpyColKernel<<>>( aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(), (copySize-1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); memcpyColKernel<<>>( data, aMatrix.getData(), copySize * aMatrix.getValueType(), copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); cudaFree(data); } }