// Aurora/src/Function2D.cu


#include "AuroraDefs.h"
#include "CudaMatrix.h"
#include "Function1D.h"
#include "Function1D.cuh"
#include "Matrix.h"
#include <Function2D.cuh>
#include <cfloat>
#include <cstddef>
#include <cstdio>
#include <iostream>
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/iterator_facade.h>
#include <thrust/copy.h>
#include <thrust/functional.h>
#include <thrust/complex.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
#include "Function1D.cuh"
#include "Matrix.h"
#include "cufft.h"
#include <cufftXt.h>
using namespace Aurora;
namespace
{
const int THREADS_PER_BLOCK = 256;
}
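// Column-wise max reduction. Launch convention shared by the per-column
// kernels in this file: one block per column, 256 threads per block, data
// stored column-major; each block reduces its column into aOutput[blockIdx.x].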
__global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
{
// Determine this thread's index (one block per column).
unsigned int idx = blockIdx.x * aColSize + threadIdx.x;
__shared__ float shared_data[256];
// Each thread loads one element into shared memory.
shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : -FLT_MAX;
__syncthreads();
// Each thread folds its strided segment of the column into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset<aColSize; offset+=blockDim.x) {
if(threadIdx.x + offset<aColSize){
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset]);
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
}
__syncthreads();
}
// Thread 0 writes the column maximum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0];
}
}
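// Row-wise max reduction: one block per row; threads walk the row with a
// stride of aColSize floats, since the data is column-major.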
__global__ void maxRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize)
{
// Determine this thread's base index (one block per row).
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
__shared__ float shared_data[256];
// Each thread loads one element into shared memory.
shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : -FLT_MAX;
__syncthreads();
// Each thread folds its strided segment of the row into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) {
if(threadIdx.x+offset < aRowSize){
shared_data[threadIdx.x]= fmaxf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]);
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
}
__syncthreads();
}
// Thread 0 writes the row maximum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0];
}
}
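// Usage sketch (non-authoritative; matrix construction is assumed from
// elsewhere in Aurora):
//   long r = 0, c = 0;
//   CudaMatrix g = Aurora::max(m, All, r, c); // 1x1 global max, index in (r, c)
//   CudaMatrix colMax = Aurora::max(m, Column); // 1xN row of per-column maxima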
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) {
long a,b;
return max(aMatrix,direction,a,b);
}
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx)
{
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr<< (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
// For a vector, row and column reductions coincide; reduce over all elements.
if (aMatrix.isVector()){
direction = All;
}
switch (direction)
{
case All: {
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
auto max_iter = thrust::max_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize());
long index = max_iter-d_ptr;
rowIdx = index%aMatrix.getDimSize(0);
colIdx = index/aMatrix.getDimSize(0);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float));
auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1);
ret.setValue(0, *max_iter);
return ret;
}
case Row:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int rowCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
maxRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
return ret;
}
case Column:
default:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int colCount = aMatrix.getDimSize(0);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1));
maxColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1));
return ret;
}
}
}
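// Broadcast max between a vector and a matrix, MATLAB-style: a column vector
// is combined element-wise with every column, a row vector with every row.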
CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) {
//col-vec x mat
if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
std::cout<<"max mat and col-vec "<<std::endl;
size_t size = aMat.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__(const float& x, const float& y) {
return fmaxf(x, y);
};
for (int i = 0; i < aMat.getDimSize(1); ++i) {
thrust::transform(thrust::device, aVec.getData(),
aVec.getData() + aVec.getDataSize(),
aMat.getData() + aMat.getDimSize(0) * i,
data + aMat.getDimSize(0) * i, lambda);
}
return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
}
// row-vec x mat
else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1))
{
std::cout<<"max mat and row-vec "<<std::endl;
size_t size = aMat.getDataSize() ;
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
for (int i = 0; i < aMat.getDimSize(1); ++i) {
float v = aVec.getValue(i);
auto lambda = [=] __host__ __device__(const float& x) {
return fmaxf(x, v);
};
thrust::transform(thrust::device,
aMat.getData() + aMat.getDimSize(0)*i,
aMat.getData() + aMat.getDimSize(0) * (i+1),
data + aMat.getDimSize(0) * i, lambda);
}
return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
}
std::cerr
<< "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
<< std::endl;
return CudaMatrix();
}
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr
<< (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
if (aOther.getDimSize(2)>1 || aOther.isComplex()) {
std::cerr
<< (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
//same shape
if (aMatrix.compareShape(aOther)){
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return fmaxf(x,y);
};
thrust::transform(thrust::device,aMatrix.getData(),
aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(),
data,lambda);
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
// one is scalar
else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){
float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0);
auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix;
return max(matrix, scalar);
}
else if (aMatrix.isVector()) {
return ::vxmMax(aMatrix,aOther);
}
else if (aOther.isVector())
{
return ::vxmMax(aOther,aMatrix);
}
std::cerr
<< "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
<< std::endl;
return CudaMatrix();
}
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue){
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__ (const float& x){
return fmaxf(x,aValue);
};
thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),
data,lambda);
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
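// Column-wise and row-wise min reductions: same structure as the max kernels
// above, with fminf and FLT_MAX as the identity element.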
__global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
{
// Determine this thread's index (one block per column).
unsigned int idx = blockIdx.x * aColSize + threadIdx.x;
__shared__ float shared_data[256];
// Each thread loads one element into shared memory.
shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : FLT_MAX;
__syncthreads();
// Each thread folds its strided segment of the column into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset<aColSize; offset+=blockDim.x) {
if(threadIdx.x + offset<aColSize){
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset]);
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
}
__syncthreads();
}
// Thread 0 writes the column minimum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0];
}
}
__global__ void minRowKernel(float* aInputData, float* aOutput,unsigned int aColSize, unsigned int aRowSize)
{
// Determine this thread's base index (one block per row).
unsigned int idx = threadIdx.x*aColSize+ blockIdx.x;
__shared__ float shared_data[256];
// Each thread loads one element into shared memory.
shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : FLT_MAX;
__syncthreads();
// Each thread folds its strided segment of the row into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset < aRowSize; offset+=blockDim.x) {
if(threadIdx.x+offset < aRowSize){
shared_data[threadIdx.x]= fminf(shared_data[threadIdx.x], aInputData[idx + offset*aColSize]);
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x+i]);
}
__syncthreads();
}
// Thread 0 writes the row minimum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0];
}
}
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) {
long a,b;
return min(aMatrix,direction,a,b);
}
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx)
{
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr<< (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
// For a vector, row and column reductions coincide; reduce over all elements.
if (aMatrix.isVector()){
direction = All;
}
switch (direction)
{
case All: {
thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
auto min_iter = thrust::min_element(thrust::device,d_ptr,d_ptr+aMatrix.getDataSize());
long index = min_iter-d_ptr;
rowIdx = index%aMatrix.getDimSize(0);
colIdx = index/aMatrix.getDimSize(0);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float));
auto ret = Aurora::CudaMatrix::fromRawData(data,1,1,1);
ret.setValue(0, *min_iter);
return ret;
}
case Row:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int rowCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
minRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
return ret;
}
case Column:
default:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int colCount = aMatrix.getDimSize(0);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1));
minColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1));
return ret;
}
}
}
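// Broadcast min between a vector and a matrix; mirrors vxmMax above.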
CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
//col-vec x mat
if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
std::cout<<"min mat and col-vec "<<std::endl;
size_t size = aMat.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__(const float& x, const float& y) {
return fminf(x, y);
};
for (int i = 0; i < aMat.getDimSize(1); ++i) {
thrust::transform(thrust::device, aVec.getData(),
aVec.getData() + aVec.getDataSize(),
aMat.getData() + aMat.getDimSize(0) * i,
data + aMat.getDimSize(0) * i, lambda);
}
return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
}
// row-vec x mat
else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1))
{
std::cout<<"min mat and row-vec "<<std::endl;
size_t size = aMat.getDataSize() ;
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
for (int i = 0; i < aMat.getDimSize(1); ++i) {
float v = aVec.getValue(i);
auto lambda = [=] __host__ __device__(const float& x) {
return fminf(x, v);
};
thrust::transform(thrust::device,
aMat.getData() + aMat.getDimSize(0)*i,
aMat.getData() + aMat.getDimSize(0) * (i+1),
data + aMat.getDimSize(0) * i, lambda);
}
return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
}
std::cerr
<< "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
<< std::endl;
return CudaMatrix();
}
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr
<< (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
if (aOther.getDimSize(2)>1 || aOther.isComplex()) {
std::cerr
<< (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
//same shape
if (aMatrix.compareShape(aOther)){
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__ (const float& x, const float& y){
return fminf(x,y);
};
thrust::transform(thrust::device,aMatrix.getData(),
aMatrix.getData()+aMatrix.getDataSize(),aOther.getData(),
data,lambda);
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
// one is scalar
else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1){
float scalar = (aMatrix.getDataSize() == 1)?aMatrix.getValue(0):aOther.getValue(0);
auto matrix = (aMatrix.getDataSize() == 1)?aOther:aMatrix;
return min(matrix, scalar);
}
else if (aMatrix.isVector()) {
return ::vxmMin(aMatrix,aOther);
}
else if (aOther.isVector())
{
return ::vxmMin(aOther,aMatrix);
}
std::cerr
<< "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
<< std::endl;
return CudaMatrix();
}
CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue){
size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
auto lambda = [=] __host__ __device__ (const float& x){
return fminf(x,aValue);
};
thrust::transform(thrust::device,aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),
data,lambda);
return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}
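// Column-wise sum reduction. Unlike the max/min kernels, partial sums are
// accumulated in double-precision shared memory to limit rounding error
// before the final cast back to float.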
__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
// Determine this thread's index (one block per column).
unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
__shared__ double shared_data[256];
// Each thread loads one element into shared memory (accumulating in double).
shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? aInputData[idx] : 0.0;
__syncthreads();
// Each thread folds its strided segment of the column into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset<aColEleCount; offset+=blockDim.x) {
if(threadIdx.x + offset<aColEleCount){
shared_data[threadIdx.x] += (double)aInputData[idx + offset];
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] += (double)shared_data[i + threadIdx.x];
}
__syncthreads();
}
// Thread 0 writes the column sum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = (float)shared_data[0];
}
}
__global__ void sumRowKernel(float* aInputData, float* aOutput,unsigned int aColEleCount, unsigned int aRowEleCount)
{
// Determine this thread's base index (one block per row).
unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x;
__shared__ float shared_data[256];
// Each thread loads one element into shared memory.
shared_data[threadIdx.x] = (threadIdx.x < aRowEleCount) ? aInputData[idx] : 0.0f;
__syncthreads();
// Each thread folds its strided segment of the row into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) {
if(threadIdx.x+offset < aRowEleCount){
shared_data[threadIdx.x]+= aInputData[idx + offset*aColEleCount];
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x] += shared_data[threadIdx.x+i];
}
__syncthreads();
}
// Thread 0 writes the row sum to global memory.
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0];
}
}
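// Global sum of an interleaved complex array, viewed as segments of 4096
// elements. Each block writes its partial sums in planar layout (all real
// parts first, then all imaginary parts) so the two halves can be reduced
// independently afterwards.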
__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize)
{
// Determine this thread's index (one block per 4096-element segment).
unsigned int idx = blockIdx.x * 4096 + threadIdx.x;
__shared__ float shared_data[256][2];
// Each thread loads one complex element (re, im) into shared memory.
bool flag = threadIdx.x < 4096 && idx < aTotalSize;
shared_data[threadIdx.x][0] = flag ? aInputData[idx*2] : 0.0f;
shared_data[threadIdx.x][1] = flag ? aInputData[idx*2+1] : 0.0f;
__syncthreads();
// Each thread folds its strided segment of the 4096-element block into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset<4096; offset+=blockDim.x) {
if(threadIdx.x + offset<4096 && idx + offset<aTotalSize){
shared_data[threadIdx.x][0] += aInputData[idx*2 + offset*2];
shared_data[threadIdx.x][1] += aInputData[idx*2 + offset*2 +1];
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
}
__syncthreads();
}
// Thread 0 writes this segment's partial sums (planar: real half first, then imaginary).
if (threadIdx.x == 0) {
aOutput[blockIdx.x] = shared_data[0][0];
aOutput[blockIdx.x+gridDim.x] = shared_data[0][1];
}
}
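// Column-wise complex sum; input and output are interleaved (re, im).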
__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
// Determine this thread's index (one block per column).
unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
__shared__ float shared_data[256][2];
// Each thread loads one complex element (re, im) into shared memory.
shared_data[threadIdx.x][0] = (threadIdx.x < aColEleCount) ? aInputData[idx*2] : 0.0f;
shared_data[threadIdx.x][1] = (threadIdx.x < aColEleCount) ? aInputData[idx*2+1] : 0.0f;
__syncthreads();
// Each thread folds its strided segment of the column into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset<aColEleCount; offset+=blockDim.x) {
if(threadIdx.x + offset<aColEleCount){
shared_data[threadIdx.x][0] += aInputData[idx*2 + offset*2];
shared_data[threadIdx.x][1] += aInputData[idx*2 + offset*2 + 1];
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
}
__syncthreads();
}
// Thread 0 writes the column sum as one interleaved complex value.
if (threadIdx.x == 0) {
aOutput[blockIdx.x*2] = shared_data[0][0];
aOutput[blockIdx.x*2+1] = shared_data[0][1];
}
}
__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount)
{
// Determine this thread's base index (one block per row).
unsigned int idx = threadIdx.x*aColEleCount+ blockIdx.x;
__shared__ float shared_data[256][2];
// Each thread loads one complex element (re, im) into shared memory.
shared_data[threadIdx.x][0] = (threadIdx.x < aRowEleCount) ? aInputData[idx*2] : 0.0f;
shared_data[threadIdx.x][1] = (threadIdx.x < aRowEleCount) ? aInputData[idx*2+1] : 0.0f;
__syncthreads();
// Each thread folds its strided segment of the row into the first blockDim.x shared entries.
for (int offset = blockDim.x; offset < aRowEleCount; offset+=blockDim.x) {
if(threadIdx.x+offset < aRowEleCount){
shared_data[threadIdx.x][0]+= aInputData[idx*2 + offset*aColEleCount*2];
shared_data[threadIdx.x][1]+= aInputData[idx*2 + offset*aColEleCount*2 + 1];
}
__syncthreads();
}
// Tree-reduce the first blockDim.x entries.
for (int i = blockDim.x/2; i >0; i>>=1) {
if (threadIdx.x < i) {
shared_data[threadIdx.x][0] += shared_data[threadIdx.x+i][0];
shared_data[threadIdx.x][1] += shared_data[threadIdx.x+i][1];
}
__syncthreads();
}
// Thread 0 writes the row sum as one interleaved complex value.
if (threadIdx.x == 0) {
aOutput[blockIdx.x*2] = shared_data[0][0];
aOutput[blockIdx.x*2+1] = shared_data[0][1];
}
}
CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
if (aMatrix.getDimSize(2)>1 ) {
std::cerr<< "sum() not support 3D data!"
<< std::endl;
return CudaMatrix();
}
// MATLAB-style: for a row vector the default column reduction applies along the row instead.
if (direction == Column && aMatrix.getDimSize(0)==1){
direction = Row;
}
if (!aMatrix.isComplex())
{
switch (direction)
{
case All:
{
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float));
auto ret = CudaMatrix::fromRawData(data,1,1,1);
float result = thrust::reduce(thrust::device, aMatrix.getData(),aMatrix.getData()+aMatrix.getDataSize(),0.0f,thrust::plus<float>());
ret.setValue(0,result);
return ret;
}
case Row:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int rowCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0));
sumRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1);
return ret;
}
case Column:
default:
{
std::cout<<"Column sum"<<std::endl;
float* matData = aMatrix.getData();
float* retData = nullptr;
int colElementCount = aMatrix.getDimSize(0);
if (colElementCount == 1) return aMatrix;
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1));
sumColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1));
return ret;
}
}
}
else{
switch (direction)
{
case All:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
// Divide the whole array into 4096-element segments, then reduce each segment like a column sum.
int fakeCol = (int)ceilf((float)aMatrix.getDataSize()/4096.0f);
cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol);
auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex);
sumZAllColKernel<<<fakeCol,256>>>(matData,retData, aMatrix.getDataSize());
float* result_data = nullptr;
cudaMalloc((void**)&result_data, sizeof(float)*2);
auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex);
// ret holds planar partial sums: real parts in the first half, imaginary in the second.
// The init value must be 0.0f, not 0: an int init makes thrust::reduce accumulate in int.
float result = thrust::reduce(thrust::device, ret.getData(),ret.getData()+ ret.getDataSize(),0.0f,thrust::plus<float>());
ret2.setValue(0,result);
result = thrust::reduce(thrust::device, ret.getData()+ ret.getDataSize(),ret.getData()+ ret.getDataSize()*2,0.0f,thrust::plus<float>());
ret2.setValue(1,result);
return ret2;
}
case Row:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int rowElementCount = aMatrix.getDimSize(1);
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(0)*2);
sumZRowKernel<<<aMatrix.getDimSize(0),256>>>(matData,retData,aMatrix.getDimSize(0),rowElementCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,aMatrix.getDimSize(0),1,1,Complex);
return ret;
}
case Column:
default:
{
float* matData = aMatrix.getData();
float* retData = nullptr;
int colElementCount = aMatrix.getDimSize(0);
if (colElementCount == 1) return aMatrix;
cudaMalloc((void**)&retData, sizeof(float)*aMatrix.getDimSize(1)*2);
sumZColKernel<<<aMatrix.getDimSize(1),256>>>(matData,retData,colElementCount);
cudaDeviceSynchronize();
CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData,1,aMatrix.getDimSize(1),1,Complex);
return ret;
}
}
}
}
CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction ){
if (aMatrix.getDimSize(2)>1 ) {
std::cerr<< "sum() not support 3D data!"
<< std::endl;
return CudaMatrix();
}
// MATLAB-style: for a row vector the default column reduction applies along the row instead.
if (direction == Column && aMatrix.getDimSize(0)==1){
direction = Row;
}
if (!aMatrix.isComplex())
{
switch (direction)
{
case All:
{
auto ret = sum(aMatrix,All);
ret.setValue(0,ret.getValue(0)/((float)aMatrix.getDataSize()));
return ret;
}
case Row:
{
auto ret = sum(aMatrix, Row);
float count = (float)aMatrix.getDimSize(1);
auto lambda = [=] __device__ (const float& v){
return v/count;
};
thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda);
return ret;
}
case Column:
default:
{
auto ret = sum(aMatrix, Column);
float count = (float)aMatrix.getDimSize(0);
auto lambda = [=] __device__ (const float& v){
return v/count;
};
thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize(),ret.getData(),lambda);
return ret;
}
}
}
else{
std::cerr<< "mean() not support complex data!"
<< std::endl;
return CudaMatrix();
}
}
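// Strided device iterator over one row of a column-major matrix: advancing by
// one step hops a whole column (col_elements_ floats), which lets thrust::sort
// operate on a row in place.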
template <typename ValueType>
class RowElementIterator:public thrust::iterator_facade<
RowElementIterator<ValueType>,
ValueType,
thrust::device_system_tag,
thrust::random_access_traversal_tag,
ValueType& >{
public:
// Constructor: ptr points at the first row element, aColElementCount is the stride between consecutive row elements.
__host__ __device__
RowElementIterator(ValueType* ptr, int aColElementCount=1) : ptr_(ptr),col_elements_(aColElementCount) {}
__host__ __device__
ValueType& dereference() const{
return *ptr_;
}
// Increment: step forward one column.
__host__ __device__
void increment() {
ptr_ = ptr_+col_elements_;
}
// Decrement: step back one column.
__host__ __device__
void decrement() {
ptr_ = ptr_ - col_elements_;
}
// Advance by n positions.
__host__ __device__
void advance(typename RowElementIterator::difference_type n) {
ptr_ += col_elements_*n;
}
// Distance to another iterator, in strides.
__host__ __device__
typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const {
return (other.ptr_ - ptr_)/col_elements_;
}
// Equality comparison.
__host__ __device__
bool equal(const RowElementIterator& other) const {
return ptr_ == other.ptr_;
}
private:
ValueType* ptr_;
int col_elements_;
};
CudaMatrix Aurora::sort(const CudaMatrix &aMatrix,FunctionDirection direction)
{
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr
<< (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
return sort(aMatrix.deepCopy(), direction);
}
CudaMatrix Aurora::sort(CudaMatrix &&aMatrix,FunctionDirection direction)
{
if (aMatrix.getDimSize(2)>1 || aMatrix.isComplex()) {
std::cerr
<< (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
<< std::endl;
return CudaMatrix();
}
float * data = aMatrix.getData();
int colElementCount = aMatrix.getDimSize(0);
switch (direction)
{
case Row:
{
for (size_t i = 0; i < colElementCount; i++)
{
thrust::sort(thrust::device, RowElementIterator<float>(data+i,colElementCount),
RowElementIterator<float>(data+aMatrix.getDataSize()+i,colElementCount));
}
return aMatrix;
}
case Column:
{
int rowElementCount = aMatrix.getDimSize(1);
for (size_t i = 0; i < rowElementCount; i++)
{
thrust::sort(thrust::device, data+i*colElementCount,
data+(i+1)*colElementCount);
}
return aMatrix;
}
default:
{
std::cerr
<< "Unsupported direction for sort!"
<< std::endl;
return CudaMatrix();
}
}
}
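// immse: mean squared error between two images, computed as an element-wise
// squared-difference kernel followed by a device-wide reduction.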
__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < aInputSize)
{
aOutputData[idx] = powf(aInputData1[idx] - aInputData2[idx], 2);
}
}
float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB)
{
if (aImageA.getDims()!=2|| aImageB.getDims()!=2)
{
std::cerr<<"Fail! cuda immse args must all 2d matrix!";
return 0.0;
}
if (!aImageB.compareShape(aImageA))
{
std::cerr<<"Fail! cuda immse args must be same shape!";
return 0.0;
}
if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal)
{
std::cerr << "Fail! cuda immse args must be normal value type!";
return 0.0;
}
unsigned int size = aImageA.getDataSize();
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float) * size);
int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
immseKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aImageA.getData(), aImageB.getData(), data, size);
cudaDeviceSynchronize();
float result = thrust::reduce(thrust::device, data, data+size, 0.0f, thrust::plus<float>()) / size;
cudaFree(data);
return result;
}
struct compareMatrixByRows
{
compareMatrixByRows(unsigned int aSize)
: mSize(aSize)
{
};
unsigned int mSize;
__host__ __device__
bool operator()(const float* aVector1, const float* aVector2) const
{
for(unsigned int i=0; i<mSize; ++i)
{
if(aVector1[i] < aVector2[i])
{
return true;
}
else if(aVector1[i] > aVector2[i])
{
return false;
}
}
return false;
}
};
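// sortrows: lexicographic row sort. The matrix is transposed so each original
// row becomes a contiguous column, the column pointers are sorted with the
// comparator above, and each row's original position is recovered by searching
// the unsorted pointer array.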
CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix)
{
CudaMatrix transposeMatrix = transpose(aMatrix);
size_t rows = transposeMatrix.getDimSize(0);
size_t columns = transposeMatrix.getDimSize(1);
thrust::device_vector<float*> vector(columns);
for(unsigned int i=0; i<columns; ++i)
{
vector[i] = transposeMatrix.getData() + i*rows;
}
thrust::device_vector<float*> vectorBack = vector;
thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows));
float* data = nullptr;
float* indexResult = new float[columns];
cudaMalloc((void**)&data, sizeof(float) * rows * columns);
for(unsigned int i=0; i<columns; ++i)
{
cudaMemcpy(data + i*rows, vector[i], sizeof(float) * rows, cudaMemcpyDeviceToDevice);
}
for(unsigned int i=0; i<columns; ++i)
{
auto index = thrust::find(thrust::device, vectorBack.begin(), vectorBack.end(), vector[i]);
indexResult[i] = index - vectorBack.begin();
}
indexMatrix = Aurora::Matrix::fromRawData(indexResult, columns).toDeviceMatrix();
return transpose(CudaMatrix::fromRawData(data, rows, columns));
}
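// inv: matrix inverse via cuBLAS batched LU factorization (getrf/getri) with
// a batch of one. invKernel publishes the two device pointers in the
// device-side pointer arrays the batched API expects.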
__global__ void invKernel(float** aInputPointer, float* aInputData, float** aOutputPointer, float* aOutputData)
{
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx == 0)
{
aInputPointer[0] = aInputData;
aOutputPointer[0] = aOutputData;
}
}
CudaMatrix Aurora::inv(const CudaMatrix &aMatrix)
{
if (aMatrix.getDims() != 2)
{
std::cerr << "Fail! cuda inv args must be 2d matrix!";
return aMatrix;
}
if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1))
{
std::cerr << "Fail! cuda inv args must be square matrix!";
return aMatrix;
}
if (aMatrix.getValueType() != Normal)
{
std::cerr << "Fail! cuda inv args must be normal value type!";
return aMatrix;
}
unsigned int n = aMatrix.getDimSize(0);
unsigned int size = aMatrix.getDataSize();
cublasHandle_t handle;
cublasCreate(&handle);
float* data;
cudaMalloc((void**)&data, sizeof(float) * size);
// Factorize a scratch copy: cublasSgetrfBatched works in place and would
// otherwise destroy the const input matrix.
float* lu = nullptr;
cudaMalloc((void**)&lu, sizeof(float) * size);
cudaMemcpy(lu, aMatrix.getData(), sizeof(float) * size, cudaMemcpyDeviceToDevice);
float** deviceInputPointer;
cudaMalloc((void**)&deviceInputPointer, sizeof(float *));
float **deviceOutputPointer;
cudaMalloc((void**)&deviceOutputPointer, sizeof(float *));
invKernel<<<1, 1>>>(deviceInputPointer, lu, deviceOutputPointer, data);
cudaDeviceSynchronize();
int* devicePivotArray;
cudaMalloc((void**)&devicePivotArray, n * sizeof(int));
int* deviceInfoArray;
cudaMalloc((void**)&deviceInfoArray, sizeof(int));
cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1);
cudaFree(devicePivotArray);
cudaFree(deviceInfoArray);
cudaFree(deviceOutputPointer);
cudaFree(deviceInputPointer);
cudaFree(lu);
cublasDestroy(handle);
return CudaMatrix::fromRawData(data, n, n);
}
/**
 * @brief Run an in-place 1D complex-to-complex FFT over every column of aMatrix.
 *
 * @param aMatrix complex matrix; each column is one transform in the batch
 * @param aDirection 0 FORWARD, 1 BACKWARD (inverse, unscaled)
 */
void ExecFFT(CudaMatrix& aMatrix,int aDirection){
cufftHandle plan;
cudaStream_t stream = NULL;
int batch_size = aMatrix.getDimSize(1);
cufftComplex *d_data = (cufftComplex *)aMatrix.getData();
cufftCreate(&plan);
// int gpus[1] ={0};
// cufftXtSetGPUs(plan,1,gpus);
cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cufftSetStream(plan, stream);
cufftExecC2C(plan, d_data, d_data, aDirection==0?CUFFT_FORWARD:CUFFT_INVERSE);
cudaStreamSynchronize(stream);
cufftDestroy(plan);
cudaStreamDestroy(stream);
}
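// Pack a real column-major matrix into interleaved complex storage, zero-
// padding (or truncating) each column to aDesColEleCount elements for the FFT.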
__global__ void complexFillKernel(float* aInputData, float* aOutput,unsigned int aCopySize,
unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
for (unsigned int offset = 0; offset < aDesColEleCount; offset+=blockDim.x)
{
// The offset must be applied to the indices as well as the bounds check,
// otherwise every iteration rewrites the same first blockDim.x elements.
if(threadIdx.x + offset < aCopySize){
aOutput[2*(idx_d+offset)] = aInputData[idx_s+offset];
aOutput[2*(idx_d+offset) + 1] = 0;
}
else if(threadIdx.x + offset < aDesColEleCount){
aOutput[2*(idx_d+offset)] = 0;
aOutput[2*(idx_d+offset) + 1] = 0;
}
else{
return;
}
}
}
__global__ void complexCopyKernel(float* aInputData, float* aOutput,unsigned int aCopySize,
unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;
for (unsigned int offset = 0; offset < aDesColEleCount; offset+=blockDim.x)
{
// As in complexFillKernel, the offset must be applied to the indices too.
if(threadIdx.x + offset < aCopySize){
aOutput[2*(idx_d+offset)] = aInputData[2*(idx_s+offset)];
aOutput[2*(idx_d+offset) + 1] = aInputData[2*(idx_s+offset)+1];
}
else if(threadIdx.x + offset < aDesColEleCount){
aOutput[2*(idx_d+offset)] = 0;
aOutput[2*(idx_d+offset) + 1] = 0;
}
else{
return;
}
}
}
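// Usage sketch (non-authoritative, mirroring MATLAB fft/ifft semantics):
//   CudaMatrix spectrum = Aurora::fft(x, 1024); // pads/truncates columns to 1024
//   CudaMatrix back = Aurora::ifft(spectrum, 1024); // inverse, scaled by 1/N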
CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0);
// Number of source elements that actually get copied; the rest is zero padding.
size_t needCopySize = (ColEleCount<aMatrix.getDimSize(0))?ColEleCount:aMatrix.getDimSize(0);
size_t bufferSize = ColEleCount*aMatrix.getDimSize(1);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,0);
return ret;
}
CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
if (!aMatrix.isComplex()){
std::cerr<<"ifft input must be complex value"<<std::endl;
return CudaMatrix();
}
size_t ColEleCount = (aFFTSize>0)?aFFTSize:aMatrix.getDimSize(0);
// Number of source elements that actually get copied; the rest is zero padding.
size_t needCopySize = (ColEleCount<aMatrix.getDimSize(0))?ColEleCount:aMatrix.getDimSize(0);
size_t bufferSize = ColEleCount*aMatrix.getDimSize(1);
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,1);
float colEleCountf = 1.f/ColEleCount;
auto lambda = [=]__device__(const float& v){
return v*colEleCountf;
};
thrust::transform(thrust::device,ret.getData(),ret.getData()+ret.getDataSize()*2,ret.getData(),lambda);
return ret;
}
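// fftshift helpers. An even column length is a plain half-swap
// (fftshiftSwapKernel); odd lengths are rotated through a temporary column
// buffer in fftshift()/ifftshift() below. Sizes are counted in floats, so
// multiplying by the value type (1 real, 2 complex) also covers interleaved
// complex data.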
__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount){
unsigned int idx = blockIdx.x *aColEleCount + threadIdx.x;
unsigned int stride = aColEleCount/2;
for (unsigned int i = 0; i < stride; i+=blockDim.x)
{
// Apply the loop offset to the indices as well as the bounds check.
if (threadIdx.x + i < stride)
{
float temp = aData[idx+i];
aData[idx+i] = aData[idx+i+stride];
aData[idx+i+stride] = temp;
}
}
}
__global__ void memcpyColKernel(float* aInputData,float* aOutputData, unsigned int aCopySize,
unsigned int aSrcColEleCount,unsigned int aDesColEleCount){
unsigned int idx = blockIdx.x *aSrcColEleCount + threadIdx.x;
unsigned int idx2 = blockIdx.x *aDesColEleCount + threadIdx.x;
for (int i = 0; i < aCopySize; i+=blockDim.x)
{
if (threadIdx.x + i < aCopySize)
{
aOutputData[idx2+i] = aInputData[idx+i];
}
}
}
void Aurora::fftshift(CudaMatrix &aMatrix){
if (aMatrix.getDimSize(0) % 2 == 0) {
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
} else {
int copySize = aMatrix.getDimSize(0) / 2 + 1;
float *data = nullptr;
cudaMalloc((void **)&data, copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(),
(copySize - 1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaFree(data);
}
}
void Aurora::ifftshift(CudaMatrix &aMatrix){
if (aMatrix.getDimSize(0) % 2 == 0) {
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
} else {
int copySize = aMatrix.getDimSize(0) / 2 + 1;
float *data = nullptr;
cudaMalloc((void **)&data, copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData()+(copySize-1)* aMatrix.getValueType(),
data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(),
(copySize-1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData(),
copySize * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaFree(data);
}
}