#include "AuroraDefs.h"
|
||
#include "CudaMatrix.h"
|
||
#include "Function1D.h"
|
||
#include "Function1D.cuh"
|
||
#include "Matrix.h"
|
||
#include <Function2D.cuh>
|
||
#include <cfloat>
|
||
#include <cstddef>
|
||
#include <cstdio>
|
||
#include <iostream>
|
||
|
||
#include <cmath>
|
||
#include <thrust/device_vector.h>
|
||
#include <thrust/transform.h>
|
||
#include <thrust/reduce.h>
|
||
#include <thrust/device_ptr.h>
|
||
#include <thrust/iterator/constant_iterator.h>
|
||
#include <thrust/iterator/counting_iterator.h>
|
||
#include <thrust/iterator/iterator_facade.h>
|
||
#include <thrust/copy.h>
|
||
#include <thrust/functional.h>
|
||
#include <thrust/complex.h>
|
||
#include <cuda_runtime.h>
|
||
#include <cublas_v2.h>
|
||
#include <cusolverDn.h>
|
||
#include "Function1D.cuh"
|
||
#include "Matrix.h"
|
||
|
||
#include "cufft.h"
|
||
#include <cufftXt.h>
|
||
using namespace Aurora;
|
||
|
||
namespace
|
||
{
|
||
const int THREADS_PER_BLOCK = 256;
|
||
}
|
||
|
||
|
||
// One block per column: reduce each column to its maximum.
__global__ void maxColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
{
    // Index of this thread's first element within its column.
    unsigned int idx = blockIdx.x * aColSize + threadIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : -FLT_MAX;
    __syncthreads();
    // Strided pass: each thread folds every blockDim.x-th value of its column
    // into the first blockDim.x shared-memory slots.
    for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x) {
        if (threadIdx.x + offset < aColSize) {
            shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset]);
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
        }
        __syncthreads();
    }

    // Thread 0 writes this column's maximum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}

// One block per row: reduce each row to its maximum. aColSize is the column
// length (the stride between row elements), aRowSize the number of elements per row.
__global__ void maxRowKernel(float* aInputData, float* aOutput, unsigned int aColSize, unsigned int aRowSize)
{
    // Base index of this thread's first element within its row.
    unsigned int idx = threadIdx.x * aColSize + blockIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : -FLT_MAX;
    __syncthreads();
    // Strided pass: fold every blockDim.x-th value into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowSize) {
            shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]);
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] = fmaxf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
        }
        __syncthreads();
    }

    // Thread 0 writes this row's maximum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}

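// Worked trace of the two-stage reduction used by the max/min/sum kernels here
// (illustration only, with a hypothetical blockDim.x of 4 instead of 256):
// for a column [3 1 4 1 5 9 2 6 5 3] of length 10, the strided pass folds
// elements 4..9 into slots 0..3,
//   slot0 = max(3,5,5) = 5   slot1 = max(1,9,3) = 9
//   slot2 = max(4,2)   = 4   slot3 = max(1,6)   = 6
// and the tree pass then reduces [5 9 4 6] -> [5 9] -> [9],
// which thread 0 writes to aOutput[blockIdx.x].
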
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction) {
    long a, b;
    return max(aMatrix, direction, a, b);
}

CudaMatrix Aurora::max(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx)
{
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    // For vectors, row/column reduction degenerates to reducing everything.
    if (aMatrix.isVector()) {
        direction = All;
    }
    switch (direction)
    {
    case All: {
        thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
        auto max_iter = thrust::max_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize());
        int index = max_iter - d_ptr;
        rowIdx = index % aMatrix.getDimSize(0);
        colIdx = index / aMatrix.getDimSize(0);
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float));
        auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1);
        ret.setValue(0, *max_iter);
        return ret;
    }
    case Row:
    {
        float* matData = aMatrix.getData();
        float* retData = nullptr;
        int rowEleCount = aMatrix.getDimSize(1);
        cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(0));
        maxRowKernel<<<aMatrix.getDimSize(0), 256>>>(matData, retData, aMatrix.getDimSize(0), rowEleCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1);
        return ret;
    }
    case Column:
    default:
    {
        float* matData = aMatrix.getData();
        float* retData = nullptr;
        int colEleCount = aMatrix.getDimSize(0);
        cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(1));
        maxColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colEleCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1));
        return ret;
    }
    }
}

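// Usage sketch (hypothetical values; m is a 3x2 real CudaMatrix on the device):
//   long r = 0, c = 0;
//   CudaMatrix colMax = Aurora::max(m, Column);     // 1x2: maximum of each column
//   CudaMatrix rowMax = Aurora::max(m, Row);        // 3x1: maximum of each row
//   CudaMatrix best   = Aurora::max(m, All, r, c);  // 1x1: global maximum, (r,c) = its position
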
// Broadcast max between a vector and a matrix.
CudaMatrix vxmMax(CudaMatrix aVec, CudaMatrix aMat) {
    // col-vec x mat
    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
        std::cout << "max mat and col-vec " << std::endl;
        size_t size = aMat.getDataSize();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) {
            return fmaxf(x, y);
        };
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            thrust::transform(thrust::device, aVec.getData(),
                              aVec.getData() + aVec.getDataSize(),
                              aMat.getData() + aMat.getDimSize(0) * i,
                              data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
                                               aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
    }
    // row-vec x mat
    else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1))
    {
        std::cout << "max mat and row-vec " << std::endl;
        size_t size = aMat.getDataSize();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            float v = aVec.getValue(i);
            auto lambda = [=] __host__ __device__ (const float& x) {
                return fmaxf(x, v);
            };
            thrust::transform(thrust::device,
                              aMat.getData() + aMat.getDimSize(0) * i,
                              aMat.getData() + aMat.getDimSize(0) * (i + 1),
                              data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
                                               aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
    }
    std::cerr
        << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
        << std::endl;
    return CudaMatrix();
}

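// Broadcast sketch (hypothetical values): with column vector v = [0 10]^T and
//   A = [1 4 7]
//       [2 5 8]
// max(v, A) takes fmaxf of v against each column of A, giving
//   [ 1  4  7]
//   [10 10 10]
// A 1xN row vector is broadcast against the rows in the same way.
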
CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    if (aOther.getDimSize(2) > 1 || aOther.isComplex()) {
        std::cerr
            << (aOther.getDimSize(2) > 1 ? "max() not support 3D data!" : "max() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    // same shape
    if (aMatrix.compareShape(aOther)) {
        size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) {
            return fmaxf(x, y);
        };
        thrust::transform(thrust::device, aMatrix.getData(),
                          aMatrix.getData() + aMatrix.getDataSize(), aOther.getData(),
                          data, lambda);
        return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
                                               aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
    }
    // one is scalar
    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) {
        float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0);
        auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix;
        return max(matrix, scalar);
    }
    else if (aMatrix.isVector()) {
        return ::vxmMax(aMatrix, aOther);
    }
    else if (aOther.isVector())
    {
        return ::vxmMax(aOther, aMatrix);
    }
    std::cerr
        << "max(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
        << std::endl;
    return CudaMatrix();
}

CudaMatrix Aurora::max(const CudaMatrix &aMatrix, const float aValue) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__ (const float& x) {
        return fmaxf(x, aValue);
    };
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(),
                      data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
                                           aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

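// Usage sketch (hypothetical): max(m, 0.f) floors every element at zero,
// i.e. a ReLU-style lower clamp; combined with the min() overloads below,
//   min(max(m, lo), hi)
// clamps all elements of m into [lo, hi].
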
// One block per column: reduce each column to its minimum.
__global__ void minColKernel(float* aInputData, float* aOutput, unsigned int aColSize)
{
    // Index of this thread's first element within its column.
    unsigned int idx = blockIdx.x * aColSize + threadIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aColSize) ? aInputData[idx] : FLT_MAX;
    __syncthreads();
    // Strided pass: fold every blockDim.x-th value into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aColSize; offset += blockDim.x) {
        if (threadIdx.x + offset < aColSize) {
            shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset]);
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[i + threadIdx.x]);
        }
        __syncthreads();
    }

    // Thread 0 writes this column's minimum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}

// One block per row: reduce each row to its minimum.
__global__ void minRowKernel(float* aInputData, float* aOutput, unsigned int aColSize, unsigned int aRowSize)
{
    // Base index of this thread's first element within its row.
    unsigned int idx = threadIdx.x * aColSize + blockIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aRowSize) ? aInputData[idx] : FLT_MAX;
    __syncthreads();
    // Strided pass: fold every blockDim.x-th value into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowSize; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowSize) {
            shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], aInputData[idx + offset * aColSize]);
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] = fminf(shared_data[threadIdx.x], shared_data[threadIdx.x + i]);
        }
        __syncthreads();
    }

    // Thread 0 writes this row's minimum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction) {
    long a, b;
    return min(aMatrix, direction, a, b);
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, FunctionDirection direction, long& rowIdx, long& colIdx)
{
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
                  << std::endl;
        return CudaMatrix();
    }
    // For vectors, row/column reduction degenerates to reducing everything.
    if (aMatrix.isVector()) {
        direction = All;
    }
    switch (direction)
    {
    case All: {
        thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(aMatrix.getData());
        auto min_iter = thrust::min_element(thrust::device, d_ptr, d_ptr + aMatrix.getDataSize());
        int index = min_iter - d_ptr;
        rowIdx = index % aMatrix.getDimSize(0);
        colIdx = index / aMatrix.getDimSize(0);
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float));
        auto ret = Aurora::CudaMatrix::fromRawData(data, 1, 1, 1);
        ret.setValue(0, *min_iter);
        return ret;
    }
    case Row:
    {
        float* matData = aMatrix.getData();
        float* retData = nullptr;
        int rowEleCount = aMatrix.getDimSize(1);
        cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(0));
        minRowKernel<<<aMatrix.getDimSize(0), 256>>>(matData, retData, aMatrix.getDimSize(0), rowEleCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1);
        return ret;
    }
    case Column:
    default:
    {
        float* matData = aMatrix.getData();
        float* retData = nullptr;
        int colEleCount = aMatrix.getDimSize(0);
        cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(1));
        minColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colEleCount);
        cudaDeviceSynchronize();
        CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1));
        return ret;
    }
    }
}

// Broadcast min between a vector and a matrix.
CudaMatrix vxmMin(CudaMatrix aVec, CudaMatrix aMat) {
    // col-vec x mat
    if (aVec.getDimSize(1) == 1 && aVec.getDimSize(0) == aMat.getDimSize(0)) {
        std::cout << "min mat and col-vec " << std::endl;
        size_t size = aMat.getDataSize();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) {
            return fminf(x, y);
        };
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            thrust::transform(thrust::device, aVec.getData(),
                              aVec.getData() + aVec.getDataSize(),
                              aMat.getData() + aMat.getDimSize(0) * i,
                              data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
                                               aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
    }
    // row-vec x mat
    else if (aVec.getDimSize(0) == 1 && aVec.getDimSize(1) == aMat.getDimSize(1))
    {
        std::cout << "min mat and row-vec " << std::endl;
        size_t size = aMat.getDataSize();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        for (int i = 0; i < aMat.getDimSize(1); ++i) {
            float v = aVec.getValue(i);
            auto lambda = [=] __host__ __device__ (const float& x) {
                return fminf(x, v);
            };
            thrust::transform(thrust::device,
                              aMat.getData() + aMat.getDimSize(0) * i,
                              aMat.getData() + aMat.getDimSize(0) * (i + 1),
                              data + aMat.getDimSize(0) * i, lambda);
        }
        return Aurora::CudaMatrix::fromRawData(data, aMat.getDimSize(0),
                                               aMat.getDimSize(1), aMat.getDimSize(2), aMat.getValueType());
    }
    std::cerr
        << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
        << std::endl;
    return CudaMatrix();
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const CudaMatrix &aOther) {
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    if (aOther.getDimSize(2) > 1 || aOther.isComplex()) {
        std::cerr
            << (aOther.getDimSize(2) > 1 ? "min() not support 3D data!" : "min() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    // same shape
    if (aMatrix.compareShape(aOther)) {
        size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
        float* data = nullptr;
        cudaMalloc((void**)&data, sizeof(float) * size);
        auto lambda = [=] __host__ __device__ (const float& x, const float& y) {
            return fminf(x, y);
        };
        thrust::transform(thrust::device, aMatrix.getData(),
                          aMatrix.getData() + aMatrix.getDataSize(), aOther.getData(),
                          data, lambda);
        return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
                                               aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
    }
    // one is scalar
    else if (aMatrix.getDataSize() == 1 || aOther.getDataSize() == 1) {
        float scalar = (aMatrix.getDataSize() == 1) ? aMatrix.getValue(0) : aOther.getValue(0);
        auto matrix = (aMatrix.getDataSize() == 1) ? aOther : aMatrix;
        return min(matrix, scalar);
    }
    else if (aMatrix.isVector()) {
        return ::vxmMin(aMatrix, aOther);
    }
    else if (aOther.isVector())
    {
        return ::vxmMin(aOther, aMatrix);
    }
    std::cerr
        << "min(A,B) with matrix must be like A[MxN] - B[1xN] or A[Mx1] - B[MxN]"
        << std::endl;
    return CudaMatrix();
}

CudaMatrix Aurora::min(const CudaMatrix &aMatrix, const float aValue) {
    size_t size = aMatrix.getDataSize() * aMatrix.getValueType();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    auto lambda = [=] __host__ __device__ (const float& x) {
        return fminf(x, aValue);
    };
    thrust::transform(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(),
                      data, lambda);
    return Aurora::CudaMatrix::fromRawData(data, aMatrix.getDimSize(0),
                                           aMatrix.getDimSize(1), aMatrix.getDimSize(2), aMatrix.getValueType());
}

// One block per column: reduce each column to its sum. Partial sums are
// accumulated in double to limit round-off over long columns.
__global__ void sumColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    // Index of this thread's first element within its column.
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    __shared__ double shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aColEleCount) ? aInputData[idx] : 0.0;
    __syncthreads();
    // Strided pass: fold every blockDim.x-th value into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aColEleCount) {
            shared_data[threadIdx.x] += (double)aInputData[idx + offset];
        }
        __syncthreads();
    }

    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[i + threadIdx.x];
        }
        __syncthreads();
    }

    // Thread 0 writes this column's sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = (float)shared_data[0];
    }
}

// One block per row: reduce each row to its sum.
__global__ void sumRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount)
{
    // Base index of this thread's first element within its row.
    unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x;
    __shared__ float shared_data[256];
    // Each thread loads one element into shared memory.
    shared_data[threadIdx.x] = (threadIdx.x < aRowEleCount) ? aInputData[idx] : 0.0;
    __syncthreads();
    // Strided pass: fold every blockDim.x-th value into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowEleCount) {
            shared_data[threadIdx.x] += aInputData[idx + offset * aColEleCount];
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x] += shared_data[threadIdx.x + i];
        }
        __syncthreads();
    }

    // Thread 0 writes this row's sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0];
    }
}

// Reduce a complex (interleaved re/im) array in 4096-element chunks: block b
// sums elements [b*4096, (b+1)*4096) and writes one partial complex sum, real
// parts to aOutput[b] and imaginary parts to aOutput[b + gridDim.x].
__global__ void sumZAllColKernel(float* aInputData, float* aOutput, int aTotalSize)
{
    // Index of this thread's first complex element within its chunk.
    unsigned int idx = blockIdx.x * 4096 + threadIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element into shared memory.
    bool flag = threadIdx.x < 4096 && idx < aTotalSize;
    shared_data[threadIdx.x][0] = flag ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = flag ? aInputData[idx * 2 + 1] : 0.0;

    __syncthreads();
    // Strided pass: fold every blockDim.x-th element into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < 4096; offset += blockDim.x) {
        if (threadIdx.x + offset < 4096 && idx + offset < aTotalSize) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
        }
        __syncthreads();
    }

    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
            shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
        }
        __syncthreads();
    }

    // Thread 0 writes this chunk's partial sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x] = shared_data[0][0];
        aOutput[blockIdx.x + gridDim.x] = shared_data[0][1];
    }
}

// One block per column of a complex (interleaved re/im) matrix: reduce each
// column to its complex sum, written interleaved to aOutput[2b], aOutput[2b+1].
__global__ void sumZColKernel(float* aInputData, float* aOutput, int aColEleCount)
{
    // Index of this thread's first complex element within its column.
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element into shared memory.
    shared_data[threadIdx.x][0] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = (threadIdx.x < aColEleCount) ? aInputData[idx * 2 + 1] : 0.0;

    __syncthreads();
    // Strided pass: fold every blockDim.x-th element into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aColEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aColEleCount) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * 2 + 1];
        }
        __syncthreads();
    }

    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[i + threadIdx.x][0];
            shared_data[threadIdx.x][1] += shared_data[i + threadIdx.x][1];
        }
        __syncthreads();
    }

    // Thread 0 writes this column's complex sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x * 2] = shared_data[0][0];
        aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
    }
}

// One block per row of a complex (interleaved re/im) matrix: reduce each row
// to its complex sum, written interleaved to aOutput[2b], aOutput[2b+1].
__global__ void sumZRowKernel(float* aInputData, float* aOutput, unsigned int aColEleCount, unsigned int aRowEleCount)
{
    // Base index of this thread's first complex element within its row.
    unsigned int idx = threadIdx.x * aColEleCount + blockIdx.x;
    __shared__ float shared_data[256][2];
    // Each thread loads one complex element into shared memory.
    shared_data[threadIdx.x][0] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2] : 0.0;
    shared_data[threadIdx.x][1] = (threadIdx.x < aRowEleCount) ? aInputData[idx * 2 + 1] : 0.0;

    __syncthreads();
    // Strided pass: fold every blockDim.x-th element into the first blockDim.x slots.
    for (int offset = blockDim.x; offset < aRowEleCount; offset += blockDim.x) {
        if (threadIdx.x + offset < aRowEleCount) {
            shared_data[threadIdx.x][0] += aInputData[idx * 2 + offset * aColEleCount * 2];
            shared_data[threadIdx.x][1] += aInputData[idx * 2 + offset * aColEleCount * 2 + 1];
        }
        __syncthreads();
    }
    // Tree-reduce the first segment.
    for (int i = blockDim.x / 2; i > 0; i >>= 1) {
        if (threadIdx.x < i) {
            shared_data[threadIdx.x][0] += shared_data[threadIdx.x + i][0];
            shared_data[threadIdx.x][1] += shared_data[threadIdx.x + i][1];
        }
        __syncthreads();
    }

    // Thread 0 writes this row's complex sum to global memory.
    if (threadIdx.x == 0) {
        aOutput[blockIdx.x * 2] = shared_data[0][0];
        aOutput[blockIdx.x * 2 + 1] = shared_data[0][1];
    }
}

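// Layout sketch for the sumZ* kernels above: a complex matrix stores interleaved
// floats, so complex element k of a column is (aInputData[2k], aInputData[2k+1])
// = (re, im). E.g. the column (1+2i, 3+4i) is stored as [1 2 3 4], and its
// column sum (4+6i) is written back interleaved as [4 6] by sumZColKernel.
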
CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction) {
    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "sum() not support 3D data!"
                  << std::endl;
        return CudaMatrix();
    }
    // A 1xN row vector has single-element columns; sum along the row instead.
    if (direction == Column && aMatrix.getDimSize(0) == 1) {
        direction = Row;
    }
    if (!aMatrix.isComplex())
    {
        switch (direction)
        {
        case All:
        {
            float* data = nullptr;
            cudaMalloc((void**)&data, sizeof(float));
            auto ret = CudaMatrix::fromRawData(data, 1, 1, 1);
            float result = thrust::reduce(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), 0.0f, thrust::plus<float>());
            ret.setValue(0, result);
            return ret;
        }
        case Row:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int rowEleCount = aMatrix.getDimSize(1);
            cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(0));
            sumRowKernel<<<aMatrix.getDimSize(0), 256>>>(matData, retData, aMatrix.getDimSize(0), rowEleCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1);
            return ret;
        }
        case Column:
        default:
        {
            std::cout << "Column sum" << std::endl;
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            if (colElementCount == 1) return aMatrix;
            cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(1));
            sumColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1));
            return ret;
        }
        }
    }
    else {
        switch (direction)
        {
        case All:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            // Divide the whole array into 4096-element chunks, then reduce the
            // per-chunk partial sums as a column sum.
            int fakeCol = (int)ceilf((float)aMatrix.getDataSize() / 4096.0f);
            cudaMalloc((void**)&retData, sizeof(float) * 2 * fakeCol);
            auto ret = CudaMatrix::fromRawData(retData, 1, fakeCol, 1, Complex);
            sumZAllColKernel<<<fakeCol, 256>>>(matData, retData, aMatrix.getDataSize());
            float* result_data = nullptr;
            cudaMalloc((void**)&result_data, sizeof(float) * 2);
            auto ret2 = CudaMatrix::fromRawData(result_data, 1, 1, 1, Complex);
            // Use a float initializer: an integer 0 would make thrust::reduce
            // accumulate in int and truncate the result.
            float result = thrust::reduce(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), 0.0f, thrust::plus<float>());
            ret2.setValue(0, result);
            result = thrust::reduce(thrust::device, ret.getData() + ret.getDataSize(), ret.getData() + ret.getDataSize() * 2, 0.0f, thrust::plus<float>());
            ret2.setValue(1, result);
            return ret2;
        }
        case Row:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int rowElementCount = aMatrix.getDimSize(1);
            cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(0) * 2);
            sumZRowKernel<<<aMatrix.getDimSize(0), 256>>>(matData, retData, aMatrix.getDimSize(0), rowElementCount);
            cudaDeviceSynchronize();
            // The buffer holds interleaved complex sums, so mark the result Complex.
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, aMatrix.getDimSize(0), 1, 1, Complex);
            return ret;
        }
        case Column:
        default:
        {
            float* matData = aMatrix.getData();
            float* retData = nullptr;
            int colElementCount = aMatrix.getDimSize(0);
            if (colElementCount == 1) return aMatrix;
            cudaMalloc((void**)&retData, sizeof(float) * aMatrix.getDimSize(1) * 2);
            sumZColKernel<<<aMatrix.getDimSize(1), 256>>>(matData, retData, colElementCount);
            cudaDeviceSynchronize();
            CudaMatrix ret = Aurora::CudaMatrix::fromRawData(retData, 1, aMatrix.getDimSize(1), 1, Complex);
            return ret;
        }
        }
    }
}

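// Usage sketch (hypothetical 3x2 real matrix m):
//   sum(m, All)    -> 1x1, total of all 6 elements
//   sum(m, Column) -> 1x2, one total per column
//   sum(m, Row)    -> 3x1, one total per row
// For complex input, All first reduces 4096-element chunks on the GPU
// (sumZAllColKernel) and then reduces the per-chunk partial sums with thrust.
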
CudaMatrix Aurora::mean(const CudaMatrix &aMatrix, FunctionDirection direction) {
    if (aMatrix.getDimSize(2) > 1) {
        std::cerr << "mean() not support 3D data!"
                  << std::endl;
        return CudaMatrix();
    }
    // A 1xN row vector has single-element columns; average along the row instead.
    if (direction == Column && aMatrix.getDimSize(0) == 1) {
        direction = Row;
    }
    if (!aMatrix.isComplex())
    {
        switch (direction)
        {
        case All:
        {
            auto ret = sum(aMatrix, All);
            ret.setValue(0, ret.getValue(0) / ((float)aMatrix.getDataSize()));
            return ret;
        }
        case Row:
        {
            auto ret = sum(aMatrix, Row);
            float count = (float)aMatrix.getDimSize(1);
            auto lambda = [=] __device__ (const float& v) {
                return v / count;
            };
            thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda);
            return ret;
        }
        case Column:
        default:
        {
            auto ret = sum(aMatrix, Column);
            float count = (float)aMatrix.getDimSize(0);
            auto lambda = [=] __device__ (const float& v) {
                return v / count;
            };
            thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize(), ret.getData(), lambda);
            return ret;
        }
        }
    }
    else {
        std::cerr << "mean() not support complex data!"
                  << std::endl;
        return CudaMatrix();
    }
}

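// Sketch: mean() is sum() followed by an element-wise divide. For a 3x2
// matrix, mean(m, Column) divides each of the two column sums by 3 (the
// column length), which is the count captured by the lambda above.
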
// Random-access iterator that walks one matrix row: each step jumps a whole
// column (col_elements_ values) through the column-major storage.
template <typename ValueType>
class RowElementIterator : public thrust::iterator_facade<
                               RowElementIterator<ValueType>,
                               ValueType,
                               thrust::device_system_tag,
                               thrust::random_access_traversal_tag,
                               ValueType&> {
public:
    // Constructor.
    __host__ __device__
    RowElementIterator(ValueType* ptr, int aColElementCount = 1) : ptr_(ptr), col_elements_(aColElementCount) {}

    __host__ __device__
    ValueType& dereference() const {
        return *ptr_;
    }

    // Increment operator.
    __host__ __device__
    void increment() {
        ptr_ = ptr_ + col_elements_;
    }

    // Decrement operator.
    __host__ __device__
    void decrement() {
        ptr_ = ptr_ - col_elements_;
    }

    // Addition operator.
    __host__ __device__
    void advance(typename RowElementIterator::difference_type n) {
        ptr_ += col_elements_ * n;
    }

    // Distance between two iterators.
    __host__ __device__
    typename RowElementIterator::difference_type distance_to(const RowElementIterator& other) const {
        return (other.ptr_ - ptr_) / col_elements_;
    }

    // Equality comparison.
    __host__ __device__
    bool equal(const RowElementIterator& other) const {
        return ptr_ == other.ptr_;
    }

private:
    ValueType* ptr_;
    int col_elements_;
};

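// Iterator sketch: for a column-major 3x2 matrix stored as [a b c d e f]
// (columns [a b c] and [d e f]), RowElementIterator<float>(data + 1, 3)
// visits data[1], data[4], i.e. the second row (b, e); advance(n) jumps
// n * col_elements_ values, which lets thrust::sort treat a row as a range.
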
CudaMatrix Aurora::sort(const CudaMatrix &aMatrix, FunctionDirection direction)
{
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    // Sort a deep copy in place via the rvalue overload.
    return sort(aMatrix.deepCopy(), direction);
}

CudaMatrix Aurora::sort(CudaMatrix &&aMatrix, FunctionDirection direction)
{
    if (aMatrix.getDimSize(2) > 1 || aMatrix.isComplex()) {
        std::cerr
            << (aMatrix.getDimSize(2) > 1 ? "sort() not support 3D data!" : "sort() not support complex value type!")
            << std::endl;
        return CudaMatrix();
    }
    float* data = aMatrix.getData();
    int colElementCount = aMatrix.getDimSize(0);
    switch (direction)
    {
    case Row:
    {
        for (size_t i = 0; i < colElementCount; i++)
        {
            thrust::sort(thrust::device, RowElementIterator<float>(data + i, colElementCount),
                         RowElementIterator<float>(data + aMatrix.getDataSize() + i, colElementCount));
        }
        return aMatrix;
    }
    case Column:
    {
        int rowElementCount = aMatrix.getDimSize(1);
        for (size_t i = 0; i < rowElementCount; i++)
        {
            thrust::sort(thrust::device, data + i * colElementCount,
                         data + (i + 1) * colElementCount);
        }

        return aMatrix;
    }
    default:
    {
        std::cerr
            << "Unsupported direction for sort!"
            << std::endl;
        return CudaMatrix();
    }
    }
}

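// Usage sketch (hypothetical 3x2 matrix m):
//   Aurora::sort(m, Column) sorts each column ascending on a deep copy;
//   Aurora::sort(m, Row) sorts each row through RowElementIterator.
// There is no 'All' direction here; unsupported directions return an
// empty CudaMatrix.
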
// Element-wise squared difference of two arrays.
__global__ void immseKernel(float* aInputData1, float* aInputData2, float* aOutputData, unsigned int aInputSize)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < aInputSize)
    {
        aOutputData[idx] = powf(aInputData1[idx] - aInputData2[idx], 2);
    }
}

float Aurora::immse(const CudaMatrix &aImageA, const CudaMatrix &aImageB)
{
    if (aImageA.getDims() != 2 || aImageB.getDims() != 2)
    {
        std::cerr << "Fail! cuda immse args must all be 2d matrix!" << std::endl;
        return 0.0;
    }

    if (!aImageB.compareShape(aImageA))
    {
        std::cerr << "Fail! cuda immse args must be same shape!" << std::endl;
        return 0.0;
    }

    if (aImageA.getValueType() != Normal || aImageB.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda immse args must be normal value type!" << std::endl;
        return 0.0;
    }

    unsigned int size = aImageA.getDataSize();
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * size);
    int blocksPerGrid = (size + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    immseKernel<<<blocksPerGrid, THREADS_PER_BLOCK>>>(aImageA.getData(), aImageB.getData(), data, size);
    cudaDeviceSynchronize();
    float result = thrust::reduce(thrust::device, data, data + size, 0.0f, thrust::plus<float>()) / size;
    cudaFree(data);
    return result;
}

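// immse computes the mean squared error (1/N) * sum_i (A_i - B_i)^2.
// Tiny example: A = [0 0], B = [1 3]  ->  (1 + 9) / 2 = 5.
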
// Lexicographic comparison of two rows, each given as a pointer to its first element.
struct compareMatrixByRows
{
    compareMatrixByRows(unsigned int aSize)
        : mSize(aSize)
    {
    }
    unsigned int mSize;
    __host__ __device__
    bool operator()(const float* aVector1, const float* aVector2) const
    {
        for (unsigned int i = 0; i < mSize; ++i)
        {
            if (aVector1[i] < aVector2[i])
            {
                return true;
            }
            else if (aVector1[i] > aVector2[i])
            {
                return false;
            }
        }
        return false;
    }
};

CudaMatrix Aurora::sortrows(const CudaMatrix &aMatrix, CudaMatrix& indexMatrix)
{
    // Transpose so each row becomes a contiguous column, then sort pointers
    // to those columns lexicographically.
    CudaMatrix transposeMatrix = transpose(aMatrix);
    size_t rows = transposeMatrix.getDimSize(0);
    size_t columns = transposeMatrix.getDimSize(1);
    thrust::device_vector<float*> vector(columns);
    for (unsigned int i = 0; i < columns; ++i)
    {
        vector[i] = transposeMatrix.getData() + i * rows;
    }
    thrust::device_vector<float*> vectorBack = vector;
    thrust::sort(thrust::device, vector.begin(), vector.end(), compareMatrixByRows(rows));

    float* data = nullptr;
    float* indexResult = new float[columns];
    cudaMalloc((void**)&data, sizeof(float) * rows * columns);
    for (unsigned int i = 0; i < columns; ++i)
    {
        cudaMemcpy(data + i * rows, vector[i], sizeof(float) * rows, cudaMemcpyDeviceToDevice);
    }

    // Recover each sorted row's original position.
    for (unsigned int i = 0; i < columns; ++i)
    {
        auto index = thrust::find(thrust::device, vectorBack.begin(), vectorBack.end(), vector[i]);
        indexResult[i] = index - vectorBack.begin();
    }

    indexMatrix = Aurora::Matrix::fromRawData(indexResult, columns).toDeviceMatrix();

    return transpose(CudaMatrix::fromRawData(data, rows, columns));
}

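// Sketch (hypothetical 3x2 matrix): sortrows orders whole rows lexicographically,
// so rows [3 1; 1 2; 1 0] sort to [1 0; 1 2; 3 1], and indexMatrix receives each
// sorted row's original position (here 2, 1, 0).
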
// Write the device addresses of the input/output matrices into the pointer
// arrays expected by the batched cuBLAS calls (batch size 1).
__global__ void invKernel(float** aInputPointer, float* aInputData, float** aOutputPointer, float* aOutputData)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx == 0)
    {
        aInputPointer[0] = aInputData;
        aOutputPointer[0] = aOutputData;
    }
}

CudaMatrix Aurora::inv(const CudaMatrix &aMatrix)
{
    if (aMatrix.getDims() != 2)
    {
        std::cerr << "Fail! cuda inv args must be 2d matrix!" << std::endl;
        return aMatrix;
    }
    if (aMatrix.getDimSize(0) != aMatrix.getDimSize(1))
    {
        std::cerr << "Fail! cuda inv args must be square matrix!" << std::endl;
        return aMatrix;
    }
    if (aMatrix.getValueType() != Normal)
    {
        std::cerr << "Fail! cuda inv args must be normal value type!" << std::endl;
        return aMatrix;
    }
    unsigned int n = aMatrix.getDimSize(0);
    unsigned int size = aMatrix.getDataSize();
    cublasHandle_t handle;
    cublasCreate(&handle);
    float* data;
    cudaMalloc((void**)&data, sizeof(float) * size);

    float** deviceInputPointer;
    cudaMalloc((void**)&deviceInputPointer, sizeof(float *));

    float** deviceOutputPointer;
    cudaMalloc((void**)&deviceOutputPointer, sizeof(float *));

    invKernel<<<1, 1>>>(deviceInputPointer, aMatrix.getData(), deviceOutputPointer, data);
    cudaDeviceSynchronize();
    int* devicePivotArray;
    cudaMalloc((void**)&devicePivotArray, n * sizeof(int));
    int* deviceInfoArray;
    cudaMalloc((void**)&deviceInfoArray, sizeof(int));
    // LU factorization followed by inversion, as a batch of one. Note that
    // cublasSgetrfBatched factorizes in place, so the input matrix's device
    // buffer is overwritten by its LU factors.
    cublasSgetrfBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceInfoArray, 1);
    cublasSgetriBatched(handle, n, deviceInputPointer, n, devicePivotArray, deviceOutputPointer, n, deviceInfoArray, 1);
    cudaFree(devicePivotArray);
    cudaFree(deviceInfoArray);
    cudaFree(deviceOutputPointer);
    cudaFree(deviceInputPointer);
    cublasDestroy(handle);

    return CudaMatrix::fromRawData(data, n, n);
}

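// Sanity-check sketch (hypothetical): multiplying a well-conditioned square
// matrix A by inv(A) should give the identity up to single-precision
// round-off; deviations beyond that usually indicate a singular input,
// reported through the info array the batched cuBLAS calls fill in.
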
/**
 * @brief In-place batched 1D FFT over the columns of aMatrix: one C2C
 *        single-precision transform per column, via cuFFT.
 *
 * @param aMatrix    complex matrix, transformed in place
 * @param aDirection 0 FORWARD, 1 BACKWARD
 */
void ExecFFT(CudaMatrix& aMatrix, int aDirection) {
    cufftHandle plan;
    cudaStream_t stream = NULL;
    int batch_size = aMatrix.getDimSize(1);
    cufftComplex *d_data = (cufftComplex *)aMatrix.getData();
    cufftCreate(&plan);
    // int gpus[1] = {0};
    // cufftXtSetGPUs(plan, 1, gpus);
    cufftPlan1d(&plan, aMatrix.getDimSize(0), CUFFT_C2C, batch_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cufftSetStream(plan, stream);
    cufftExecC2C(plan, d_data, d_data, aDirection == 0 ? CUFFT_FORWARD : CUFFT_INVERSE);
    cudaStreamSynchronize(stream);
    cufftDestroy(plan);
    cudaStreamDestroy(stream);
}

// Copy a real column into an interleaved complex column, zeroing the imaginary
// parts and any destination elements beyond aCopySize.
__global__ void complexFillKernel(float* aInputData, float* aOutput, unsigned int aCopySize,
                                  unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
    unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
    unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;

    for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x)
    {
        // The offset must be applied to the indices as well; otherwise every
        // iteration would rewrite the same element.
        if (threadIdx.x + offset < aCopySize) {
            aOutput[2 * (idx_d + offset)] = aInputData[idx_s + offset];
            aOutput[2 * (idx_d + offset) + 1] = 0;
        }
        else if (threadIdx.x + offset < aDesColEleCount) {
            aOutput[2 * (idx_d + offset)] = 0;
            aOutput[2 * (idx_d + offset) + 1] = 0;
        }
        else {
            return;
        }
    }
}

// Copy an interleaved complex column into a (possibly longer) complex column,
// zeroing destination elements beyond aCopySize.
__global__ void complexCopyKernel(float* aInputData, float* aOutput, unsigned int aCopySize,
                                  unsigned int aSrcColEleCount, unsigned int aDesColEleCount)
{
    unsigned int idx_d = blockIdx.x * aDesColEleCount + threadIdx.x;
    unsigned int idx_s = blockIdx.x * aSrcColEleCount + threadIdx.x;

    for (int offset = 0; offset < aDesColEleCount; offset += blockDim.x)
    {
        // Apply the offset to the indices as well (see complexFillKernel).
        if (threadIdx.x + offset < aCopySize) {
            aOutput[2 * (idx_d + offset)] = aInputData[(idx_s + offset) * 2];
            aOutput[2 * (idx_d + offset) + 1] = aInputData[(idx_s + offset) * 2 + 1];
        }
        else if (threadIdx.x + offset < aDesColEleCount) {
            aOutput[2 * (idx_d + offset)] = 0;
            aOutput[2 * (idx_d + offset) + 1] = 0;
        }
        else {
            return;
        }
    }
}

CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize) {
    size_t colEleCount = (aFFTSize > 0) ? aFFTSize : aMatrix.getDimSize(0);
    // Number of non-zero source values that actually need copying.
    size_t needCopySize = (colEleCount < aMatrix.getDimSize(0)) ? colEleCount : aMatrix.getDimSize(0);
    size_t bufferSize = colEleCount * aMatrix.getDimSize(1);
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * 2 * bufferSize);

    complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), colEleCount);
    auto ret = Aurora::CudaMatrix::fromRawData(data, colEleCount, aMatrix.getDimSize(1), 1, Complex);
    ExecFFT(ret, 0);
    return ret;
}

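// Sketch of the aFFTSize semantics (apparently mirroring MATLAB-style fft(x, n)):
// with columns of length 8 and aFFTSize = 16, complexFillKernel zero-pads each
// column to 16 complex bins before transforming; with aFFTSize = 4 it truncates
// to the first 4 samples. aFFTSize <= 0 keeps the original column length.
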
CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize) {
    if (!aMatrix.isComplex()) {
        std::cerr << "ifft input must be complex value" << std::endl;
        return CudaMatrix();
    }
    size_t colEleCount = (aFFTSize > 0) ? aFFTSize : aMatrix.getDimSize(0);
    // Number of non-zero source values that actually need copying.
    size_t needCopySize = (colEleCount < aMatrix.getDimSize(0)) ? colEleCount : aMatrix.getDimSize(0);
    size_t bufferSize = colEleCount * aMatrix.getDimSize(1);
    float* data = nullptr;
    cudaMalloc((void**)&data, sizeof(float) * 2 * bufferSize);

    complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0), colEleCount);
    auto ret = Aurora::CudaMatrix::fromRawData(data, colEleCount, aMatrix.getDimSize(1), 1, Complex);
    ExecFFT(ret, 1);
    // cuFFT's inverse transform is unnormalized; scale by 1/N.
    float colEleCountf = 1.f / colEleCount;
    auto lambda = [=] __device__ (const float& v) {
        return v * colEleCountf;
    };
    thrust::transform(thrust::device, ret.getData(), ret.getData() + ret.getDataSize() * 2, ret.getData(), lambda);
    return ret;
}

// Swap the two halves of each column (even column lengths only). aColEleCount
// is given in floats, so interleaved complex data works unchanged.
__global__ void fftshiftSwapKernel(float* aData, unsigned int aColEleCount) {
    unsigned int idx = blockIdx.x * aColEleCount + threadIdx.x;
    unsigned int stride = aColEleCount / 2;
    for (int i = 0; i < stride; i += blockDim.x)
    {
        if (threadIdx.x + i < stride)
        {
            // Apply the loop offset to the index as well, so columns longer
            // than 2 * blockDim.x are swapped completely.
            float temp = aData[idx + i];
            aData[idx + i] = aData[idx + i + stride];
            aData[idx + i + stride] = temp;
        }
    }
}

// Column-strided copy: each block copies aCopySize floats from its source
// column to its destination column.
__global__ void memcpyColKernel(float* aInputData, float* aOutputData, unsigned int aCopySize,
                                unsigned int aSrcColEleCount, unsigned int aDesColEleCount) {
    unsigned int idx = blockIdx.x * aSrcColEleCount + threadIdx.x;
    unsigned int idx2 = blockIdx.x * aDesColEleCount + threadIdx.x;
    for (int i = 0; i < aCopySize; i += blockDim.x)
    {
        if (threadIdx.x + i < aCopySize)
        {
            aOutputData[idx2 + i] = aInputData[idx + i];
        }
    }
}

void Aurora::fftshift(CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(0) % 2 == 0) {
        // Even column length: swapping the two halves is enough.
        fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
    } else {
        // Odd column length: rotate each column through a temporary buffer.
        int copySize = aMatrix.getDimSize(0) / 2 + 1;
        float *data = nullptr;
        // The buffer must hold one partial column per matrix column, sized in bytes.
        cudaMalloc((void **)&data, sizeof(float) * copySize * aMatrix.getValueType() * aMatrix.getDimSize(1));
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData(), data, copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(),
            copySize * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData() + copySize * aMatrix.getValueType(), aMatrix.getData(),
            (copySize - 1) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
            copySize * aMatrix.getValueType(),
            copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaFree(data);
    }
}

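// Shift sketch: per column, fftshift swaps halves for even lengths
// ([1 2 3 4] -> [3 4 1 2]) and rotates around the midpoint for odd lengths
// ([1 2 3 4 5] -> [4 5 1 2 3]); ifftshift below is its inverse
// ([1 2 3 4 5] -> [3 4 5 1 2]), so ifftshift(fftshift(x)) == x.
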
void Aurora::ifftshift(CudaMatrix &aMatrix) {
    if (aMatrix.getDimSize(0) % 2 == 0) {
        // Even column length: identical to fftshift.
        fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
    } else {
        // Odd column length: rotate each column the other way through a temporary buffer.
        int copySize = aMatrix.getDimSize(0) / 2 + 1;
        float *data = nullptr;
        // The buffer must hold one partial column per matrix column, sized in bytes.
        cudaMalloc((void **)&data, sizeof(float) * copySize * aMatrix.getValueType() * aMatrix.getDimSize(1));
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
            data, copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(),
            copySize * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            aMatrix.getData(), aMatrix.getData() + copySize * aMatrix.getValueType(),
            (copySize - 1) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
            data, aMatrix.getData(),
            copySize * aMatrix.getValueType(),
            copySize * aMatrix.getValueType(),
            aMatrix.getDimSize(0) * aMatrix.getValueType());
        cudaFree(data);
    }
}