feat: Move common function to Aurora library.
This commit is contained in:
110
src/Aurora.cu
110
src/Aurora.cu
@@ -1,110 +0,0 @@
|
||||
#include <iostream>
|
||||
|
||||
#include <cuda_texture_types.h>
|
||||
#include <cufft.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
#include <thrust/device_vector.h>
|
||||
#include <thrust/execution_policy.h>
|
||||
#include <thrust/sort.h>
|
||||
|
||||
#include "Aurora.h"
|
||||
#include "CudaMatrix.h"
|
||||
|
||||
#include "log/log.h"
|
||||
|
||||
/**
 * Widens an array of real doubles into complex values with a zero imaginary part.
 *
 * One thread handles one element, addressed by its flat 1D global thread index.
 *
 * @param input  array of at least `size` doubles, read by the kernel
 * @param output array of at least `size` complex values, written by the kernel
 * @param size   number of elements to convert
 */
__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size)
{
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have nothing to do.
    if (globalIdx >= size) {
        return;
    }

    cufftDoubleComplex& element = output[globalIdx];
    element.x = input[globalIdx];  // real part
    element.y = 0;                 // imaginary part
}
|
||||
|
||||
/**
 * Converts `size` real doubles into complex values (imaginary part 0) on the GPU.
 *
 * Launches doubleToComplexKernel with 1024 threads per block and blocks until
 * the device has finished. Both pointers are dereferenced by the kernel, so
 * they must refer to device-accessible memory.
 *
 * @param input  source array of at least `size` doubles
 * @param output destination array of at least `size` complex values
 * @param size   number of elements to convert; nothing is launched when <= 0
 */
void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size)
{
    // A grid with zero blocks is an invalid launch configuration, so an empty
    // (or negative) size must not reach the launch below.
    if (size <= 0) {
        return;
    }

    const int threadsPerBlock = 1024;
    // Ceil-division written so that it cannot overflow int for sizes near INT_MAX.
    const int blocksPerGrid = (size - 1) / threadsPerBlock + 1;

    doubleToComplexKernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, size);
    cudaDeviceSynchronize(); // wait for the GPU to finish the operation
}
|
||||
|
||||
/**
 * Scans aInput for its largest value, each thread visiting the elements
 * index, index + stride, index + 2 * stride, ... where stride is the total
 * number of launched threads.
 *
 * NOTE(review): the per-thread maximum is never combined across threads nor
 * stored anywhere - aOutput is a pointer to const and is not touched - so the
 * kernel currently has no observable result. It looks unfinished; confirm the
 * intended output contract before relying on it.
 *
 * @param aInput  array of at least aSize floats, read by the kernel
 * @param aOutput currently unused
 * @param aSize   number of elements in aInput
 */
__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize)
{
    (void)aOutput;

    // Nothing to scan; this also avoids reading aInput[0] from an empty buffer.
    if (aSize <= 0)
    {
        return;
    }

    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;

    float maxResult = aInput[0];
    for (; index < aSize; index += stride)
    {
        if (maxResult < aInput[index])
        {
            maxResult = aInput[index];
        }
    }

    // The result is intentionally left unused until an output path exists.
    (void)maxResult;
}
|
||||
|
||||
/**
 * Placeholder for a GPU maximum reduction.
 *
 * The kernel launch is still disabled, so the arguments are not used and the
 * only effect of this call is a device-wide synchronization.
 *
 * TODO: the disabled launch was configured as 68 blocks of 1024 threads:
 *       max<<<68, 1024>>>(aInput, aOutput, aSize);
 *
 * @param aInput  currently unused
 * @param aOutput currently unused
 * @param aSize   currently unused
 */
void Aurora::max(const float* aInput, const float* aOutput, int aSize)
{
    (void)aInput;
    (void)aOutput;
    (void)aSize;

    cudaDeviceSynchronize();
}
|
||||
|
||||
/**
 * Gathers selected columns of aData into aOutput.
 *
 * Thread t copies aOutputRowCount consecutive floats starting at
 * aData[aValid[t] * aOutputRowCount] to aOutput[t * aOutputRowCount].
 * One thread handles one output column, addressed by its flat 1D global
 * thread index.
 *
 * @param aData              source values, read at the offsets described above
 * @param aValid             aOutputColumnCount source column indices, stored as floats
 * @param aOutput            destination of aOutputRowCount * aOutputColumnCount floats
 * @param aOutputRowCount    number of floats copied per column
 * @param aOutputColumnCount number of columns to gather (length of aValid)
 */
__global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIndex >= aOutputColumnCount)
    {
        return;
    }

    // aValid holds only aOutputColumnCount entries, so it must not be read
    // before the bounds check above has passed.
    int dataIndex = (int)aValid[threadIndex];

    // 64-bit offsets: column index times row count can exceed the int range.
    const float* source = aData + (long long)dataIndex * aOutputRowCount;
    float* destination = aOutput + (long long)threadIndex * aOutputRowCount;

    for (int i = 0; i < aOutputRowCount; ++i)
    {
        destination[i] = source[i];
    }
}
|
||||
|
||||
/**
 * Builds a new matrix from the columns of aData that aValid marks with 1.
 *
 * aValid is copied to the host and scanned; the positions whose value equals
 * 1 become the list of source columns. That list is uploaded and validKernel
 * copies the matching columns, each aData.getDimSize(0) floats long, into a
 * freshly allocated device buffer.
 *
 * Column indices travel to the kernel as floats because validKernel takes a
 * float index array.
 *
 * @param aData  source matrix; getData() is passed to a kernel, so it must be
 *               device-accessible
 * @param aValid mask matrix on the device, one entry per column of aData
 *               (presumably - confirm against callers)
 * @return matrix created by CudaMatrix::fromRawData from the gathered device
 *         buffer with dimensions (row count, number of selected columns)
 */
Aurora::CudaMatrix Aurora::valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid)
{
    int validSize = aValid.getDataSize();
    int rowCount = aData.getDimSize(0);

    // Bring the mask to the host and collect the indices of selected columns.
    float* hostValid = new float[validSize];
    float* validProcessed = new float[validSize];
    cudaMemcpy(hostValid, aValid.getData(), sizeof(float) * validSize, cudaMemcpyDeviceToHost);

    int validColumnCount = 0;
    for (int i = 0; i < validSize; ++i)
    {
        if (hostValid[i] == 1)
        {
            validProcessed[validColumnCount] = static_cast<float>(i);
            ++validColumnCount;
        }
    }
    delete[] hostValid;

    // Upload the compacted index list.
    float* validProcessedDevice = nullptr;
    cudaMalloc((void**)&validProcessedDevice, sizeof(float) * validColumnCount);
    cudaMemcpy(validProcessedDevice, validProcessed, sizeof(float) * validColumnCount, cudaMemcpyHostToDevice);
    delete[] validProcessed;

    float* result = nullptr;
    cudaMalloc((void**)&result, sizeof(float) * validColumnCount * rowCount);

    // With no selected column there is nothing to copy, and launching over an
    // empty index buffer would only invite out-of-bounds reads.
    if (validColumnCount > 0)
    {
        const int threadPerBlock = 1024;
        // Exact ceil-division; avoids an extra idle block on multiples of 1024.
        const int blockPerGrid = (validColumnCount - 1) / threadPerBlock + 1;
        validKernel<<<blockPerGrid, threadPerBlock>>>(aData.getData(), validProcessedDevice, result, rowCount, validColumnCount);
    }
    cudaDeviceSynchronize();

    cudaFree(validProcessedDevice);
    return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount);
}
|
||||
|
||||
/**
 * Sorts the matrix data in place with thrust::sort, ordered by
 * thrust::greater<int>, and logs a message before and after the sort.
 *
 * The sort runs under the thrust::device execution policy, so getData()
 * presumably refers to device-accessible memory - TODO confirm.
 *
 * NOTE(review): the comparator is instantiated for int; if the matrix holds
 * floating-point values they are compared after conversion to int. Verify
 * that this is intended.
 *
 * @param aMatrix matrix whose data range [getData(), getData() + getDataSize())
 *                is sorted
 */
void Aurora::sort(const Aurora::Matrix& aMatrix)
{
    RECON_INFO("cuda start");

    auto first = aMatrix.getData();
    auto last = first + aMatrix.getDataSize();
    thrust::sort(thrust::device, first, last, thrust::greater<int>());

    RECON_INFO("cuda end");
}
|
||||
|
||||
|
||||
16
src/Aurora.h
16
src/Aurora.h
@@ -1,16 +0,0 @@
|
||||
#ifndef SUM_MATRIX_CU_H
|
||||
#define SUM_MATRIX_CU_H
|
||||
#include <cufft.h>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include "Matrix.h"
|
||||
// CUDA helper routines of the Aurora namespace (implemented in Aurora.cu).
namespace Aurora
|
||||
{
|
||||
// Writes input[i] to output[i].x and 0 to output[i].y for every i < size using
// a GPU kernel, then waits for the device. Both pointers are dereferenced by
// the kernel, so they must be device-accessible.
void doubleToComplex(const double* input, cufftDoubleComplex* output, int size);
|
||||
// Placeholder: the kernel launch in the implementation is commented out, so
// this currently only calls cudaDeviceSynchronize().
void max(const float* aInput, const float* aOutput, int aSize);
|
||||
// Returns a matrix holding the columns of aData whose entry in aValid equals 1;
// each column is aData.getDimSize(0) floats long.
Aurora::CudaMatrix valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid);
|
||||
// Sorts the data of aMatrix in place via thrust::sort with the thrust::device
// policy and a thrust::greater<int> comparator.
void sort(const Aurora::Matrix& aMatrix);
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user