feat: Move common function to Aurora library.

This commit is contained in:
sunwen
2024-12-24 10:43:26 +08:00
parent 8032cb5c3f
commit b170022b21
2 changed files with 0 additions and 126 deletions

View File

@@ -1,110 +0,0 @@
#include <cstdio>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>
#include <cuda_texture_types.h>
#include <cufft.h>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

#include "Aurora.h"
#include "CudaMatrix.h"
#include "log/log.h"
/**
 * Widens an array of real values into an array of complex values.
 *
 * Each thread handles at most one element: the real part of the output
 * element is copied from the input and the imaginary part is set to zero.
 * Threads whose global index is at or beyond `size` do nothing. Only the x
 * components of the block/thread indices are used.
 *
 * @param input  values to convert (read-only); must hold at least `size` elements
 * @param output destination complex values; must hold at least `size` elements
 * @param size   number of elements to convert
 */
__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size)
{
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    // Surplus threads in the last block have no element to process.
    if (element >= size)
    {
        return;
    }
    cufftDoubleComplex& target = output[element];
    target.x = input[element];
    target.y = 0;
}
/**
 * Converts `size` doubles into complex values with a zero imaginary part by
 * launching doubleToComplexKernel, then blocks until the device is idle.
 *
 * Both pointers are handed to the kernel unchanged, so they must be
 * dereferenceable from device code.
 *
 * Failures are reported on stderr; the function itself has no error return.
 *
 * @param input  source values, at least `size` elements
 * @param output destination complex values, at least `size` elements
 * @param size   number of elements to convert; values <= 0 launch nothing
 */
void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size)
{
    const int threadsPerBlock = 1024;
    cudaError_t launchStatus = cudaSuccess;
    // A grid with zero blocks is an invalid launch configuration, so the
    // kernel is only launched when there is at least one element.
    if (size > 0)
    {
        // Ceiling division written so it cannot overflow int for large sizes.
        const int blocksPerGrid = size / threadsPerBlock + (size % threadsPerBlock != 0 ? 1 : 0);
        doubleToComplexKernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, size);
        // Launch-configuration errors are only visible through cudaGetLastError().
        launchStatus = cudaGetLastError();
    }
    // Wait for the GPU to finish; this also surfaces errors raised while the kernel ran.
    const cudaError_t syncStatus = cudaDeviceSynchronize();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "Aurora::doubleToComplex: kernel launch failed: %s\n", cudaGetErrorString(launchStatus));
    }
    if (syncStatus != cudaSuccess)
    {
        fprintf(stderr, "Aurora::doubleToComplex: device synchronization failed: %s\n", cudaGetErrorString(syncStatus));
    }
}
/**
 * Scans aInput with a grid-stride loop and tracks, per thread, the largest
 * value that thread has seen (seeded with aInput[0]).
 *
 * NOTE(review): the per-thread maximum is never combined across threads and
 * never stored. aOutput is declared pointer-to-const and is not touched, so
 * this kernel currently has no observable effect. Producing a result needs a
 * writable output pointer plus a reduction step — confirm the intended
 * design before relying on this kernel.
 *
 * @param aInput  values to scan (read-only), at least aSize elements
 * @param aOutput unused
 * @param aSize   number of elements in aInput
 */
__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize)
{
    // Empty input: return before the seed read of aInput[0], which would be
    // out of bounds.
    if (aSize <= 0)
    {
        return;
    }
    const int stride = gridDim.x * blockDim.x;
    float maxResult = aInput[0];
    for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < aSize; index += stride)
    {
        if (maxResult < aInput[index])
        {
            maxResult = aInput[index];
        }
    }
    // The value is intentionally unused for now; see the review note above.
    (void)maxResult;
}
/**
 * Placeholder for a device-side maximum search.
 *
 * No kernel is launched: the launch in the original code was commented out
 * (and named `max` rather than `maxKernel`). The only effect of calling this
 * function is a device-wide synchronization; none of the arguments is read.
 *
 * TODO: implement the reduction. aOutput is pointer-to-const, so the
 * signature has to change before a result can be written through it.
 *
 * @param aInput  unused
 * @param aOutput unused
 * @param aSize   unused
 */
void Aurora::max(const float* aInput, const float* aOutput, int aSize)
{
    (void)aInput;
    (void)aOutput;
    (void)aSize;
    const cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess)
    {
        fprintf(stderr, "Aurora::max: device synchronization failed: %s\n", cudaGetErrorString(status));
    }
}
/**
 * Gathers selected columns of aData into aOutput.
 *
 * Thread t (global index, x dimension only) copies aOutputRowCount
 * consecutive elements starting at aData[c * aOutputRowCount], where
 * c = (int)aValid[t], to aOutput starting at t * aOutputRowCount. Threads
 * with t >= aOutputColumnCount do nothing.
 *
 * @param aData              source elements (read-only)
 * @param aValid             source column index for each output column, stored
 *                           as float; at least aOutputColumnCount entries
 * @param aOutput            destination; at least
 *                           aOutputRowCount * aOutputColumnCount elements
 * @param aOutputRowCount    number of elements copied per column
 * @param aOutputColumnCount number of columns to gather
 */
__global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount)
{
    const int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    // The bounds check comes before any read of aValid: surplus threads in the
    // last block would otherwise index past the end of the index array.
    if (threadIndex >= aOutputColumnCount)
    {
        return;
    }
    const int dataIndex = (int)aValid[threadIndex];
    // 64-bit offsets so column * rowCount cannot overflow int on large matrices.
    const long long sourceBase = static_cast<long long>(dataIndex) * aOutputRowCount;
    const long long targetBase = static_cast<long long>(threadIndex) * aOutputRowCount;
    for (int i = 0; i < aOutputRowCount; ++i)
    {
        aOutput[targetBase + i] = aData[sourceBase + i];
    }
}
// Reports a failed CUDA call made by Aurora::valid on stderr.
// Returns true when aStatus is an error, false on cudaSuccess.
static bool validStepFailed(cudaError_t aStatus, const char* aStep)
{
    if (aStatus == cudaSuccess)
    {
        return false;
    }
    fprintf(stderr, "Aurora::valid: %s failed: %s\n", aStep, cudaGetErrorString(aStatus));
    return true;
}

/**
 * Builds a new matrix from the columns of aData whose flag in aValid equals 1.
 *
 * The flags are copied to the host, the indices of the flagged entries are
 * collected in order, and validKernel copies the corresponding columns
 * (aData.getDimSize(0) elements each) into a freshly allocated device buffer.
 *
 * @param aData  source matrix; its data pointer is passed to the kernel
 * @param aValid flags, one per column of aData; entries equal to 1 select a column
 * @return matrix created with CudaMatrix::fromRawData from the gathered buffer,
 *         with getDimSize(0) rows and one column per selected entry. If a CUDA
 *         call fails, the failure is logged to stderr, all device buffers
 *         allocated here are released and a matrix with a null buffer and zero
 *         columns is returned.
 *
 * NOTE(review): presumably fromRawData takes ownership of the device buffer,
 * since it is never freed here — confirm. Column indices travel as float, so
 * indices above 2^24 are not exactly representable.
 */
Aurora::CudaMatrix Aurora::valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid)
{
    const int validSize = aValid.getDataSize();
    const int rowCount = aData.getDimSize(0);
    float* result = nullptr;

    // Bring the flags to the host so the selected column indices can be compacted.
    std::vector<float> hostValid(validSize > 0 ? static_cast<size_t>(validSize) : 0);
    if (!hostValid.empty() &&
        validStepFailed(cudaMemcpy(hostValid.data(), aValid.getData(), sizeof(float) * hostValid.size(), cudaMemcpyDeviceToHost),
                        "copying flags to host"))
    {
        return Aurora::CudaMatrix::fromRawData(result, rowCount, 0);
    }

    std::vector<float> selectedColumns;
    selectedColumns.reserve(hostValid.size());
    for (size_t i = 0; i < hostValid.size(); ++i)
    {
        if (hostValid[i] == 1)
        {
            selectedColumns.push_back(static_cast<float>(i));
        }
    }
    const int validColumnCount = static_cast<int>(selectedColumns.size());

    // With nothing to gather there is no reason to allocate or launch; the
    // kernel would only see an empty index buffer.
    if (validColumnCount > 0 && rowCount > 0)
    {
        float* selectedColumnsDevice = nullptr;
        const size_t indexBytes = sizeof(float) * selectedColumns.size();
        const size_t resultBytes = indexBytes * static_cast<size_t>(rowCount);

        bool failed =
            validStepFailed(cudaMalloc((void**)&selectedColumnsDevice, indexBytes), "allocating column indices") ||
            validStepFailed(cudaMalloc((void**)&result, resultBytes), "allocating result buffer") ||
            validStepFailed(cudaMemcpy(selectedColumnsDevice, selectedColumns.data(), indexBytes, cudaMemcpyHostToDevice),
                            "copying column indices to device");

        if (!failed)
        {
            const int threadPerBlock = 1024;
            // Ceiling division: exactly enough blocks for one thread per selected column.
            const int blockPerGrid = (validColumnCount + threadPerBlock - 1) / threadPerBlock;
            validKernel<<<blockPerGrid, threadPerBlock>>>(aData.getData(), selectedColumnsDevice, result, rowCount, validColumnCount);
            // Launch errors show up in cudaGetLastError(), execution errors at the sync.
            failed = validStepFailed(cudaGetLastError(), "kernel launch");
            failed = validStepFailed(cudaDeviceSynchronize(), "kernel execution") || failed;
        }

        cudaFree(selectedColumnsDevice);
        if (failed)
        {
            // Do not hand out a buffer that may be unallocated or partially written.
            cudaFree(result);
            result = nullptr;
            return Aurora::CudaMatrix::fromRawData(result, rowCount, 0);
        }
    }
    return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount);
}
// Sorts [aFirst, aLast) in descending order with the Thrust device policy.
// The comparator type is deduced from the element type so values are compared
// as stored instead of being converted to int first.
template <typename T>
static void sortDescendingOnDevice(T* aFirst, T* aLast)
{
    thrust::sort(thrust::device, aFirst, aLast, thrust::greater<T>());
}

/**
 * Sorts the data of aMatrix in place, largest value first, and logs the start
 * and end of the operation.
 *
 * The previous comparator was thrust::greater<int>, which converts each
 * element to int before comparing; for non-integer element types that orders
 * values only by their truncated integer part.
 *
 * NOTE(review): the sort runs under thrust::device, so getData() must return
 * memory that device code can access — confirm this holds for Aurora::Matrix.
 * The matrix is taken by const reference yet its data is reordered; verify
 * that callers expect this.
 *
 * @param aMatrix matrix whose underlying data is sorted
 */
void Aurora::sort(const Aurora::Matrix& aMatrix)
{
    RECON_INFO("cuda start");
    sortDescendingOnDevice(aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize());
    RECON_INFO("cuda end");
}

View File

@@ -1,16 +0,0 @@
#ifndef SUM_MATRIX_CU_H
#define SUM_MATRIX_CU_H
#include <cufft.h>
#include <cuda_runtime.h>
#include "Matrix.h"
// Host-side entry points for the CUDA helpers of the Aurora library.
// NOTE(review): CudaMatrix is used below but only "Matrix.h" is included here;
// presumably it is declared there or by an earlier include — verify.
namespace Aurora
{
// Converts `size` doubles at `input` into complex values at `output`; the
// imaginary part of every output element is set to zero.
void doubleToComplex(const double* input, cufftDoubleComplex* output, int size);
// NOTE(review): aOutput is pointer-to-const, so no result can be written
// through this signature — confirm the intended contract.
void max(const float* aInput, const float* aOutput, int aSize);
// Returns a matrix made of the columns of aData whose entry in aValid equals 1.
Aurora::CudaMatrix valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid);
// Sorts the data of aMatrix in place using a "greater" comparison (largest first).
void sort(const Aurora::Matrix& aMatrix);
}
#endif