commit dev-sun.
src/Aurora.cu (new file, 201 lines)
@@ -0,0 +1,201 @@
#include <cufft.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstring>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

#include "Aurora.h"
#include "AuroraDefs.h"
#include "CudaMatrix.h"
#include <iostream>
#include "log/log.h"
#include <cuda_texture_types.h>

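// Hedged sketch, not part of the original file: the CUDA runtime calls below do not
// check their return codes. A small helper like this (the name checkCuda is an
// assumption) can surface errors during development.
static inline void checkCuda(cudaError_t aStatus, const char* aWhat)
{
    if (aStatus != cudaSuccess)
    {
        std::fprintf(stderr, "%s failed: %s\n", aWhat, cudaGetErrorString(aStatus));
    }
}
// Example: checkCuda(cudaDeviceSynchronize(), "doubleToComplexKernel");
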
// Copies a real-valued double buffer into a cufftDoubleComplex buffer (imaginary part 0).
__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx].x = input[idx];
        output[idx].y = 0;
    }
}

void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size)
{
    int threadsPerBlock = 1024;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    doubleToComplexKernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, size);
    cudaDeviceSynchronize(); // wait for the GPU to finish the operation
}

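// Hedged usage sketch, not part of the original file: once doubleToComplex has filled
// a cufftDoubleComplex buffer, a forward transform could be run with cuFFT roughly as
// below. The helper name forwardFftSketch and the in-place transform are assumptions.
static void forwardFftSketch(cufftDoubleComplex* deviceData, int size)
{
    cufftHandle plan;
    cufftPlan1d(&plan, size, CUFFT_Z2Z, 1);                    // 1-D double-complex plan
    cufftExecZ2Z(plan, deviceData, deviceData, CUFFT_FORWARD); // in-place forward FFT
    cudaDeviceSynchronize();
    cufftDestroy(plan);
}
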
// Grid-stride scan that keeps a per-thread running maximum. The result is not yet
// written back to aOutput, so the kernel is effectively a stub.
__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    float maxResult = aInput[0];
    while (index < aSize)
    {
        if (maxResult < aInput[index])
        {
            maxResult = aInput[index];
        }
        index += stride;
    }
}

void Aurora::max(const float* aInput, const float* aOutput, int aSize)
{
    int threadsPerBlock = 1024;
    int blocksPerGrid = 68;
    // Launch disabled until maxKernel writes its result to aOutput.
    //maxKernel<<<blocksPerGrid, threadsPerBlock>>>(aInput, aOutput, aSize);
    cudaDeviceSynchronize();
}

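#include <thrust/extrema.h>
#include <thrust/device_ptr.h>

// Hedged sketch, not part of the original file: a complete device-side maximum using
// Thrust, as an alternative to finishing maxKernel by hand. The helper name
// maxWithThrust is an assumption; aInput must be a device pointer.
static float maxWithThrust(const float* aInput, int aSize)
{
    thrust::device_ptr<const float> begin(aInput);
    // max_element reduces on the device; dereferencing the returned iterator copies
    // the single maximum value back to the host.
    return *thrust::max_element(thrust::device, begin, begin + aSize);
}
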
// Gathers the columns of aData whose indices are listed in aValid into aOutput
// (column-major, aOutputRowCount rows per column).
__global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIndex < aOutputColumnCount)
    {
        // Read the source column index only after the bounds check.
        int dataIndex = (int)aValid[threadIndex];
        for (int i = 0; i < aOutputRowCount; ++i)
        {
            aOutput[threadIndex * aOutputRowCount + i] = aData[dataIndex * aOutputRowCount + i];
        }
    }
}

// __global__ void validSubKernel(const double* aValid, double* aOutput, unsigned int* aCount, int aValidSize)
// {
//     int index = blockIdx.x * blockDim.x + threadIdx.x;
//     if(index == 0)
//     {
//         for(int i=0;i<aValidSize;++i)
//         {
//             if(aValid[i] == 1)
//             {
//                 aOutput[*aCount] = i;
//                 ++(*aCount);
//             }
//         }
//     }
//     __syncthreads();
// }

// Selects the columns of aData that are flagged with 1 in aValid. The flag vector is
// scanned on the host, then the selected columns are gathered on the device.
Aurora::CudaMatrix Aurora::valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid)
{
    int validSize = aValid.getDataSize();
    int rowCount = aData.getDimSize(0);
    float* hostValid = new float[validSize];
    float* validProcessed = new float[validSize];
    float* validProcessedDevice = nullptr;
    cudaMemcpy(hostValid, aValid.getData(), sizeof(float) * validSize, cudaMemcpyDeviceToHost);
    // Collect the indices of the columns whose flag equals 1.
    int validColumnCount = 0;
    for (int i = 0; i < validSize; ++i)
    {
        if (hostValid[i] == 1)
        {
            validProcessed[validColumnCount] = i;
            ++validColumnCount;
        }
    }
    cudaMalloc((void**)&validProcessedDevice, sizeof(float) * validColumnCount);
    cudaMemcpy(validProcessedDevice, validProcessed, sizeof(float) * validColumnCount, cudaMemcpyHostToDevice);

    int threadPerBlock = 1024;
    int blockPerGrid = validColumnCount / threadPerBlock + 1;
    float* result = nullptr;
    cudaMalloc((void**)&result, sizeof(float) * validColumnCount * rowCount);
    validKernel<<<blockPerGrid, threadPerBlock>>>(aData.getData(), validProcessedDevice, result, rowCount, validColumnCount);
    cudaDeviceSynchronize();

    cudaFree(validProcessedDevice);
    delete[] hostValid;
    delete[] validProcessed;
    // Ownership of the device buffer "result" passes to the returned CudaMatrix.
    return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount);
}

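#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>

// Hedged sketch, not part of the original file: the host round-trip above (copy flags to
// the host, scan them, copy the indices back) could be replaced by a device-side stream
// compaction with thrust::copy_if. The functor IsOne and the helper name
// compactValidIndices are assumptions; aIndicesOut must hold at least aValidSize floats.
struct IsOne
{
    __host__ __device__ bool operator()(float aFlag) const { return aFlag == 1.0f; }
};

// Writes the indices of the set flags (as floats, matching validKernel) into aIndicesOut
// and returns how many columns were selected.
static int compactValidIndices(const float* aValidFlags, float* aIndicesOut, int aValidSize)
{
    thrust::device_ptr<const float> flags(aValidFlags);
    thrust::device_ptr<float> out(aIndicesOut);
    thrust::counting_iterator<int> first(0);
    thrust::device_ptr<float> end =
        thrust::copy_if(thrust::device, first, first + aValidSize, flags, out, IsOne());
    return static_cast<int>(end - out);
}
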
// Legacy texture reference and its backing CUDA array, used by the test code below.
texture<float, cudaTextureType2D, cudaReadModeElementType> tex;
cudaArray* array;

__global__ void testKernel(float* aData, cudaTextureObject_t aTexObj, cudaSurfaceObject_t aSurface)
{
    float a = tex2D(tex, 5.5, 5.5);            // read through the legacy texture reference
    float b = tex2D<float>(aTexObj, 5.5, 5.5); // read through the texture object
    // Surfaces are read with surf2Dread, not tex2D; the x coordinate is in bytes.
    float2 c = surf2Dread<float2>(aSurface, 1 * sizeof(float2), 1);
    printf("%f\n", a);
    printf("%f\n", b);
    printf("%f\n", c.x);
    printf("%f\n", c.y);
}

__global__ void writeSurfaceKernel(cudaSurfaceObject_t aSurface)
{
    float2 value;
    value.x = 100;
    value.y = 99;
    // The x coordinate of surf2Dwrite is in bytes, so scale it by the element size.
    surf2Dwrite(value, aSurface, 1 * sizeof(float2), 1);
}

void subTest(cudaTextureObject_t& aTexture)
{
    cudaResourceDesc resourceDesc;
    cudaTextureDesc textureDesc;
    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array; // points to the device-side CUDA array

    // Fill in the texture description in textureDesc.
    memset(&textureDesc, 0, sizeof(textureDesc));
    textureDesc.addressMode[0] = cudaAddressModeClamp;
    textureDesc.addressMode[1] = cudaAddressModeClamp;
    textureDesc.filterMode = cudaFilterModeLinear;
    textureDesc.readMode = cudaReadModeElementType;
    textureDesc.normalizedCoords = false;
    //textureDesc.channelDesc = texChannelDescSpeedOfSoundField;
    cudaCreateTextureObject(&aTexture, &resourceDesc, &textureDesc, nullptr);
}

void Aurora::test(float* aData)
{
    // Describe the legacy texture reference.
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = 0;
    cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    // cudaArraySurfaceLoadStore is required so a surface object can be created on the array.
    cudaMallocArray(&array, &texChannelDescSpeedOfSoundField, 10, 9, cudaArraySurfaceLoadStore);
    cudaMemcpyToArray(array, 0, 0, aData, 10 * 9 * sizeof(float), cudaMemcpyHostToDevice);
    cudaBindTextureToArray(&tex, array, &texChannelDescSpeedOfSoundField);

    cudaTextureObject_t textureObj;
    subTest(textureObj);

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    // Create the surface object over the same array.
    resDesc.res.array.array = array;
    cudaSurfaceObject_t inputSurfObj = 0;
    cudaCreateSurfaceObject(&inputSurfObj, &resDesc);

    writeSurfaceKernel<<<1, 1>>>(inputSurfObj);
    cudaDeviceSynchronize();
    testKernel<<<1, 1>>>(aData, textureObj, inputSurfObj);

    cudaDeviceSynchronize();
    cudaUnbindTexture(&tex);
}

void Aurora::sort(const Aurora::Matrix& aMatrix)
{
    RECON_INFO("cuda start");
    // Sorts the matrix data in place in descending order. Note: the comparator type should
    // match the element type of Aurora::Matrix; thrust::greater<int> compares the values
    // after conversion to int.
    thrust::sort(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), thrust::greater<int>());
    RECON_INFO("cuda end");
}
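
// Hedged sketch, not part of the original file: if the matrix elements are floats living in
// device memory, a descending sort with a comparator matching the element type would look
// like this. The helper name sortDescendingSketch and the float element type are assumptions.
#include <thrust/functional.h>
static void sortDescendingSketch(float* aDeviceData, int aSize)
{
    thrust::sort(thrust::device, aDeviceData, aDeviceData + aSize, thrust::greater<float>());
}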