#include "/usr/local/cuda-10.1/targets/x86_64-linux/include/cufft.h" #include #include #include #include #include #include "Aurora.h" #include "AuroraDefs.h" #include "CudaMatrix.h" #include #include "log/log.h" #include __global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { output[idx].x = input[idx]; output[idx].y = 0; } } void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size) { int threadsPerBlock = 1024; int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; doubleToComplexKernel<<>>(input, output, size); cudaDeviceSynchronize(); // 等待GPU完成操作 } __global__ void maxKernel(const float* aInput, const float* aOutput, int aSize) { int index = blockIdx.x * blockDim.x + threadIdx.x; int stride = gridDim.x*blockDim.x; float maxResult = aInput[0]; while (index < aSize) { if(maxResult < aInput[index]) { maxResult = aInput[index]; } index += stride; } } void Aurora::max(const float* aInput, const float* aOutput, int aSize) { int threadsPerBlock = 1024; int blocksPerGrid = 68; //max<<>>(aInput, aOutput, aSize); cudaDeviceSynchronize(); } __global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount) { int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; int dataIndex = (int)aValid[threadIndex]; if(threadIndex < aOutputColumnCount) { for(int i=0; i < aOutputRowCount; ++i) { aOutput[threadIndex * aOutputRowCount + i] = aData[dataIndex * aOutputRowCount + i]; } } } // __global__ void validSubKernel(const double* aValid, double* aOutput, unsigned int* aCount, int aValidSize) // { // int index = blockIdx.x * blockDim.x + threadIdx.x; // if(index == 0) // { // for(int i=0;i>>(aData.getData(), validProcessedDevice, result, rowCount, validColumnCount); cudaDeviceSynchronize(); cudaFree(validProcessedDevice); delete[] hostValid; delete[] validProcessed; return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount); } texture tex; cudaArray* array; __global__ void testKernel(float* aData,cudaTextureObject_t aTexObj, cudaSurfaceObject_t aSurface) { float a = tex2D(tex,5.5,5.5); float b = tex2D(aTexObj,5.5,5.5); float2 c = tex2D(aSurface,1,1); printf("%f\n",a); printf("%f\n",b); printf("%f\n",c.x); printf("%f\n",c.y); } __global__ void writeSurfaceKernel( cudaSurfaceObject_t aSurface) { float2 value; value.x = 100; value.y = 99; surf2Dwrite(value, aSurface, 1, 1 ); } void subTest(cudaTextureObject_t& aTexture) { cudaResourceDesc resourceDesc; cudaTextureDesc textureDesc; memset(&resourceDesc, 0, sizeof(resourceDesc)); resourceDesc.resType = cudaResourceTypeArray; resourceDesc.res.array.array = array; // 指向设备端的 CUDA 数组 // 在 textureDesc 中设置纹理描述 memset(&textureDesc, 0, sizeof(textureDesc)); textureDesc.addressMode[0] = cudaAddressModeClamp; textureDesc.addressMode[1] = cudaAddressModeClamp; textureDesc.filterMode = cudaFilterModeLinear; textureDesc.readMode = cudaReadModeElementType; textureDesc.normalizedCoords = false; //textureDesc.channelDesc = texChannelDescSpeedOfSoundField; cudaCreateTextureObject(&aTexture, &resourceDesc, &textureDesc, nullptr); } void Aurora::test(float* aData) { tex.addressMode[0] = cudaAddressModeClamp; // Texturreferenz beschreiben tex.addressMode[1] = cudaAddressModeClamp; tex.filterMode = cudaFilterModeLinear; tex.normalized = 0; cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); cudaMallocArray(&array, &texChannelDescSpeedOfSoundField, 10, 9); cudaMemcpyToArray(array, 0, 0, aData,10 * 9 *sizeof(float), cudaMemcpyHostToDevice); cudaBindTextureToArray ( &tex, array, &texChannelDescSpeedOfSoundField ); cudaTextureObject_t textureObj; subTest(textureObj); struct cudaResourceDesc resDesc; memset(&resDesc, 0, sizeof(resDesc)); resDesc.resType = cudaResourceTypeArray; // Create the surface objects resDesc.res.array.array = array; cudaSurfaceObject_t inputSurfObj = 0; cudaCreateSurfaceObject(&inputSurfObj, &resDesc); writeSurfaceKernel<<<1,1>>>(inputSurfObj); cudaDeviceSynchronize(); testKernel<<<1, 1>>>(aData,textureObj, inputSurfObj); cudaDeviceSynchronize(); cudaUnbindTexture(&tex); } void Aurora::sort(const Aurora::Matrix& aMatrix) { RECON_INFO("cuda start"); thrust::sort(thrust::device, aMatrix.getData(), aMatrix.getData()+aMatrix.getDataSize(), thrust::greater()); RECON_INFO("cuda end"); }