commit dev-sun.
src/Aurora.cu (new file, 201 lines)
@@ -0,0 +1,201 @@
#include <cufft.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstring>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>

#include "Aurora.h"
#include "AuroraDefs.h"
#include "CudaMatrix.h"
#include <iostream>
#include "log/log.h"
#include <cuda_texture_types.h>

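// Hedged sketch, not part of the original file: the CUDA runtime calls below do not
// check their return codes. A small helper like this (the name checkCuda is an
// assumption) can surface errors during development.
static inline void checkCuda(cudaError_t aStatus, const char* aWhat)
{
    if (aStatus != cudaSuccess)
    {
        std::fprintf(stderr, "%s failed: %s\n", aWhat, cudaGetErrorString(aStatus));
    }
}
// Example: checkCuda(cudaDeviceSynchronize(), "doubleToComplexKernel");
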
// Copies a real-valued double buffer into a cufftDoubleComplex buffer (imaginary part 0).
__global__ void doubleToComplexKernel(const double* input, cufftDoubleComplex* output, int size)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        output[idx].x = input[idx];
        output[idx].y = 0;
    }
}

void Aurora::doubleToComplex(const double* input, cufftDoubleComplex* output, int size)
{
    int threadsPerBlock = 1024;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    doubleToComplexKernel<<<blocksPerGrid, threadsPerBlock>>>(input, output, size);
    cudaDeviceSynchronize(); // wait for the GPU to finish the operation
}

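// Hedged usage sketch, not part of the original file: once doubleToComplex has filled
// a cufftDoubleComplex buffer, a forward transform could be run with cuFFT roughly as
// below. The helper name forwardFftSketch and the in-place transform are assumptions.
static void forwardFftSketch(cufftDoubleComplex* deviceData, int size)
{
    cufftHandle plan;
    cufftPlan1d(&plan, size, CUFFT_Z2Z, 1);                    // 1-D double-complex plan
    cufftExecZ2Z(plan, deviceData, deviceData, CUFFT_FORWARD); // in-place forward FFT
    cudaDeviceSynchronize();
    cufftDestroy(plan);
}
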
// Grid-stride scan that keeps a per-thread running maximum. The result is not yet
// written back to aOutput, so the kernel is effectively a stub.
__global__ void maxKernel(const float* aInput, const float* aOutput, int aSize)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    float maxResult = aInput[0];
    while (index < aSize)
    {
        if (maxResult < aInput[index])
        {
            maxResult = aInput[index];
        }
        index += stride;
    }
}

void Aurora::max(const float* aInput, const float* aOutput, int aSize)
{
    int threadsPerBlock = 1024;
    int blocksPerGrid = 68;
    // Launch disabled until maxKernel writes its result to aOutput.
    //maxKernel<<<blocksPerGrid, threadsPerBlock>>>(aInput, aOutput, aSize);
    cudaDeviceSynchronize();
}

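#include <thrust/extrema.h>
#include <thrust/device_ptr.h>

// Hedged sketch, not part of the original file: a complete device-side maximum using
// Thrust, as an alternative to finishing maxKernel by hand. The helper name
// maxWithThrust is an assumption; aInput must be a device pointer.
static float maxWithThrust(const float* aInput, int aSize)
{
    thrust::device_ptr<const float> begin(aInput);
    // max_element reduces on the device; dereferencing the returned iterator copies
    // the single maximum value back to the host.
    return *thrust::max_element(thrust::device, begin, begin + aSize);
}
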
// Gathers the columns of aData whose indices are listed in aValid into aOutput
// (column-major, aOutputRowCount rows per column).
__global__ void validKernel(const float* aData, const float* aValid, float* aOutput, int aOutputRowCount, int aOutputColumnCount)
{
    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
    if (threadIndex < aOutputColumnCount)
    {
        // Read the source column index only after the bounds check.
        int dataIndex = (int)aValid[threadIndex];
        for (int i = 0; i < aOutputRowCount; ++i)
        {
            aOutput[threadIndex * aOutputRowCount + i] = aData[dataIndex * aOutputRowCount + i];
        }
    }
}

// __global__ void validSubKernel(const double* aValid, double* aOutput, unsigned int* aCount, int aValidSize)
// {
//     int index = blockIdx.x * blockDim.x + threadIdx.x;
//     if(index == 0)
//     {
//         for(int i=0;i<aValidSize;++i)
//         {
//             if(aValid[i] == 1)
//             {
//                 aOutput[*aCount] = i;
//                 ++(*aCount);
//             }
//         }
//     }
//     __syncthreads();
// }

// Selects the columns of aData that are flagged with 1 in aValid. The flag vector is
// scanned on the host, then the selected columns are gathered on the device.
Aurora::CudaMatrix Aurora::valid(const Aurora::CudaMatrix aData, const Aurora::CudaMatrix aValid)
{
    int validSize = aValid.getDataSize();
    int rowCount = aData.getDimSize(0);
    float* hostValid = new float[validSize];
    float* validProcessed = new float[validSize];
    float* validProcessedDevice = nullptr;
    cudaMemcpy(hostValid, aValid.getData(), sizeof(float) * validSize, cudaMemcpyDeviceToHost);
    // Collect the indices of the columns whose flag equals 1.
    int validColumnCount = 0;
    for (int i = 0; i < validSize; ++i)
    {
        if (hostValid[i] == 1)
        {
            validProcessed[validColumnCount] = i;
            ++validColumnCount;
        }
    }
    cudaMalloc((void**)&validProcessedDevice, sizeof(float) * validColumnCount);
    cudaMemcpy(validProcessedDevice, validProcessed, sizeof(float) * validColumnCount, cudaMemcpyHostToDevice);

    int threadPerBlock = 1024;
    int blockPerGrid = validColumnCount / threadPerBlock + 1;
    float* result = nullptr;
    cudaMalloc((void**)&result, sizeof(float) * validColumnCount * rowCount);
    validKernel<<<blockPerGrid, threadPerBlock>>>(aData.getData(), validProcessedDevice, result, rowCount, validColumnCount);
    cudaDeviceSynchronize();

    cudaFree(validProcessedDevice);
    delete[] hostValid;
    delete[] validProcessed;
    // Ownership of the device buffer "result" passes to the returned CudaMatrix.
    return Aurora::CudaMatrix::fromRawData(result, rowCount, validColumnCount);
}

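#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/counting_iterator.h>

// Hedged sketch, not part of the original file: the host round-trip above (copy flags to
// the host, scan them, copy the indices back) could be replaced by a device-side stream
// compaction with thrust::copy_if. The functor IsOne and the helper name
// compactValidIndices are assumptions; aIndicesOut must hold at least aValidSize floats.
struct IsOne
{
    __host__ __device__ bool operator()(float aFlag) const { return aFlag == 1.0f; }
};

// Writes the indices of the set flags (as floats, matching validKernel) into aIndicesOut
// and returns how many columns were selected.
static int compactValidIndices(const float* aValidFlags, float* aIndicesOut, int aValidSize)
{
    thrust::device_ptr<const float> flags(aValidFlags);
    thrust::device_ptr<float> out(aIndicesOut);
    thrust::counting_iterator<int> first(0);
    thrust::device_ptr<float> end =
        thrust::copy_if(thrust::device, first, first + aValidSize, flags, out, IsOne());
    return static_cast<int>(end - out);
}
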
// Legacy texture reference and its backing CUDA array, used by the test code below.
texture<float, cudaTextureType2D, cudaReadModeElementType> tex;
cudaArray* array;

__global__ void testKernel(float* aData, cudaTextureObject_t aTexObj, cudaSurfaceObject_t aSurface)
{
    float a = tex2D(tex, 5.5, 5.5);            // read through the legacy texture reference
    float b = tex2D<float>(aTexObj, 5.5, 5.5); // read through the texture object
    // Surfaces are read with surf2Dread, not tex2D; the x coordinate is in bytes.
    float2 c = surf2Dread<float2>(aSurface, 1 * sizeof(float2), 1);
    printf("%f\n", a);
    printf("%f\n", b);
    printf("%f\n", c.x);
    printf("%f\n", c.y);
}

__global__ void writeSurfaceKernel(cudaSurfaceObject_t aSurface)
{
    float2 value;
    value.x = 100;
    value.y = 99;
    // The x coordinate of surf2Dwrite is in bytes, so scale it by the element size.
    surf2Dwrite(value, aSurface, 1 * sizeof(float2), 1);
}

void subTest(cudaTextureObject_t& aTexture)
{
    cudaResourceDesc resourceDesc;
    cudaTextureDesc textureDesc;
    memset(&resourceDesc, 0, sizeof(resourceDesc));
    resourceDesc.resType = cudaResourceTypeArray;
    resourceDesc.res.array.array = array; // points to the device-side CUDA array

    // Fill in the texture description in textureDesc.
    memset(&textureDesc, 0, sizeof(textureDesc));
    textureDesc.addressMode[0] = cudaAddressModeClamp;
    textureDesc.addressMode[1] = cudaAddressModeClamp;
    textureDesc.filterMode = cudaFilterModeLinear;
    textureDesc.readMode = cudaReadModeElementType;
    textureDesc.normalizedCoords = false;
    //textureDesc.channelDesc = texChannelDescSpeedOfSoundField;
    cudaCreateTextureObject(&aTexture, &resourceDesc, &textureDesc, nullptr);
}

void Aurora::test(float* aData)
{
    // Describe the legacy texture reference.
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.addressMode[1] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = 0;
    cudaChannelFormatDesc texChannelDescSpeedOfSoundField = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    // cudaArraySurfaceLoadStore is required so a surface object can be created on the array.
    cudaMallocArray(&array, &texChannelDescSpeedOfSoundField, 10, 9, cudaArraySurfaceLoadStore);
    cudaMemcpyToArray(array, 0, 0, aData, 10 * 9 * sizeof(float), cudaMemcpyHostToDevice);
    cudaBindTextureToArray(&tex, array, &texChannelDescSpeedOfSoundField);

    cudaTextureObject_t textureObj;
    subTest(textureObj);

    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    // Create the surface object over the same array.
    resDesc.res.array.array = array;
    cudaSurfaceObject_t inputSurfObj = 0;
    cudaCreateSurfaceObject(&inputSurfObj, &resDesc);

    writeSurfaceKernel<<<1, 1>>>(inputSurfObj);
    cudaDeviceSynchronize();
    testKernel<<<1, 1>>>(aData, textureObj, inputSurfObj);

    cudaDeviceSynchronize();
    cudaUnbindTexture(&tex);
}

void Aurora::sort(const Aurora::Matrix& aMatrix)
{
    RECON_INFO("cuda start");
    // Sorts the matrix data in place in descending order. Note: the comparator type should
    // match the element type of Aurora::Matrix; thrust::greater<int> compares the values
    // after conversion to int.
    thrust::sort(thrust::device, aMatrix.getData(), aMatrix.getData() + aMatrix.getDataSize(), thrust::greater<int>());
    RECON_INFO("cuda end");
}
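
// Hedged sketch, not part of the original file: if the matrix elements are floats living in
// device memory, a descending sort with a comparator matching the element type would look
// like this. The helper name sortDescendingSketch and the float element type are assumptions.
#include <thrust/functional.h>
static void sortDescendingSketch(float* aDeviceData, int aSize)
{
    thrust::sort(thrust::device, aDeviceData, aDeviceData + aSize, thrust::greater<float>());
}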