diff --git a/src/Function2D.cu b/src/Function2D.cu index 2d64f99..ddf01c6 100644 --- a/src/Function2D.cu +++ b/src/Function2D.cu @@ -703,6 +703,7 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){ cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol); auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex); sumZAllColKernel<<>>(matData,retData, aMatrix.getDataSize()); + cudaDeviceSynchronize(); float* result_data = nullptr; cudaMalloc((void**)&result_data, sizeof(float)*2); auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex); @@ -1349,12 +1350,13 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){ float* data = nullptr; cudaMalloc((void**)&data, sizeof(float)*2*bufferSize); -if (aMatrix.isComplex()){ + if (aMatrix.isComplex()){ complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); } else{ - complexFillKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); -} + complexFillKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + } + cudaDeviceSynchronize(); auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); ExecFFT(ret,0); return ret; @@ -1373,6 +1375,7 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){ cudaMalloc((void**)&data, sizeof(float)*2*bufferSize); complexCopyKernel<<>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount); + cudaDeviceSynchronize(); auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex); ExecFFT(ret,1); float colEleCountf = 1.f/ColEleCount; @@ -1414,6 +1417,7 @@ void Aurora::fftshift(CudaMatrix &aMatrix){ if (aMatrix.getDimSize(0) % 2 == 0) { fftshiftSwapKernel<<>>( aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType()); + cudaDeviceSynchronize(); } else { int copySize = aMatrix.getDimSize(0) / 2 + 1; float *data = nullptr; @@ -1422,11 +1426,13 @@ void Aurora::fftshift(CudaMatrix &aMatrix){ aMatrix.getData(), data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); + cudaDeviceSynchronize(); memcpyColKernel<<>>( aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(), (copySize - 1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); + cudaDeviceSynchronize(); memcpyColKernel<<>>( data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(), copySize * aMatrix.getValueType(), @@ -1449,11 +1455,13 @@ void Aurora::ifftshift(CudaMatrix &aMatrix){ data, copySize * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), copySize * aMatrix.getValueType()); + cudaDeviceSynchronize(); memcpyColKernel<<>>( aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(), (copySize-1) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType(), aMatrix.getDimSize(0) * aMatrix.getValueType()); + cudaDeviceSynchronize(); memcpyColKernel<<>>( data, aMatrix.getData(), copySize * aMatrix.getValueType(),