Add cudaDeviceSynchronize() after some kernel launches

This commit is contained in:
kradchen
2023-12-22 11:34:20 +08:00
parent a89637fc44
commit ca5bb1d082

View File

@@ -703,6 +703,7 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol);
auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex);
sumZAllColKernel<<<fakeCol,256>>>(matData,retData, aMatrix.getDataSize());
cudaDeviceSynchronize();
float* result_data = nullptr;
cudaMalloc((void**)&result_data, sizeof(float)*2);
auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex);
@@ -1349,12 +1350,13 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
float* data = nullptr;
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
if (aMatrix.isComplex()){
if (aMatrix.isComplex()){
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
}
else{
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
}
}
cudaDeviceSynchronize();
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,0);
return ret;
@@ -1373,6 +1375,7 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
cudaDeviceSynchronize();
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
ExecFFT(ret,1);
float colEleCountf = 1.f/ColEleCount;
@@ -1414,6 +1417,7 @@ void Aurora::fftshift(CudaMatrix &aMatrix){
if (aMatrix.getDimSize(0) % 2 == 0) {
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaDeviceSynchronize();
} else {
int copySize = aMatrix.getDimSize(0) / 2 + 1;
float *data = nullptr;
@@ -1422,11 +1426,13 @@ void Aurora::fftshift(CudaMatrix &aMatrix){
aMatrix.getData(), data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
cudaDeviceSynchronize();
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(),
(copySize - 1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaDeviceSynchronize();
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
copySize * aMatrix.getValueType(),
@@ -1449,11 +1455,13 @@ void Aurora::ifftshift(CudaMatrix &aMatrix){
data, copySize * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
copySize * aMatrix.getValueType());
cudaDeviceSynchronize();
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(),
(copySize-1) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType(),
aMatrix.getDimSize(0) * aMatrix.getValueType());
cudaDeviceSynchronize();
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
data, aMatrix.getData(),
copySize * aMatrix.getValueType(),