Add cudaDeviceSynchronize() after kernel launches in Aurora::sum, fft, ifft, fftshift, and ifftshift
This commit is contained in:
@@ -703,6 +703,7 @@ CudaMatrix Aurora::sum(const CudaMatrix &aMatrix, FunctionDirection direction ){
|
||||
cudaMalloc((void**)&retData, sizeof(float)*2*fakeCol);
|
||||
auto ret = CudaMatrix::fromRawData(retData,1,fakeCol,1,Complex);
|
||||
sumZAllColKernel<<<fakeCol,256>>>(matData,retData, aMatrix.getDataSize());
|
||||
cudaDeviceSynchronize();
|
||||
float* result_data = nullptr;
|
||||
cudaMalloc((void**)&result_data, sizeof(float)*2);
|
||||
auto ret2 = CudaMatrix::fromRawData(result_data,1,1,1,Complex);
|
||||
@@ -1349,12 +1350,13 @@ CudaMatrix Aurora::fft(const CudaMatrix &aMatrix, long aFFTSize){
|
||||
float* data = nullptr;
|
||||
|
||||
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
|
||||
if (aMatrix.isComplex()){
|
||||
if (aMatrix.isComplex()){
|
||||
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
|
||||
}
|
||||
else{
|
||||
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
|
||||
}
|
||||
complexFillKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
|
||||
}
|
||||
cudaDeviceSynchronize();
|
||||
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
|
||||
ExecFFT(ret,0);
|
||||
return ret;
|
||||
@@ -1373,6 +1375,7 @@ CudaMatrix Aurora::ifft(const CudaMatrix &aMatrix, long aFFTSize){
|
||||
cudaMalloc((void**)&data, sizeof(float)*2*bufferSize);
|
||||
|
||||
complexCopyKernel<<<aMatrix.getDimSize(1), 256>>>(aMatrix.getData(), data, needCopySize, aMatrix.getDimSize(0),ColEleCount);
|
||||
cudaDeviceSynchronize();
|
||||
auto ret = Aurora::CudaMatrix::fromRawData(data,ColEleCount,aMatrix.getDimSize(1),1,Complex);
|
||||
ExecFFT(ret,1);
|
||||
float colEleCountf = 1.f/ColEleCount;
|
||||
@@ -1414,6 +1417,7 @@ void Aurora::fftshift(CudaMatrix &aMatrix){
|
||||
if (aMatrix.getDimSize(0) % 2 == 0) {
|
||||
fftshiftSwapKernel<<<aMatrix.getDimSize(1), 256>>>(
|
||||
aMatrix.getData(), aMatrix.getDimSize(0) * aMatrix.getValueType());
|
||||
cudaDeviceSynchronize();
|
||||
} else {
|
||||
int copySize = aMatrix.getDimSize(0) / 2 + 1;
|
||||
float *data = nullptr;
|
||||
@@ -1422,11 +1426,13 @@ void Aurora::fftshift(CudaMatrix &aMatrix){
|
||||
aMatrix.getData(), data, copySize * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType(),
|
||||
copySize * aMatrix.getValueType());
|
||||
cudaDeviceSynchronize();
|
||||
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
|
||||
aMatrix.getData() + copySize* aMatrix.getValueType(), aMatrix.getData(),
|
||||
(copySize - 1) * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType());
|
||||
cudaDeviceSynchronize();
|
||||
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
|
||||
data, aMatrix.getData() + (copySize - 1) * aMatrix.getValueType(),
|
||||
copySize * aMatrix.getValueType(),
|
||||
@@ -1449,11 +1455,13 @@ void Aurora::ifftshift(CudaMatrix &aMatrix){
|
||||
data, copySize * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType(),
|
||||
copySize * aMatrix.getValueType());
|
||||
cudaDeviceSynchronize();
|
||||
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
|
||||
aMatrix.getData(), aMatrix.getData() + (copySize) * aMatrix.getValueType(),
|
||||
(copySize-1) * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType(),
|
||||
aMatrix.getDimSize(0) * aMatrix.getValueType());
|
||||
cudaDeviceSynchronize();
|
||||
memcpyColKernel<<<aMatrix.getDimSize(1), 256>>>(
|
||||
data, aMatrix.getData(),
|
||||
copySize * aMatrix.getValueType(),
|
||||
|
||||
Reference in New Issue
Block a user