diff --git a/src/CUDA/ddot.jl b/src/CUDA/ddot.jl index 0a7740b..638ca24 100644 --- a/src/CUDA/ddot.jl +++ b/src/CUDA/ddot.jl @@ -9,5 +9,6 @@ @inbounds for i=1:n v += dx[i]*dy[i] end + CUDA.sync_threads() return v end diff --git a/src/CUDA/dspcg.jl b/src/CUDA/dspcg.jl index 4895d72..605d48b 100644 --- a/src/CUDA/dspcg.jl +++ b/src/CUDA/dspcg.jl @@ -51,6 +51,7 @@ end end end + CUDA.sync_threads() nfree = CUDA.shfl_sync(0xffffffff, nfree, 1) # Exit if there are no free constraints.