codeplaysoftware · JackAKirk · Sep 10, 2024
diff --git a/src/MPI_with_SYCL/scatter_reduce_gather.cpp b/src/MPI_with_SYCL/scatter_reduce_gather.cpp
@@ -44,8 +44,16 @@ int main(int argc, char *argv[]) {
   /* -------------------------------------------------------------------------------------------
     SYCL Initialization, which internally sets the device.
   --------------------------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the `sycl::queue` below uses a single GPU shared between
+  // the two ranks. To use a separate GPU for each rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually instantiate a
+  // `sycl::context` in this way, using a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use, avoiding
+  // MPI/SYCL interoperability misuse in the case that the SYCL runtime uses
+  // the default `sycl::context`, which instantiates all available devices and
+  // would lead to MPI calls leaking data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   size_t N = 500000;
   std::vector<double> A(N, 1.0);

diff --git a/src/MPI_with_SYCL/send_recv_buff.cpp b/src/MPI_with_SYCL/send_recv_buff.cpp
@@ -73,8 +73,16 @@ int main(int argc, char *argv[]) {
   /* ---------------------------------------------------------------------------
     SYCL Initialization, which internally sets the device.
   ----------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the `sycl::queue` below uses a single GPU shared between
+  // the two ranks. To use a separate GPU for each rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually instantiate a
+  // `sycl::context` in this way, using a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use, avoiding
+  // MPI/SYCL interoperability misuse in the case that the SYCL runtime uses
+  // the default `sycl::context`, which instantiates all available devices and
+  // would lead to MPI calls leaking data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   int tag = 0;
   const int nelem = 20;

diff --git a/src/MPI_with_SYCL/send_recv_usm.cpp b/src/MPI_with_SYCL/send_recv_usm.cpp
@@ -38,8 +38,16 @@ int main(int argc, char *argv[]) {
   /* -------------------------------------------------------------------------------------------
       SYCL Initialization, which internally sets the device.
   --------------------------------------------------------------------------------------------*/
-
-  sycl::queue q{};
+  // For simplicity, the `sycl::queue` below uses a single GPU shared between
+  // the two ranks. To use a separate GPU for each rank, simply use
+  // `Devs[rank]` instead. Note that it is important to manually instantiate a
+  // `sycl::context` in this way, using a single device per rank. This
+  // implicitly sets the GPU device that each MPI rank will use, avoiding
+  // MPI/SYCL interoperability misuse in the case that the SYCL runtime uses
+  // the default `sycl::context`, which instantiates all available devices and
+  // would lead to MPI calls leaking data to unused devices.
+  sycl::context Context(Devs[0]);
+  sycl::queue q{Context, Devs[0]};
 
   int tag = 0;
   const int nelem = 20;
@@ -83,11 +91,10 @@ int main(int argc, char *argv[]) {
     // Copy the data back to the host and wait for the memory copy to complete.
     q.memcpy(&data[0], devp, nsize).wait();
 
-    sycl::free(devp, q);
-
     // Check the values.
     for (int i = 0; i < nelem; ++i) assert(data[i] == -2);
   }
+  sycl::free(devp, q);
   MPI_Finalize();
   return 0;
 }