Merge branch 'main' into binyli/perfgate

microsoft · Jun 16, 2023 · 03a63ae · 03a63ae
2 parents f2536fe + 8410fcd
commit 03a63ae
Show file tree

Hide file tree

Showing 6 changed files with 55 additions and 32 deletions.
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
@@ -5,7 +5,6 @@ pr:
   branches:
     include:
     - main
-  drafts: false
 
 pool:
   name: mscclpp
@@ -35,10 +34,10 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -49,7 +48,7 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -60,9 +59,9 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -73,6 +72,16 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: CheckPerfNumber
+  displayName: Check collective primitives performance
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/mscclpp-test/perf_ndmv4.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
@@ -67,6 +67,7 @@ steps:
       HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      rm -rf output/*
       mkdir -p output
       touch output/mscclpp-it-000000
       tail -f output/mscclpp-it-000000 &
@@ -85,6 +86,7 @@ steps:
       HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      rm -rf output/*
       mkdir -p output
       touch output/mscclpp-it-000000
       tail -f output/mscclpp-it-000000 &

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
@@ -5,7 +5,6 @@ pr:
   branches:
     include:
     - main
-  drafts: false
 
 jobs:
 - job: UnitTest

diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu
@@ -32,7 +32,8 @@ __device__ void allgather0(mscclpp::channel::SimpleDeviceChannel devChan, int ra
 }
 
 __device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, int rank, int worldSize,
-                               int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size) {
+                               int nranksPerNode, int remoteRank, uint64_t offset, uint64_t size,
+                               bool flushAfterSignal = true) {
   // this allgather algorithm works as follows:
   // Step 1: GPU rank i sends data to GPU rank (i+1) % nranksPerNode
   // and waits for data from GPU rank (i-1) % nranksPerNode
@@ -42,7 +43,8 @@ __device__ void localAllGather(mscclpp::channel::SimpleDeviceChannel devChan, in
   for (int i = 1; i < nranksPerNode; i++) {
     if ((remoteRank % nranksPerNode) == ((rank + i) % nranksPerNode)) {
       // put your data to GPU (rank+i) % nranksPerNode and signal in one call
-      if ((threadIdx.x % 32) == 0) devChan.putWithSignalAndFlush(offset, size);
+      if (flushAfterSignal && (threadIdx.x % 32) == 0) devChan.putWithSignalAndFlush(offset, size);
+      if (!flushAfterSignal && (threadIdx.x % 32) == 0) devChan.putWithSignal(offset, size);
     }
     // wait for the data from GPU (rank-i) % nranksPerNode to arrive
     if ((remoteRank % nranksPerNode) == ((rank - i + nranksPerNode) % nranksPerNode)) {
@@ -76,37 +78,48 @@ __device__ void allgather2(mscclpp::channel::SimpleDeviceChannel devChan, int ra
   // local allgather
   if (remoteRank / nranksPerNode == rank / nranksPerNode) {
     localAllGather(devChan, rank, worldSize, nranksPerNode, remoteRank, rank * nelemsPerGPU * sizeof(int),
-                   nelemsPerGPU * sizeof(int));
+                   nelemsPerGPU * sizeof(int), false);
   }
   // cross-node exchange
   if (remoteRank % nranksPerNode == rank % nranksPerNode) {
     // opposite side
     if ((threadIdx.x % 32) == 0)
-      devChan.putWithSignalAndFlush(rank * nelemsPerGPU * sizeof(int),
-                                    (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
+      devChan.putWithSignal(rank * nelemsPerGPU * sizeof(int),
+                            (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
     if ((threadIdx.x % 32) == 0) devChan.wait();
   }
 
+  // sync here to make sure the IB flush does not block the CUDA IPC traffic
+  __syncthreads();
+  // since all CUDA IPC share the same CUDA stream, we only need to flush one of the devChans
+  if ((remoteRank % nranksPerNode == rank % nranksPerNode) ||
+      (remoteRank / nranksPerNode == rank / nranksPerNode && rank % nranksPerNode == 0)) {
+    if ((threadIdx.x % 32) == 0) devChan.flush();
+  }
   __syncthreads();
 
   // Step 2
   // local allgather
   int otherNghr = (rank + nranksPerNode) % worldSize;
   if (remoteRank / nranksPerNode == rank / nranksPerNode) {
     localAllGather(devChan, rank, worldSize, nranksPerNode, remoteRank, otherNghr * nelemsPerGPU * sizeof(int),
-                   (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int));
+                   (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize * sizeof(int), false);
   }
 
   // cross-node exchange
   if (remoteRank % nranksPerNode == rank % nranksPerNode) {
     // opposite side
     if ((threadIdx.x % 32) == 0)
-      devChan.putWithSignalAndFlush(
-          (rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
-          nelemsPerGPU / pipelineSize * sizeof(int));
+      devChan.putWithSignal((rank * nelemsPerGPU + (nelemsPerGPU * (pipelineSize - 1)) / pipelineSize) * sizeof(int),
+                            nelemsPerGPU / pipelineSize * sizeof(int));
     if ((threadIdx.x % 32) == 0) devChan.wait();
   }
 
+  __syncthreads();
+  if ((remoteRank % nranksPerNode == rank % nranksPerNode) ||
+      (remoteRank / nranksPerNode == rank / nranksPerNode && rank % nranksPerNode == 0)) {
+    if ((threadIdx.x % 32) == 0) devChan.flush();
+  }
   __syncthreads();
 
   // Step 3

diff --git a/test/mscclpp-test/check_perf_result.py b/test/mscclpp-test/check_perf_result.py
@@ -10,11 +10,11 @@ def load_perf_file(perf_fine: str) -> dict:
   with open(perf_fine, 'r') as f:
     for line in f:
       data = json.loads(line)
-      res[(data['name'], data['kernel'], data['ranks']),
-          data['ranksPerNode'], data['size']] = {'algBw': data['algBw'], 'busBw': data['busBw'], 'time': data['time']}
+      res[(data['name'], data['kernel'], data['ranks'],
+          data['ranksPerNode'], data['size'])] = {'algBw': data['algBw'], 'busBw': data['busBw'], 'time': data['time']}
       if ('target' in data):
-        res[(data['name'], data['kernel'], data['ranks']),
-            data['ranksPerNode'], data['size']]['target'] = data['target']
+        res[(data['name'], data['kernel'], data['ranks'],
+            data['ranksPerNode'], data['size'])]['target'] = data['target']
   return res
 
 
@@ -25,16 +25,16 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
       continue
     if baseline[key]['target'] == 'latency':
       if abs(value['time'] - baseline[key]['time']) / baseline[key]['time'] > time_threshold:
-        logging.error('time %f not match baseline %f with threshold %f',
+        logging.error('%s: time %f not match baseline %f with threshold %f', str(key),
                       value['time'], baseline[key]['time'], time_threshold)
         res = False
-    elif baseline[key]['target'] == 'bandwidth':
+    elif baseline[key]['target'] == 'throughput':
       if abs(value['algBw'] - baseline[key]['algBw']) / baseline[key]['algBw'] > bandwidth_threshold:
-        logging.error('algBw %f not match baseline %f with threshold %f',
+        logging.error('%s: algBw %f not match baseline %f with threshold %f', str(key),
                       value['algBw'], baseline[key]['algBw'], bandwidth_threshold)
         res = False
       if abs(value['busBw'] - baseline[key]['busBw']) / baseline[key]['busBw'] > bandwidth_threshold:
-        logging.error('busBw %f not match baseline %f with threshold %f',
+        logging.error('%s: busBw %f not match baseline %f with threshold %f', str(key),
                       value['busBw'], baseline[key]['busBw'], bandwidth_threshold)
         res = False
   return res
@@ -45,7 +45,7 @@ def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float,
   parser = argparse.ArgumentParser()
   parser.add_argument('--perf-file', type=str, required=True)
   parser.add_argument('--baseline-file', type=str, required=True)
-  parser.add_argument('--time-threshold', type=float, default=0.1)
+  parser.add_argument('--time-threshold', type=float, default=0.15)
   parser.add_argument('--bandwidth-threshold', type=float, default=0.05)
   args = parser.parse_args()
 

diff --git a/test/mscclpp-test/perf_ndmv4.jsonl b/test/mscclpp-test/perf_ndmv4.jsonl
@@ -1,6 +1,6 @@
 {"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":271.83, "busBw":237.85, "size":1073741824, "time":3949.94, "target":"throughput"}
-{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.0133, "busBw":0.0116, "size":1024, "time":77.03, "target":"latency"}
+{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1133, "busBw":0.1016, "size":8192, "time":72.88, "target":"latency"}
 {"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.04, "busBw":243.32, "size":1073741824,"time":7722.32, "target":"throughput"}
-{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":0.1755, "busBw":0.3072, "size":1024, "time":5.83, "target":"latency"}
+{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":3.99,   "busBw":6.98, "size":8192, "time":8.21, "target":"latency"}
 {"name":"alltoall",  "kernel":0, "ranks":16, "ranksPerNode":8, "algBw":46.49, "busBw":43.5928,"size":1073741824,"time":23091.7, "target":"throughput"}
 {"name":"alltoall", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":275.54, "busBw":241.10, "size":1073741824, "time":3896.75, "target":"throughput"}