Add performance check for mscclpp-test (#110)

- Add NDmv4 perf baseline - Change mscclpp-test to output perf numbers to a JSON Lines file - Add a Python script to check the perf results against the baseline
microsoft · Jun 21, 2023 · 2640578 · 2640578
1 parent cd7797f
commit 2640578
Show file tree

Hide file tree

Showing 16 changed files with 295 additions and 130 deletions.
diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
@@ -35,10 +35,10 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -49,7 +49,7 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/sendrecv_test_perf -b 1K -e 1G -f 2 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -60,9 +60,9 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 2 -o output.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
 
 - task: Bash@3
@@ -73,6 +73,16 @@ steps:
     script: |
       set -e
       export PATH=/usr/local/mpi/bin:$PATH
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2
-      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -o output.jsonl
+      mpirun -np 8 --bind-to numa -x MSCCLPP_DEBUG=WARN ./build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 1 -o output.jsonl
+    workingDirectory: '$(System.DefaultWorkingDirectory)'
+
+- task: Bash@3
+  name: CheckPerfNumber
+  displayName: Check collective primitives performance
+  inputs:
+    targetType: 'inline'
+    script: |
+      set -e
+      python3 test/mscclpp-test/check_perf_result.py --perf-file output.jsonl --baseline-file test/deploy/perf_ndmv4.jsonl
     workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
@@ -67,6 +67,7 @@ steps:
       HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      rm -rf output/*
       mkdir -p output
       touch output/mscclpp-it-000000
       tail -f output/mscclpp-it-000000 &
@@ -85,6 +86,7 @@ steps:
       HOSTFILE=$(System.DefaultWorkingDirectory)/test/mscclpp-test/deploy/hostfile
       SSH_OPTION="StrictHostKeyChecking=no"
       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
+      rm -rf output/*
       mkdir -p output
       touch output/mscclpp-it-000000
       tail -f output/mscclpp-it-000000 &

diff --git a/.black b/.black
@@ -0,0 +1,5 @@
+[tool.black]
+line-length = 120
+target-version = ['py38']
+include = '\.pyi?$'
+extend-exclude = 'python/'
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -10,11 +10,6 @@ jobs:
     runs-on: ubuntu-20.04
 
     steps:
-    - name: Check git version
-      env:
-        min-ver: "2.18"
-      run: dpkg --compare-versions $(git version | awk '{print $3}') ge ${{ env.min-ver }}
-
     - name: Check out Git repository
       uses: actions/checkout@v3
 
@@ -28,15 +23,32 @@ jobs:
         clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}
         clang-format --dry-run ${PYTHONCPPSOURCES}
 
-  spelling:
+  pylint:
     runs-on: ubuntu-20.04
 
     steps:
-    - name: Check git version
-      env:
-        min-ver: "2.18"
-      run: dpkg --compare-versions $(git version | awk '{print $3}') ge ${{ env.min-ver }}
+      - name: Check out Git repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+
+      - name: Install Python dependencies
+        run: python3.8 -m pip install black
+
+      - name: Run linters
+        uses: wearerequired/lint-action@v2
+        with:
+          black: true
+          black_auto_fix: false
+          black_args: "--config .black --check"
+
+  spelling:
+    runs-on: ubuntu-20.04
 
+    steps:
     - name: Check out Git repository
       uses: actions/checkout@v3
 

diff --git a/test/deploy/deploy.sh b/test/deploy/deploy.sh
@@ -6,6 +6,7 @@ DST_DIR="/tmp/mscclpp"
 HOSTFILE="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy/hostfile"
 DEPLOY_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/deploy"
 SSH_OPTION="StrictHostKeyChecking=no"
+MSCCLPP_TEST_DIR="${SYSTEM_DEFAULTWORKINGDIRECTORY}/test/mscclpp-test"
 
 chmod 400 ${KeyFilePath}
 ssh-keygen -t rsa -f sshkey -P ""
@@ -28,6 +29,7 @@ parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${SRC_
 parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION sshkey ${DST_DIR}
 parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION sshkey.pub ${DST_DIR}
 parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${DEPLOY_DIR}/* ${DST_DIR}
+parallel-scp -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${MSCCLPP_TEST_DIR}/check_perf_result.py ${DST_DIR}
 
 # force to pull the latest image
 parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION \

diff --git a/test/deploy/perf_ndmv4.jsonl b/test/deploy/perf_ndmv4.jsonl
@@ -0,0 +1,7 @@
+{"name":"allgather", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":271.83, "busBw":237.85, "size":1073741824, "time":3949.94, "target":"throughput"}
+{"name":"allgather", "kernel":2, "ranks":16,"ranksPerNode":8, "algBw":243.86, "busBw":228.62, "size":3221225472, "time":13209.19,"target":"throughput"}
+{"name":"allgather", "kernel":3, "ranks":8, "ranksPerNode":8, "algBw":0.1133, "busBw":0.1016, "size":8192,       "time":72.88,   "target":"latency"}
+{"name":"allreduce", "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":139.04, "busBw":243.32, "size":1073741824, "time":7722.32, "target":"throughput"}
+{"name":"allreduce", "kernel":2, "ranks":8, "ranksPerNode":8, "algBw":1.40,   "busBw":2.45,   "size":8192,       "time":5.86,    "target":"latency"}
+{"name":"alltoall",  "kernel":0, "ranks":16,"ranksPerNode":8, "algBw":46.49,  "busBw":43.5928,"size":1073741824, "time":23091.7, "target":"throughput"}
+{"name":"alltoall",  "kernel":1, "ranks":8, "ranksPerNode":8, "algBw":275.54, "busBw":241.10, "size":1073741824, "time":3896.75, "target":"throughput"}
diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh
@@ -5,30 +5,34 @@ function run_mscclpp_test()
   echo "=================Run allgather_test_perf on 2 nodes========================="
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
 
   # For kernel 2, the message size must be divisible by 3
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 3K -e 3G -f 2 -k 2 -o /root/mscclpp/output.jsonl
 
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allgather_test_perf -b 1K -e 1G -f 2 -k 3 -o /root/mscclpp/output.jsonl
 
   echo "==================Run allreduce_test_perf on 2 nodes========================="
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
 
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/allreduce_test_perf -b 1K -e 1G -f 2 -k 1 -o /root/mscclpp/output.jsonl
 
   echo "==================Run alltoall_test_perf on 2 nodes========================="
   /usr/local/mpi/bin/mpirun --allow-run-as-root -np 16 --bind-to numa -hostfile /root/mscclpp/hostfile_mpi \
     -x MSCCLPP_DEBUG=WARN -x LD_LIBRARY_PATH=/root/mscclpp/build:$LD_LIBRARY_PATH \
-    -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0
+    -npernode 8 /root/mscclpp/build/test/mscclpp-test/alltoall_test_perf -b 1K -e 1G -f 2 -k 0 -o /root/mscclpp/output.jsonl
+
+  echo "========================Run performance check==============================="
+  python3 /root/mscclpp/check_perf_result.py --perf-file /root/mscclpp/output.jsonl \
+    --baseline-file /root/mscclpp/perf_ndmv4.jsonl
 }
 
 function run_mp_ut()

diff --git a/test/mscclpp-test/CMakeLists.txt b/test/mscclpp-test/CMakeLists.txt
@@ -1,6 +1,9 @@
+FetchContent_Declare(json URL https://github.com/nlohmann/json/releases/download/v3.11.2/json.tar.xz)
+FetchContent_MakeAvailable(json)
+
 function(add_mscclpp_test_executable name sources)
     add_executable(${name} ${sources} common.cc)
-    target_link_libraries(${name} mscclpp MPI::MPI_CXX CUDA::cudart CUDA::cuda_driver MSCCLPP::numa)
+    target_link_libraries(${name} mscclpp MPI::MPI_CXX CUDA::cudart CUDA::cuda_driver MSCCLPP::numa nlohmann_json::nlohmann_json)
 endfunction()
 
 add_mscclpp_test_executable(sendrecv_test_perf sendrecv_test.cu)

diff --git a/test/mscclpp-test/allgather_test.cu b/test/mscclpp-test/allgather_test.cu
@@ -324,7 +324,7 @@ class AllGatherTestEngine : public BaseTestEngine {
   std::shared_ptr<int[]> expectedBuff_;
 };
 
-AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args) {}
+AllGatherTestEngine::AllGatherTestEngine(const TestArgs& args) : BaseTestEngine(args, "allgather") {}
 
 void AllGatherTestEngine::allocateBuffer() {
   sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));

diff --git a/test/mscclpp-test/allreduce_test.cu b/test/mscclpp-test/allreduce_test.cu
@@ -329,6 +329,7 @@ void AllReduceTestColl::setupCollTest(size_t size) {
   recvCount_ = base;
   paramCount_ = base;
   recvCount_ = base;
+  expectedCount_ = base;
 
   mscclpp::DeviceSyncer syncer = {};
   CUDATHROW(cudaMemcpyToSymbol(deviceSyncer, &syncer, sizeof(mscclpp::DeviceSyncer)));
@@ -358,7 +359,9 @@ class AllReduceTestEngine : public BaseTestEngine {
   std::shared_ptr<int[]> expectedBuff_;
 };
 
-AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args) { inPlace_ = isInPlace(); }
+AllReduceTestEngine::AllReduceTestEngine(const TestArgs& args) : BaseTestEngine(args, "allreduce") {
+  inPlace_ = isInPlace();
+}
 
 bool AllReduceTestEngine::isUsePacket() const { return (args_.kernelNum == 2); }
 

diff --git a/test/mscclpp-test/alltoall_test.cu b/test/mscclpp-test/alltoall_test.cu
@@ -130,7 +130,7 @@ class AllToAllTestEngine : public BaseTestEngine {
   std::shared_ptr<int[]> expectedBuff_;
 };
 
-AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args) { inPlace_ = false; }
+AllToAllTestEngine::AllToAllTestEngine(const TestArgs& args) : BaseTestEngine(args, "alltoall") { inPlace_ = false; }
 
 void AllToAllTestEngine::allocateBuffer() {
   sendBuff_ = mscclpp::allocSharedCuda<int>(args_.maxBytes / sizeof(int));

diff --git a/test/mscclpp-test/check_perf_result.py b/test/mscclpp-test/check_perf_result.py
@@ -0,0 +1,80 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import json
+import logging
+
+
+def load_perf_file(perf_fine: str) -> dict:
+    """Load a JSON Lines perf file into a dict keyed by test configuration.
+
+    Every non-blank line must be a JSON object with the fields "name", "kernel", "ranks",
+    "ranksPerNode", "size", "algBw", "busBw" and "time". The "target" field is optional and is
+    copied only when present. When several lines share the same key, the last one wins.
+
+    Args:
+        perf_fine: Path of the JSON Lines file to read.
+
+    Returns:
+        A dict mapping (name, kernel, ranks, ranksPerNode, size) to a dict holding "algBw",
+        "busBw", "time" and, when available, "target".
+
+    Raises:
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+        KeyError: If a record lacks one of the required fields.
+    """
+    res = {}
+    with open(perf_fine, "r") as f:
+        for line in f:
+            # Skip blank lines (e.g. a stray trailing newline) rather than failing in json.loads.
+            if not line.strip():
+                continue
+            data = json.loads(line)
+            key = (data["name"], data["kernel"], data["ranks"], data["ranksPerNode"], data["size"])
+            res[key] = {
+                "algBw": data["algBw"],
+                "busBw": data["busBw"],
+                "time": data["time"],
+            }
+            if "target" in data:
+                res[key]["target"] = data["target"]
+    return res
+
+
+def check_perf_result(perf_result: dict, baseline: dict, time_threshold: float, bandwidth_threshold: float) -> bool:
+    """Compare measured perf numbers against a baseline.
+
+    Only keys present in both dicts are compared. The baseline entry's "target" selects what is
+    checked: "latency" compares "time" against time_threshold, "throughput" compares "algBw" and
+    "busBw" against bandwidth_threshold. The check is on the absolute relative deviation, so a
+    result that is better than the baseline by more than the threshold is reported as well.
+
+    Args:
+        perf_result: Measured results, as returned by load_perf_file.
+        baseline: Baseline results, as returned by load_perf_file.
+        time_threshold: Maximum allowed relative deviation of "time".
+        bandwidth_threshold: Maximum allowed relative deviation of "algBw" and "busBw".
+
+    Returns:
+        True when every compared metric is within its threshold, False otherwise. Each violation
+        is logged with logging.error.
+    """
+    # Metrics compared for each baseline target, paired with the threshold that applies to them.
+    checks = {
+        "latency": (("time",), time_threshold),
+        "throughput": (("algBw", "busBw"), bandwidth_threshold),
+    }
+    res = True
+    for key, value in perf_result.items():
+        expected = baseline.get(key)
+        if expected is None:
+            continue
+        # "target" is optional in the perf file format; entries without a known target are
+        # skipped instead of raising KeyError.
+        metrics, threshold = checks.get(expected.get("target"), ((), 0.0))
+        for metric in metrics:
+            if abs(value[metric] - expected[metric]) / expected[metric] > threshold:
+                logging.error(
+                    "%s: %s %f not match baseline %f with threshold %f",
+                    str(key),
+                    metric,
+                    value[metric],
+                    expected[metric],
+                    threshold,
+                )
+                res = False
+    return res
+
+
+if __name__ == "__main__":
+    # Command-line entry point: compare a perf file against a baseline file, print PASS or FAIL,
+    # and exit with status 1 on failure so that CI steps calling this script fail too.
+    import argparse
+    import sys
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--perf-file", type=str, required=True)
+    parser.add_argument("--baseline-file", type=str, required=True)
+    # We use different thresholds for latency and bandwidth. For latency, a small data size is
+    # used, which introduces more variance. For bandwidth, the performance is more stable.
+    parser.add_argument("--time-threshold", type=float, default=0.15)
+    parser.add_argument("--bandwidth-threshold", type=float, default=0.05)
+    args = parser.parse_args()
+
+    perf_result = load_perf_file(args.perf_file)
+    baseline = load_perf_file(args.baseline_file)
+    if check_perf_result(perf_result, baseline, args.time_threshold, args.bandwidth_threshold):
+        print("PASS")
+    else:
+        print("FAIL")
+        # Use sys.exit: the bare exit() helper is injected by the site module for interactive
+        # use and is not guaranteed to exist (e.g. under "python -S").
+        sys.exit(1)
diff --git a/test/mscclpp-test/common.cc b/test/mscclpp-test/common.cc
@@ -15,6 +15,7 @@
 #include <iomanip>
 #include <iostream>
 #include <mscclpp/utils.hpp>
+#include <nlohmann/json.hpp>
 #include <sstream>
 #include <string>
 #include <type_traits>
@@ -46,6 +47,7 @@ int iters = 20;
 int average = 1;
 int kernel_num = 0;
 int cudaGraphLaunches = 15;
+std::string output_file;
 
 double parseSize(const char* value) {
   std::string valueStr(value);
@@ -148,7 +150,8 @@ void numaBind(int node) {
   numa_bind_compat(&mask);
 }
 
-BaseTestEngine::BaseTestEngine(const TestArgs& args) : args_(args), inPlace_(true), error_(0) {
+BaseTestEngine::BaseTestEngine(const TestArgs& args, const std::string& name)
+    : args_(args), name_(name), inPlace_(true), error_(0) {
   this->coll_ = getTestColl();
   CUDATHROW(cudaStreamCreateWithFlags(&this->stream_, cudaStreamNonBlocking));
 }
@@ -254,6 +257,18 @@ void BaseTestEngine::runTest() {
     }
     double algBw, busBw;
     this->coll_->getBw(deltaSec, algBw, busBw);
+    if (!output_file.empty()) {
+      nlohmann::json perfOutput = {{"name", name_},
+                                   {"kernel", args_.kernelNum},
+                                   {"ranks", args_.totalRanks},
+                                   {"ranksPerNode", args_.nRanksPerNode},
+                                   {"size", size},
+                                   {"time", timeUsec},
+                                   {"algBw", algBw},
+                                   {"busBw", busBw}};
+      std::ofstream out(output_file, std::ios_base::app);
+      if (isMainProc) out << perfOutput << std::endl;
+    }
     if (!this->inPlace_) {
       ss << "                                 ";
     }
@@ -417,12 +432,13 @@ int main(int argc, char* argv[]) {
                               {"cudagraph", required_argument, 0, 'G'},
                               {"average", required_argument, 0, 'a'},
                               {"kernel_num", required_argument, 0, 'k'},
+                              {"output_file", required_argument, 0, 'o'},
                               {"help", no_argument, 0, 'h'},
                               {}};
 
   while (1) {
     int c;
-    c = getopt_long(argc, argv, "b:e:i:f:n:w:c:G:a:k:h:", longopts, &longindex);
+    c = getopt_long(argc, argv, "b:e:i:f:n:w:c:G:a:k:o:h:", longopts, &longindex);
 
     if (c == -1) break;
 
@@ -471,6 +487,9 @@ int main(int argc, char* argv[]) {
       case 'k':
         kernel_num = (int)strtol(optarg, NULL, 0);
         break;
+      case 'o':
+        output_file = optarg;
+        break;
       case 'h':
       default:
         if (c != 'h') printf("invalid option '%c'\n", c);
@@ -488,6 +507,7 @@ int main(int argc, char* argv[]) {
             "[-C,--report_cputime <0/1>] \n\t"
             "[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
             "[-k,--kernel_num <kernel number of commnication primitive>] \n\t"
+            "[-o, --output_file <output file name>] \n\t"
             "[-h,--help]\n",
             basename(argv[0]));
         return 0;