Merge pull request #4520 from ESMCI/azamat/baselines/update-perf-info

Update performance baseline checks

Update performance baseline checks:
- add more details to TestStatus.log for base, comp, threshold tput+mem values
- append (not overwrite) to perf-baselines to keep history
- record sha, date, value of new blesses

Test suite: PFS.ne30pg2_r05_oECv3.F2010.chrysalis_intel.bench-noio
Test baseline: same
Test namelist changes: none
Test status: bit for bit
User interface changes?: N
Update gh-pages html (Y/N)?: N
ESMCI · Dec 6, 2023 · ea369e9 · ea369e9
2 parents 1c7eda6 + ce0ae65
commit ea369e9
Show file tree

Hide file tree

Showing 5 changed files with 181 additions and 144 deletions.
diff --git a/CIME/SystemTests/system_tests_common.py b/CIME/SystemTests/system_tests_common.py
@@ -28,7 +28,7 @@
 from CIME.locked_files import LOCKED_DIR, lock_file, is_locked
 from CIME.baselines.performance import (
     get_latest_cpl_logs,
-    _perf_get_memory,
+    perf_get_memory_list,
     perf_compare_memory_baseline,
     perf_compare_throughput_baseline,
     perf_write_baseline,
@@ -806,7 +806,7 @@ def perf_check_for_memory_leak(case, tolerance):
 
     for cpllog in latestcpllogs:
         try:
-            memlist = _perf_get_memory(case, cpllog)
+            memlist = perf_get_memory_list(case, cpllog)
         except RuntimeError:
             return False, "insufficient data for memleak test"
 

diff --git a/CIME/baselines/performance.py b/CIME/baselines/performance.py
@@ -121,25 +121,25 @@ def perf_write_baseline(case, basegen_dir, throughput=True, memory=True):
 
     if throughput:
         try:
-            tput = perf_get_throughput(case, config)
+            tput, mode = perf_get_throughput(case, config)
         except RuntimeError as e:
             logger.debug("Could not get throughput: {0!s}".format(e))
         else:
             baseline_file = os.path.join(basegen_dir, "cpl-tput.log")
 
-            write_baseline_file(baseline_file, tput)
+            write_baseline_file(baseline_file, tput, mode)
 
             logger.info("Updated throughput baseline to {!s}".format(tput))
 
     if memory:
         try:
-            mem = perf_get_memory(case, config)
+            mem, mode = perf_get_memory(case, config)
         except RuntimeError as e:
             logger.info("Could not get memory usage: {0!s}".format(e))
         else:
             baseline_file = os.path.join(basegen_dir, "cpl-mem.log")
 
-            write_baseline_file(baseline_file, mem)
+            write_baseline_file(baseline_file, mem, mode)
 
             logger.info("Updated memory usage baseline to {!s}".format(mem))
 
@@ -184,16 +184,11 @@ def perf_get_throughput(case, config):
         Model throughput.
     """
     try:
-        tput = config.perf_get_throughput(case)
+        tput, mode = config.perf_get_throughput(case)
     except AttributeError:
-        tput = _perf_get_throughput(case)
+        tput, mode = _perf_get_throughput(case)
 
-        if tput is None:
-            raise RuntimeError("Could not get default throughput") from None
-
-        tput = str(tput)
-
-    return tput
+    return tput, mode
 
 
 def perf_get_memory(case, config):
@@ -215,19 +210,14 @@ def perf_get_memory(case, config):
         Model memory usage.
     """
     try:
-        mem = config.perf_get_memory(case)
+        mem, mode = config.perf_get_memory(case)
     except AttributeError:
-        mem = _perf_get_memory(case)
-
-        if mem is None:
-            raise RuntimeError("Could not get default memory usage") from None
-
-        mem = str(mem[-1][1])
+        mem, mode = _perf_get_memory(case)
 
-    return mem
+    return mem, mode
 
 
-def write_baseline_file(baseline_file, value):
+def write_baseline_file(baseline_file, value, mode="a"):
     """
     Writes value to `baseline_file`.
 
@@ -237,13 +227,10 @@ def write_baseline_file(baseline_file, value):
         Path to the baseline file.
     value : str
         Value to write.
+    mode : str
+        Mode to open file with.
     """
-    commit_hash = get_current_commit(repo=get_src_root())
-
-    timestamp = get_timestamp(timestamp_format="%Y-%m-%d_%H:%M:%S")
-
-    with open(baseline_file, "w") as fd:
-        fd.write(f"# sha:{commit_hash} date: {timestamp}\n")
+    with open(baseline_file, mode) as fd:
         fd.write(value)
 
 
@@ -270,6 +257,17 @@ def _perf_get_memory(case, cpllog=None):
     RuntimeError
         If not enough sample were found.
     """
+    memlist = perf_get_memory_list(case, cpllog)
+
+    if memlist is None:
+        raise RuntimeError("Could not get default memory usage") from None
+
+    value = _format_baseline(memlist[-1][1])
+
+    return value, "a"
+
+
+def perf_get_memory_list(case, cpllog):
     if cpllog is None:
         cpllog = get_latest_cpl_logs(case)
     else:
@@ -317,7 +315,12 @@ def _perf_get_throughput(case):
 
         logger.debug("Could not parse throughput from coupler log")
 
-    return tput
+    if tput is None:
+        raise RuntimeError("Could not get default throughput") from None
+
+    value = _format_baseline(tput)
+
+    return value, "a"
 
 
 def get_latest_cpl_logs(case):
@@ -429,7 +432,7 @@ def read_baseline_file(baseline_file):
         Value stored in baseline file without comments.
     """
     with open(baseline_file) as fd:
-        lines = [x.strip() for x in fd.readlines() if not x.startswith("#")]
+        lines = [x.strip() for x in fd.readlines() if not x.startswith("#") and x != ""]
 
     return "\n".join(lines)
 
@@ -456,13 +459,20 @@ def _perf_compare_throughput_baseline(case, baseline, tolerance):
     comment : str
         provides explanation from comparison.
     """
-    current = _perf_get_throughput(case)
+    current, _ = _perf_get_throughput(case)
+
+    try:
+        current = float(_parse_baseline(current))
+    except (ValueError, TypeError):
+        comment = "Could not compare throughput to baseline, as baseline had no value."
+
+        return None, comment
 
     try:
         # default baseline is stored as single float
-        baseline = float(baseline)
-    except ValueError:
-        comment = "Could not compare throughput to baseline, as basline had no value."
+        baseline = float(_parse_baseline(baseline))
+    except (ValueError, TypeError):
+        comment = "Could not compare throughput to baseline, as baseline had no value."
 
         return None, comment
 
@@ -474,14 +484,13 @@ def _perf_compare_throughput_baseline(case, baseline, tolerance):
     if diff is not None:
         below_tolerance = diff < tolerance
 
+        info = "Throughput changed by {:.2f}%: baseline={:.3f} sypd, tolerance={:d}%, current={:.3f} sypd".format(
+            diff * 100, baseline, int(tolerance * 100), current
+        )
         if below_tolerance:
-            comment = "TPUTCOMP: Computation time changed by {:.2f}% relative to baseline".format(
-                diff * 100
-            )
+            comment = "TPUTCOMP: " + info
         else:
-            comment = "Error: TPUTCOMP: Computation time increase > {:d}% from baseline".format(
-                int(tolerance * 100)
-            )
+            comment = "Error: TPUTCOMP: " + info
 
     return below_tolerance, comment
 
@@ -509,16 +518,21 @@ def _perf_compare_memory_baseline(case, baseline, tolerance):
         provides explanation from comparison.
     """
     try:
-        current = _perf_get_memory(case)
+        current, _ = _perf_get_memory(case)
     except RuntimeError as e:
         return None, str(e)
-    else:
-        current = current[-1][1]
+
+    try:
+        current = float(_parse_baseline(current))
+    except (ValueError, TypeError):
+        comment = "Could not compare throughput to baseline, as baseline had no value."
+
+        return None, comment
 
     try:
         # default baseline is stored as single float
-        baseline = float(baseline)
-    except ValueError:
+        baseline = float(_parse_baseline(baseline))
+    except (ValueError, TypeError):
         baseline = 0.0
 
     try:
@@ -533,13 +547,64 @@ def _perf_compare_memory_baseline(case, baseline, tolerance):
     if diff is not None:
         below_tolerance = diff < tolerance
 
+        info = "Memory usage highwater changed by {:.2f}%: baseline={:.3f} MB, tolerance={:d}%, current={:.3f} MB".format(
+            diff * 100, baseline, int(tolerance * 100), current
+        )
         if below_tolerance:
-            comment = "MEMCOMP: Memory usage highwater has changed by {:.2f}% relative to baseline".format(
-                diff * 100
-            )
+            comment = "MEMCOMP: " + info
         else:
-            comment = "Error: Memory usage increase >{:d}% from baseline's {:f} to {:f}".format(
-                int(tolerance * 100), baseline, current
-            )
+            comment = "Error: MEMCOMP: " + info
 
     return below_tolerance, comment
+
+
+def _format_baseline(value):
+    """
+    Encodes value with default baseline format.
+
+    Default format:
+    sha: <commit sha> date: <date of bless> <value>
+
+    Parameters
+    ----------
+    value : str
+        Baseline value to encode.
+
+    Returns
+    -------
+    value : str
+        Baseline entry.
+    """
+    commit_hash = get_current_commit(repo=get_src_root())
+
+    timestamp = get_timestamp(timestamp_format="%Y-%m-%d_%H:%M:%S")
+
+    return f"sha:{commit_hash} date:{timestamp} {value}\n"
+
+
+def _parse_baseline(data):
+    """
+    Parses default baseline format.
+
+    Default format:
+    sha: <commit sha> date: <date of bless> <value>
+
+    Parameters
+    ----------
+    data : str
+        Containing contents of baseline file.
+
+    Returns
+    -------
+    value : str
+        Value of the latest blessed baseline.
+    """
+    lines = data.split("\n")
+    lines = [x for x in lines if x != ""]
+
+    try:
+        value = lines[-1].strip().split(" ")[-1]
+    except IndexError:
+        value = None
+
+    return value