From 48e58db74a7afb02df3bc117d5e3a304efe5e732 Mon Sep 17 00:00:00 2001
From: Kajetan Puchalski <kajetan.puchalski@arm.com>
Date: Tue, 8 Aug 2023 18:22:16 +0100
Subject: [PATCH] doc: Complete doc comments for WorkloadNotebookAnalysis

---
 README.md      |  14 +---
 wp/notebook.py | 193 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 133 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index 81b4619..1d79d65 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,7 @@ The main idea is to contain analysis tied to different runs of a specific worklo
 `wp.notebook.WorkloadNotebookAnalysis` takes a directory with benchmark runs and a list of the run directories inside it as arguments.
 The notebooks should be able to automatically adjust to changing the number of runs.
 
-```
+```python
 gb5 = WorkloadNotebookAnalysis('/home/user/tmp/geekbench/', [
     'geekbench_baseline_3_3101',
     'geekbench_ufc_feec_all_cpus_3_3001',
@@ -195,18 +195,8 @@ The `plot` proxy can be used to accessed all the pre-defined plotting methods, f
 #### Manual plotting
 
 The `wp.notebook.WorkloadNotebookAnalysis.plot_gmean_bars` helper method can be used to plot a given dataframe as bars and automatically attach statistical analysis to it.
-It's mainly intended as a way of comparing gmean values of multiple iterations across workloads and so it expects a melt-like (`pd.melt`) dataframe to plot.
-It heavily relies on multiple assumptions about the underlying dataframe so it might break.
-The function returns a dataframe of the ASCII table that will be printed above the resulting plot. That dataframe can be included in the summary dict for later use as shown below.
-
 There is a corresponding helper method for line plots - `wp.notebook.WorkloadNotebookAnalysis.plot_lines_px`.
 
-```
-gb5.summary['scores'] = gb5.plot_gmean_bars(gb5.results, x='stat', y='value', facet_col='metric', facet_col_wrap=3, title='gmean benchmark score', width=1600, height=600)
-```
-
-Pre-defined plotting functions in `wp.notebook.WorkloadNotebookPlotter` will include the tables in the summary automatically.
-
 ### Loading metrics generated by the processor
 
 When using the pre-defined plotting functions the relevant metrics will automatically be loaded the first time the plot is generated and the re-used. No further steps should be necessary.
@@ -221,7 +211,7 @@ The function will take a filename, then go across every directory in `gb5.benchm
 In the below example the resulting dataframe can be found in `gb5.analysis['overutilized']`.
 
 
-```
+```python
 def postprocess_overutil(df):
     df['time'] = round(df['time'], 2)
     df['total_time'] = round(df['total_time'], 2)
diff --git a/wp/notebook.py b/wp/notebook.py
index cd4189c..5edea17 100644
--- a/wp/notebook.py
+++ b/wp/notebook.py
@@ -31,64 +31,11 @@
 from wp.constants import APP_NAME
 
 
-def setup_notebook():
-    import plotly.io as pio
-    from holoviews import opts
-    from bokeh.themes import built_in_themes
-
-    hv.extension('bokeh')
-    hv.renderer('bokeh').theme = built_in_themes['dark_minimal']
-    hv.renderer('bokeh').webgl = True
-    pio.templates.default = "plotly"
-    pio.templates.default = "plotly_dark"
-
-    color_cycle = hv.Cycle([
-        '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
-    ])
-
-    opts.defaults(
-        opts.Curve(tools=['hover'], show_grid=True, color=color_cycle, muted_alpha=0),
-        opts.Table(bgcolor='black')
-    )
-
-
-def trim_number(x):
-    if x > 1000000000:
-        return f"{round(x / 1000000000, 3)}B"
-    if x > 1000000:
-        return f"{round(x / 1000000, 3)}M"
-    if x > 10000:
-        return f"{round(x / 1000, 2)}k"
-        return str(x)
-    if x != 0 and x < 0.01:
-        return f"{round(x * 1000000, 2)}μ"
-    return str(x)
-
-
-def format_percentage(vals, perc, pvals, pval_threshold=0.02):
-    result = round(perc, 2).astype(str).apply(
-        lambda s: f"({'' if s.startswith('-') or (s == '0.0') else '+'}{s}%)"
-    ).to_frame()
-    result['vals'] = vals.apply(lambda x: trim_number(x))
-    result['pvals'] = pvals
-    result['pval_marker'] = pvals.apply(lambda x: "* " if x < pval_threshold else "")
-    result['value'] = result['vals'] + " " + result['pval_marker'] + result['value']
-    return result['value']
-
-
-def ptable(df):
-    print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False, floatfmt=".3f"))
-
-
-def trim_wa_path(path):
-    return "_".join(path.split("_")[1:-2])
-
-
 class WorkloadNotebookAnalysis:
     """
     Container class for analysis of different runs (potentially with many iterations) of a single workload.
 
-    ```
+    ```python
     gb5 = WorkloadNotebookAnalysis('/home/user/tmp/geekbench/', [
         'geekbench_baseline_3_3101',
         'geekbench_ufc_feec_all_cpus_3_3001',
@@ -192,6 +139,7 @@ def traces(self) -> Dict[str, Dict[int, lisa.trace.Trace]]:
         })
 
     def show(self):
+        """Print the results dataframe, benchmark_dirs, tags and kernel versions"""
         display(self.results)
         print('benchmark_dirs:', self.benchmark_dirs)
         print('tags:', self.tags)
@@ -226,10 +174,51 @@ def load_parquet(benchmark):
             result = postprocess(result)
         self.analysis[name.split('.')[0]] = result
 
-    def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col_wrap=3, title='',
-                        width=None, height=600, gmean_round=1, include_columns=[], table_sort=None,
-                        order_cluster=False, sort_ascending=False, include_total=False, debug=False,
-                        percentage=True):
+    def plot_gmean_bars(self, df: pd.DataFrame, x: str = 'stat', y: str = 'value', color: str = 'tag',
+                        facet_col: str = 'metric', facet_col_wrap: int = 3, title: str = 'Gmean values',
+                        width: int = None, height: int = 600, gmean_round: int = 1,
+                        include_columns: List[str] = [], table_sort: List[str] = None,
+                        order_cluster: bool = False, sort_ascending: bool = False, include_total: bool = False,
+                        debug: bool = False, percentage: bool = True) -> pd.DataFrame:
+        """
+        Plot gmean values of some metric with statistical analysis attached.
+        It's mainly intended as a way of comparing multiple iterations across workloads and so
+        it expects a melt-like (`pd.melt`) dataframe to plot.
+
+        The function heavily relies on multiple assumptions about the underlying dataframe so it might break if those
+        are not met.
+
+        .. note:: This function has side-effects. The figure of the resulting plot will be saved to `self.px_figures`.
+
+        :param df: Dataframe with the data to plot
+        :param x: Column denoting the x axis
+        :param y: Column denoting the y axis
+        :param color: Column denoting how to split the data into bars
+        :param facet_col: Column denoting how to split the plot into facets
+        :param facet_col_wrap: Number of facet columns in one row of the plot
+        :param title: Displayed title of the plot
+        :param width: Plot width
+        :param height: Plot height
+        :param gmean_round: Number of decimal places to round the gmeans to
+        :param include_columns: Other columns to include in the resulting data table
+        :param table_sort: List of columns to sort the data table by
+        :param order_cluster: Order the plot by CPU clusters
+        :param sort_ascending: Order the plot by ascending values
+        :param include_total: Include the 'total' column alongside the clusters
+        :param debug: Insert a pdb breakpoint before computing the dataframes
+        :param percentage: Include percentage differences and check pvalues
+
+        :return: Dataframe of the ASCII table that will be printed above the resulting plot.
+
+        The resulting dataframe can be included in the summary dict for later use as shown below.
+        ```python
+        gb5.summary['scores'] = gb5.plot_gmean_bars(
+            gb5.results, x='stat', y='value', facet_col='metric', facet_col_wrap=3,
+            title='gmean benchmark score', width=1600, height=600
+        )
+        ```
+
+        """
 
         shown_clusters = self.CLUSTERS if not include_total else self.CLUSTERS_TOTAL
         if 'unit' not in df.columns:
@@ -301,7 +290,7 @@ def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col
         ) if percentage else gmeans_mean['value']
 
         # plot bars
-        fig = px.bar(gmeans_mean, x=x, y=y, color='tag', facet_col=facet_col, facet_col_wrap=facet_col_wrap,
+        fig = px.bar(gmeans_mean, x=x, y=y, color=color, facet_col=facet_col, facet_col_wrap=facet_col_wrap,
                      barmode='group', title=title, width=width, height=height,
                      text=plot_text)
         fig.update_traces(textposition='outside')
@@ -313,8 +302,27 @@ def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col
 
         return data_table
 
-    def plot_lines_px(self, df, x='iteration', y='value', color='tag', facet_col=None, facet_col_wrap=2,
-                      height=600, width=None, title=None, scale_y=False, renderer='iframe'):
+    def plot_lines_px(self, df: pd.DataFrame, x: str = 'iteration', y: str = 'value',
+                      color: str = 'tag', facet_col: str = None, facet_col_wrap: int = 2,
+                      height: int = 600, width: int = None, title: str = None,
+                      scale_y: bool = False, renderer: str = 'iframe'):
+        """
+        Plot lines of some metric, e.g. across iterations.
+
+        .. note:: This function has side-effects. The figure of the resulting plot will be saved to `self.px_figures`.
+
+        :param df: Dataframe with the data to plot
+        :param x: Column denoting the x axis
+        :param y: Column denoting the y axis
+        :param color: Column denoting how to split the data into lines
+        :param facet_col: Column denoting how to split the plot into facets
+        :param facet_col_wrap: Number of facet columns in one row of the plot
+        :param title: Displayed title of the plot
+        :param width: Plot width
+        :param height: Plot height
+        :param scale_y: Whether the y axis should maintain the same scale across facets
+        :param renderer: Plotly express renderer to be used
+        """
         fig = px.line(df, x=x, y=y, color=color, facet_col=facet_col, facet_col_wrap=facet_col_wrap,
                       height=height, width=width, title=title)
         if not scale_y:
@@ -327,7 +335,14 @@ def _title_to_filename(self, title, suffix):
             '__', '_'
         ) + '__' + "__".join(self.tags) + suffix
 
-    def save_image_plots(self, directory, extension='png', width=1800):
+    def save_image_plots(self, directory: str, extension: str = 'png', width: int = 1800):
+        """
+        Save all image plots contained in `px_figures` and `hv_figures` into a directory.
+
+        :param directory: Directory to save the plots into
+        :param extension: File extension of the resulting plots
+        :param width: Width of the resulting plots
+        """
         for name, fig in self.px_figures.items():
             filename = f"{directory}/{name}.{extension}"
             self.px_figures[name].write_image(filename, width=width)
@@ -1551,3 +1566,57 @@ def uclamp_per_task_line(self, tasks=None, height=600, width=1600, include_label
             opts.Curve(height=height, width=width, interpolation='steps-post', framewise=True)
         )
         return layout
+
+
+def setup_notebook():
+    """Select the bokeh backend for holoviews (dark theme, WebGL) and set dark plotly/holoviews plot defaults."""
+    import plotly.io as pio
+    from holoviews import opts
+    from bokeh.themes import built_in_themes
+
+    hv.extension('bokeh')
+    hv.renderer('bokeh').theme = built_in_themes['dark_minimal']
+    hv.renderer('bokeh').webgl = True
+    pio.templates.default = "plotly_dark"
+
+    color_cycle = hv.Cycle([
+        '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
+    ])
+
+    opts.defaults(
+        opts.Curve(tools=['hover'], show_grid=True, color=color_cycle, muted_alpha=0),
+        opts.Table(bgcolor='black')
+    )
+
+
+def trim_number(x):
+    """Abbreviate a number as a string: B/M/k suffix for large values, μ for non-zero values below 0.01."""
+    if x > 1000000000:
+        return f"{round(x / 1000000000, 3)}B"
+    if x > 1000000:
+        return f"{round(x / 1000000, 3)}M"
+    if x > 10000:
+        return f"{round(x / 1000, 2)}k"
+    if x != 0 and x < 0.01:
+        return f"{round(x * 1000000, 2)}μ"
+    return str(x)
+
+
+def format_percentage(vals, perc, pvals, pval_threshold=0.02):
+    """Return '<trimmed val> [* ](<signed perc>%)' strings; '* ' marks pvalues below pval_threshold."""
+    result = round(perc, 2).astype(str).apply(
+        lambda s: f"({'' if s.startswith('-') or (s == '0.0') else '+'}{s}%)"
+    ).to_frame()  # the column takes perc's name; the lines below expect it to be 'value'
+    result['vals'] = vals.apply(trim_number)
+    result['pval_marker'] = pvals.apply(lambda x: "* " if x < pval_threshold else "")
+    result['value'] = result['vals'] + " " + result['pval_marker'] + result['value']
+    return result['value']
+
+
+def ptable(df):  # print df as a 'pretty'-format tabulate table with column headers and no index column
+    print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False, floatfmt=".3f"))
+
+
+def trim_wa_path(path):  # keep only the middle '_'-separated fields: drop the first one and the last two
+    return "_".join(path.split("_")[1:-2])
+