Skip to content

Commit

Permalink
doc: Complete doc comments for WorkloadNotebookAnalysis
Browse files Browse the repository at this point in the history
  • Loading branch information
mrkajetanp committed Aug 8, 2023
1 parent f3a5737 commit 48e58db
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 74 deletions.
14 changes: 2 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ The main idea is to contain analysis tied to different runs of a specific worklo
`wp.notebook.WorkloadNotebookAnalysis` takes a directory with benchmark runs and a list of the run directories inside it as arguments.
The notebooks should be able to automatically adjust to changing the number of runs.

```
```python
gb5 = WorkloadNotebookAnalysis('/home/user/tmp/geekbench/', [
'geekbench_baseline_3_3101',
'geekbench_ufc_feec_all_cpus_3_3001',
Expand All @@ -195,18 +195,8 @@ The `plot` proxy can be used to accessed all the pre-defined plotting methods, f
#### Manual plotting

The `wp.notebook.WorkloadNotebookAnalysis.plot_gmean_bars` helper method can be used to plot a given dataframe as bars and automatically attach statistical analysis to it.
It's mainly intended as a way of comparing gmean values of multiple iterations across workloads and so it expects a melt-like (`pd.melt`) dataframe to plot.
It heavily relies on multiple assumptions about the underlying dataframe so it might break.
The function returns a dataframe of the ASCII table that will be printed above the resulting plot. That dataframe can be included in the summary dict for later use as shown below.

There is a corresponding helper method for line plots - `wp.notebook.WorkloadNotebookAnalysis.plot_lines_px`.

```
gb5.summary['scores'] = gb5.plot_gmean_bars(gb5.results, x='stat', y='value', facet_col='metric', facet_col_wrap=3, title='gmean benchmark score', width=1600, height=600)
```

Pre-defined plotting functions in `wp.notebook.WorkloadNotebookPlotter` will include the tables in the summary automatically.

### Loading metrics generated by the processor

When using the pre-defined plotting functions the relevant metrics will automatically be loaded the first time the plot is generated and the re-used. No further steps should be necessary.
Expand All @@ -221,7 +211,7 @@ The function will take a filename, then go across every directory in `gb5.benchm
In the below example the resulting dataframe can be found in `gb5.analysis['overutilized']`.


```
```python
def postprocess_overutil(df):
df['time'] = round(df['time'], 2)
df['total_time'] = round(df['total_time'], 2)
Expand Down
193 changes: 131 additions & 62 deletions wp/notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,64 +31,11 @@
from wp.constants import APP_NAME


def setup_notebook():
import plotly.io as pio
from holoviews import opts
from bokeh.themes import built_in_themes

hv.extension('bokeh')
hv.renderer('bokeh').theme = built_in_themes['dark_minimal']
hv.renderer('bokeh').webgl = True
pio.templates.default = "plotly"
pio.templates.default = "plotly_dark"

color_cycle = hv.Cycle([
'#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
])

opts.defaults(
opts.Curve(tools=['hover'], show_grid=True, color=color_cycle, muted_alpha=0),
opts.Table(bgcolor='black')
)


def trim_number(x):
if x > 1000000000:
return f"{round(x / 1000000000, 3)}B"
if x > 1000000:
return f"{round(x / 1000000, 3)}M"
if x > 10000:
return f"{round(x / 1000, 2)}k"
return str(x)
if x != 0 and x < 0.01:
return f"{round(x * 1000000, 2)}μ"
return str(x)


def format_percentage(vals, perc, pvals, pval_threshold=0.02):
result = round(perc, 2).astype(str).apply(
lambda s: f"({'' if s.startswith('-') or (s == '0.0') else '+'}{s}%)"
).to_frame()
result['vals'] = vals.apply(lambda x: trim_number(x))
result['pvals'] = pvals
result['pval_marker'] = pvals.apply(lambda x: "* " if x < pval_threshold else "")
result['value'] = result['vals'] + " " + result['pval_marker'] + result['value']
return result['value']


def ptable(df):
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False, floatfmt=".3f"))


def trim_wa_path(path):
return "_".join(path.split("_")[1:-2])


class WorkloadNotebookAnalysis:
"""
Container class for analysis of different runs (potentially with many iterations) of a single workload.
```
```python
gb5 = WorkloadNotebookAnalysis('/home/user/tmp/geekbench/', [
'geekbench_baseline_3_3101',
'geekbench_ufc_feec_all_cpus_3_3001',
Expand Down Expand Up @@ -192,6 +139,7 @@ def traces(self) -> Dict[str, Dict[int, lisa.trace.Trace]]:
})

def show(self):
"""Print the results dataframe, benchmark_dirs, tags and kernels versions"""
display(self.results)
print('benchmark_dirs:', self.benchmark_dirs)
print('tags:', self.tags)
Expand Down Expand Up @@ -226,10 +174,51 @@ def load_parquet(benchmark):
result = postprocess(result)
self.analysis[name.split('.')[0]] = result

def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col_wrap=3, title='',
width=None, height=600, gmean_round=1, include_columns=[], table_sort=None,
order_cluster=False, sort_ascending=False, include_total=False, debug=False,
percentage=True):
def plot_gmean_bars(self, df: pd.DataFrame, x: str = 'stat', y: str = 'value', color: str = 'tag',
facet_col: str = 'metric', facet_col_wrap: int = 3, title: str = 'Gmean values',
width: int = None, height: int = 600, gmean_round: int = 1,
include_columns: List[str] = [], table_sort: List[str] = None,
order_cluster: bool = False, sort_ascending: bool = False, include_total: bool = False,
debug: bool = False, percentage: bool = True) -> pd.DataFrame:
"""
Plot gmean values of some metric with statistical analysis attached.
It's mainly intended as a way of comparing multiple iterations across workloads and so
it expects a melt-like (`pd.melt`) dataframe to plot.
The function heavily relies on multiple assumptions about the underlying dataframe so it might break if those
are not met.
.. note:: This function has side-effects. The figure of the resulting plot will be saved to `self.px_figures`.
:param df: Dataframe with the data to plot
:param x: Column denoting the x axis
:param y: Column denoting the y axis
:param color: Column denoting how to split the data into bars
:param facet_col: Column denoting how to split the plot into facets
:param facet_col_wrap: Number of facet column in one row of the plot
:param title: Displayed title of the plot
:param width: Plot width
:param height: Plot height
:param gmean_round: Number of decimal places to round the gmeans to
:param include_columns: Other columns to include in the resulting data table
:param table_sort: List of columns to sort the data table by
:param order_cluster: Order the plot by CPU clusters
:param sort_ascending: Order the plot by ascending values
:param include_total: Include the 'total' column alongside the clusters
:param debug: Insert a pdb breakpoint before computing the dataframes
:param percentage: Include percentage differences and check pvalues
:return: Dataframe of the ASCII table that will be printed above the resulting plot.
The resulting dataframe can be included in the summary dict for later use as shown below.
```python
gb5.summary['scores'] = gb5.plot_gmean_bars(
gb5.results, x='stat', y='value', facet_col='metric', facet_col_wrap=3,
title='gmean benchmark score', width=1600, height=600
)
```
"""

shown_clusters = self.CLUSTERS if not include_total else self.CLUSTERS_TOTAL
if 'unit' not in df.columns:
Expand Down Expand Up @@ -301,7 +290,7 @@ def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col
) if percentage else gmeans_mean['value']

# plot bars
fig = px.bar(gmeans_mean, x=x, y=y, color='tag', facet_col=facet_col, facet_col_wrap=facet_col_wrap,
fig = px.bar(gmeans_mean, x=x, y=y, color=color, facet_col=facet_col, facet_col_wrap=facet_col_wrap,
barmode='group', title=title, width=width, height=height,
text=plot_text)
fig.update_traces(textposition='outside')
Expand All @@ -313,8 +302,27 @@ def plot_gmean_bars(self, df, x='stat', y='value', facet_col='metric', facet_col

return data_table

def plot_lines_px(self, df, x='iteration', y='value', color='tag', facet_col=None, facet_col_wrap=2,
height=600, width=None, title=None, scale_y=False, renderer='iframe'):
def plot_lines_px(self, df: pd.DataFrame, x: str = 'iteration', y: str = 'value',
color: str = 'tag', facet_col: str = None, facet_col_wrap: int = 2,
height: int = 600, width: int = None, title: str = None,
scale_y: bool = False, renderer: str = 'iframe'):
"""
Plot lines of some metric, e.g across iterations.
.. note:: This function has side-effects. The figure of the resulting plot will be saved to `self.px_figures`.
:param df: Dataframe with the data to plot
:param x: Column denoting the x axis
:param y: Column denoting the y axis
:param color: Column denoting how to split the data into lines
:param facet_col: Column denoting how to split the plot into facets
:param facet_col_wrap: Number of facet column in one row of the plot
:param title: Displayed title of the plot
:param width: Plot width
:param height: Plot height
:param scale_y: Whether the y axis should maintain the same scale across facets
:param renderer: Plotly express renderer to be used
"""
fig = px.line(df, x=x, y=y, color=color, facet_col=facet_col, facet_col_wrap=facet_col_wrap,
height=height, width=width, title=title)
if not scale_y:
Expand All @@ -327,7 +335,14 @@ def _title_to_filename(self, title, suffix):
'__', '_'
) + '__' + "__".join(self.tags) + suffix

def save_image_plots(self, directory, extension='png', width=1800):
def save_image_plots(self, directory: str, extension: str = 'png', width: int = 1800):
"""
Save all image plots contained in `px_figures` and `hv_figures` into a directory.
:param directory: Directory to save the plots into
:param extension: File extension of the resulting plots
:param width: Width of the resulting plots
"""
for name, fig in self.px_figures.items():
filename = f"{directory}/{name}.{extension}"
self.px_figures[name].write_image(filename, width=width)
Expand Down Expand Up @@ -1551,3 +1566,57 @@ def uclamp_per_task_line(self, tasks=None, height=600, width=1600, include_label
opts.Curve(height=height, width=width, interpolation='steps-post', framewise=True)
)
return layout


def setup_notebook():
import plotly.io as pio
from holoviews import opts
from bokeh.themes import built_in_themes

hv.extension('bokeh')
hv.renderer('bokeh').theme = built_in_themes['dark_minimal']
hv.renderer('bokeh').webgl = True
pio.templates.default = "plotly"
pio.templates.default = "plotly_dark"

color_cycle = hv.Cycle([
'#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
])

opts.defaults(
opts.Curve(tools=['hover'], show_grid=True, color=color_cycle, muted_alpha=0),
opts.Table(bgcolor='black')
)


def trim_number(x):
if x > 1000000000:
return f"{round(x / 1000000000, 3)}B"
if x > 1000000:
return f"{round(x / 1000000, 3)}M"
if x > 10000:
return f"{round(x / 1000, 2)}k"
return str(x)
if x != 0 and x < 0.01:
return f"{round(x * 1000000, 2)}μ"
return str(x)


def format_percentage(vals, perc, pvals, pval_threshold=0.02):
result = round(perc, 2).astype(str).apply(
lambda s: f"({'' if s.startswith('-') or (s == '0.0') else '+'}{s}%)"
).to_frame()
result['vals'] = vals.apply(lambda x: trim_number(x))
result['pvals'] = pvals
result['pval_marker'] = pvals.apply(lambda x: "* " if x < pval_threshold else "")
result['value'] = result['vals'] + " " + result['pval_marker'] + result['value']
return result['value']


def ptable(df):
print(tabulate(df, headers='keys', tablefmt='pretty', showindex=False, floatfmt=".3f"))


def trim_wa_path(path):
return "_".join(path.split("_")[1:-2])

0 comments on commit 48e58db

Please sign in to comment.