Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Preparation progress dashboard utility script #579

Merged
merged 33 commits into from
May 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
be72a1a
Add imports to relevant models for user permission
aristizabal95 Apr 15, 2024
db33ada
Implement permission for mlcube owners to see dset emails
aristizabal95 Apr 16, 2024
2d40c02
Implement get_user function
aristizabal95 Apr 17, 2024
c38ea96
Implement rano-dashboard script
aristizabal95 Apr 17, 2024
ebd7788
Remove unnecessary try-except block
aristizabal95 Apr 17, 2024
6d82c54
Merge branch 'main' into preparation-dashboard
aristizabal95 Apr 17, 2024
76ba633
Organize code
aristizabal95 Apr 17, 2024
d3c14c9
Set correct default paths
aristizabal95 Apr 17, 2024
3f9dc0c
Apply linter fixes
aristizabal95 Apr 17, 2024
d06067a
Merge branch 'main' into preparation-dashboard
aristizabal95 Apr 24, 2024
ec5659e
Use int ids for user tests
aristizabal95 Apr 24, 2024
83f21e0
Merge branch 'preparation-dashboard' of https://github.com/aristizaba…
aristizabal95 Apr 24, 2024
74dae67
Check for anon user. Add test
aristizabal95 Apr 24, 2024
d759045
fix linting issues
aristizabal95 Apr 24, 2024
960d686
Fix user tests
aristizabal95 Apr 25, 2024
1d152eb
Increase version number
aristizabal95 Apr 25, 2024
b8873a8
Remove duplicate dashboard
aristizabal95 Apr 26, 2024
9952c59
Move medperf initialization to the very beginning
aristizabal95 May 2, 2024
aff1c3b
Revert to using floats for indices
aristizabal95 May 2, 2024
81d47f1
Use queryset.exists() instead of length check
aristizabal95 May 2, 2024
21da575
Build dashboard as a package
aristizabal95 May 2, 2024
e907e1c
Require stages file to be passed explicitly
aristizabal95 May 2, 2024
b48b2e4
Add a README explaining how to use the dashboard
aristizabal95 May 2, 2024
f03b425
Display a warning about report visibility
aristizabal95 May 2, 2024
82eec07
Add warning of email visibility on dset submission
aristizabal95 May 3, 2024
52019d9
Fix linter issues
aristizabal95 May 3, 2024
8f5d0bf
Fix linter issues
aristizabal95 May 3, 2024
af4a07c
Merge branch 'main' into preparation-dashboard
aristizabal95 May 6, 2024
b987a08
Update server/user/tests/test_pk.py
aristizabal95 May 10, 2024
ffee542
Remove typer app from get_data
aristizabal95 May 10, 2024
134dad4
Merge branch 'preparation-dashboard' of https://github.com/aristizaba…
aristizabal95 May 10, 2024
e64f010
Remove stages.csv from PR
aristizabal95 May 10, 2024
1d0d7b3
Make stages_path a required argument
aristizabal95 May 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cli/medperf/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.2"
__version__ = "0.1.3"
5 changes: 5 additions & 0 deletions cli/medperf/commands/dataset/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,8 +309,13 @@ def prompt_for_report_sending_approval(self):
+ " dataset subjects have reached Stage 1, and that 60% of your dataset subjects"
+ " have reached Stage 3:"
)
warning = (
"Note that reports will be visible by the Data Preparation MLCube owner and by the"
" Benchmark owner to keep track of progress and provide support if needed."
)
config.ui.print(msg)
dict_pretty_print(example)
config.ui.print_warning(warning)
aristizabal95 marked this conversation as resolved.
Show resolved Hide resolved

msg = (
" \nDo you approve the automatic submission of summaries similar to the one above"
Expand Down
5 changes: 5 additions & 0 deletions cli/medperf/commands/dataset/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,11 @@ def upload(self):
submission_dict = self.dataset.todict()
dict_pretty_print(submission_dict)
msg = "Do you approve the registration of the presented data to MedPerf? [Y/n] "
warning = (
"Upon submission, your email address will be visible to the Data Preparation"
+ " Owner for traceability and debugging purposes."
)
self.ui.print_warning(warning)
self.approved = self.approved or approval_prompt(msg)

if self.approved:
Expand Down
14 changes: 14 additions & 0 deletions cli/medperf/comms/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,17 @@ def update_dataset(self, dataset_id: int, data: dict):
dataset_id (int): ID of the dataset to update
data (dict): Updated information of the dataset.
"""

@abstractmethod
def get_user(self, user_id: int) -> dict:
    """Fetch the information of a single user.

    The user is only returned when the caller is allowed to see it:
    the caller is the requested user, is an admin, or owns a data
    preparation mlcube that the requested user has used.

    Args:
        user_id (int): UID of the user to retrieve.

    Returns:
        dict: Information of the requested user.
    """
20 changes: 20 additions & 0 deletions cli/medperf/comms/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,3 +531,23 @@ def get_mlcube_datasets(self, mlcube_id: int) -> dict:

datasets = self.__get_list(f"{self.server_url}/mlcubes/{mlcube_id}/datasets/")
return datasets

def get_user(self, user_id: int) -> dict:
    """Fetch the information of a single user from the server.

    The user is only returned when the caller is allowed to see it:
    the caller is the requested user, is an admin, or owns a data
    preparation mlcube that the requested user has used.

    Args:
        user_id (int): UID of the user to retrieve.

    Returns:
        dict: Information of the requested user.

    Raises:
        CommunicationRequestError: If the server answers with any status
            code other than 200.
    """
    res = self.__auth_get(f"{self.server_url}/users/{user_id}/")
    if res.status_code == 200:
        return res.json()

    # Anything but 200 is an error: log it and surface the server's details.
    log_response_error(res)
    details = format_errors_dict(res.json())
    raise CommunicationRequestError(f"Could not retrieve user: {details}")
26 changes: 26 additions & 0 deletions scripts/dashboard/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Medperf Data Preparation Dashboard

The medperf data preparation dashboard visualizes the usage of a data preparation MLCube and the stages that data owners have reached. This should provide insight into how far along the process is, and whether users are having trouble specific to the execution of the data preparation pipeline.

## Installation

To install, execute the following command at this folder:

```
pip install -e .
```

## How to use

To use, you need to have a few assets and identifiers beforehand:
- MLCube ID: The ID of the MLCube that is being used as a data preparation MLCube. To be able to see progress, you must be the owner of this MLCube
- Stages File: A `CSV` file that contains the human-readable information of each of the stages that the data preparation MLCube contains. The CSV should have the following columns: `Status Code, status_name, comment, docs_url, color`
- Institutions File: A `CSV` file that maps emails to institutions that are expected to be part of the preparation procedure. The CSV should have the following columns: `institution, email`

Once all requirements are covered, you can execute the following command:

```
medperf-dashboard -m <MLCube ID> -s <Stages File> -i <Institutions File>
```

Running this command will fetch the latest reports from the medperf server, and start a local server that will contain the visualization of the progress. To access this server, head to `http://localhost:8050` on your preferred browser.
5 changes: 5 additions & 0 deletions scripts/dashboard/medperf_dashboard/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Module executed when the `medperf_dashboard` package is run.
from medperf.init import initialize

# Initialize medperf before the dashboard module is imported.
# NOTE(review): presumably the dashboard's imports rely on medperf state
# set up here -- confirm before reordering.
initialize()

# Deliberately imported after initialize(); `noqa` silences the linter's
# "import not at top of file" warning for this line.
from .preparation_dashboard import t_app # noqa
97 changes: 97 additions & 0 deletions scripts/dashboard/medperf_dashboard/get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import pandas as pd
import datetime

from medperf.entities.dataset import Dataset
from medperf import config

from .utils import get_institution_from_email, get_reports_path, stage_id2name


def get_dsets(mlcube_id):
dsets = Dataset.all(filters={"mlcube": mlcube_id})
aristizabal95 marked this conversation as resolved.
Show resolved Hide resolved
dsets = [dset.todict() for dset in dsets]
for dset in dsets:
user_id = dset["owner"]
dset["user"] = config.comms.get_user(user_id)
Comment on lines +14 to +16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how many API calls are we expecting here? maybe we can have an endpoint that returns all of this

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I expect here a large number of unnecessary API calls (this includes multiple datasets for the same user, so we are talking more than 30 maybe?)
let's have an issue to replace this code later. in the FL PR we will have an endpoint that lists experiment datasets and their owners information so we can reuse the logic defined there to make this better.


return dsets


def _progress_to_fractions(progress, stages_df):
    """Turn one report's ``progress`` mapping into fractions keyed by stage name.

    Args:
        progress (dict): Maps a stage identifier to a percentage string
            such as ``"45.0%"``.
        stages_df (pd.DataFrame): Stages table handed to ``stage_id2name``.

    Returns:
        dict: Stage name -> fraction (percentage / 100). Stage identifiers
        that resolve to the same name are summed.
    """
    fractions = {}
    for stage_id, percent in progress.items():
        stage_name = stage_id2name(stage_id, stages_df)
        # Drop the trailing "%" and rescale; a missing value counts as 0.
        value = 0.0 if percent is None else float(percent[:-1]) / 100
        fractions[stage_name] = fractions.get(stage_name, 0.0) + value
    return fractions


def build_dset_df(dsets, user2institution, stages_df):
    """Build the table of valid datasets and their per-stage progress.

    Datasets without a report are kept: their ``execution_status`` is left
    empty and every stage column is 0. An empty ``dsets`` list yields an
    empty table instead of raising.

    Args:
        dsets (list[dict]): Dataset dictionaries. Each one must carry the
            keys ``name``, ``is_valid``, ``created_at``, ``modified_at``,
            ``report`` and ``user`` (a dict containing ``email``).
        user2institution (dict): Passed to ``get_institution_from_email``
            to resolve the institution of each dataset owner.
        stages_df (pd.DataFrame): Stages table handed to ``stage_id2name``
            to translate stage identifiers into names.

    Returns:
        pd.DataFrame: One row per dataset whose ``is_valid`` is true, with
        the columns ``name``, ``email``, ``is_valid``, ``created_at``,
        ``modified_at``, ``institution``, ``execution_status`` followed by
        one column per stage name (sorted) holding a fraction.
    """
    base_columns = [
        "name",
        "email",
        "is_valid",
        "created_at",
        "modified_at",
        "institution",
        "execution_status",
    ]

    records = []
    progress_rows = []
    for dset in dsets:
        email = dset["user"]["email"]
        record = {
            "name": dset["name"],
            "email": email,
            "is_valid": dset["is_valid"],
            "created_at": dset["created_at"],
            "modified_at": dset["modified_at"],
            "institution": get_institution_from_email(email, user2institution),
        }
        report = dset["report"]
        if report:
            # Contains a readable report
            record["execution_status"] = report["execution_status"]
            progress_rows.append(
                _progress_to_fractions(report["progress"], stages_df)
            )
        else:
            # Keep the progress rows aligned with the dataset rows.
            progress_rows.append({})
        records.append(record)

    # Pinning the columns keeps the schema stable even when no dataset has
    # a report (or when there are no datasets at all).
    dsets_df = pd.DataFrame(records, columns=base_columns)

    progress_df = pd.DataFrame(progress_rows, index=dsets_df.index)
    # Stages absent from a report count as 0; sort the stage columns so the
    # layout does not depend on the order reports were read in.
    progress_df = progress_df.fillna(0.0).sort_index(axis=1)

    full_table = dsets_df.join(progress_df)
    return full_table[full_table["is_valid"].astype(bool)]


def write_dsets_df(dsets_df, full_path):
    """Write the dataset tables as CSV files inside ``full_path``.

    Three files are written, in this order:
      * ``full_table.csv``: the whole table.
      * ``<UTC timestamp>.csv``: a copy of the whole table named after
        the current UTC time.
      * ``latest_table.csv``: the last row per institution after sorting
        by ``modified_at``, without the columns that are 0 in every row.

    Args:
        dsets_df (pd.DataFrame): Table with at least the ``modified_at``
            and ``institution`` columns.
        full_path (str): Existing directory that receives the files.
    """
    stamp = datetime.datetime.now(datetime.timezone.utc)

    # Most recently modified entry of every institution.
    by_recency = dsets_df.sort_values("modified_at")
    latest = by_recency.groupby("institution").last()
    # Keep only the columns holding at least one non-zero value.
    has_nonzero = (latest != 0).any(axis=0)
    latest = latest.loc[:, has_nonzero]

    outputs = {
        "full_table.csv": dsets_df,
        f"{stamp}.csv": dsets_df,
        "latest_table.csv": latest,
    }
    for filename, table in outputs.items():
        table.to_csv(os.path.join(full_path, filename))


def write_sites(dsets_df, institutions_df, full_path):
    """Write the list of known institutions to ``sites.txt``.

    The list is the union of the institutions expected to take part
    (``institutions_df``) and those that already registered a dataset
    (``dsets_df``). Names are de-duplicated and written one per line in
    sorted order, so the file content is stable across runs.

    Args:
        dsets_df (pd.DataFrame): Dataset table with an ``institution`` column.
        institutions_df (pd.DataFrame): Expected institutions, with an
            ``institution`` column.
        full_path (str): Existing directory that receives ``sites.txt``.
    """
    sites_path = os.path.join(full_path, "sites.txt")

    expected_sites = institutions_df["institution"].tolist()
    registered_sites = dsets_df["institution"].tolist()
    # A bare set has no stable iteration order; sort for reproducible output.
    sites = sorted(set(expected_sites) | set(registered_sites))

    with open(sites_path, "w") as f:
        f.write("\n".join(sites))


def get_data(mlcube_id, stages_path, institutions_path, out_path):
    """Fetch the datasets of a preparation MLCube and write the report files.

    Retrieves the datasets through ``get_dsets``, builds the progress table
    and writes it, together with the list of sites, into the directory
    returned by ``get_reports_path(out_path, mlcube_id)`` (created if it
    does not exist).

    Args:
        mlcube_id: ID of the data preparation MLCube.
        stages_path: Path to the stages CSV. It must contain a
            ``Status Code`` column, which is used as the index.
        institutions_path: Path to the institutions CSV with the columns
            ``institution`` and ``email``.
        out_path: Base output location handed to ``get_reports_path``.
    """
    dsets = get_dsets(mlcube_id)
    full_path = get_reports_path(out_path, mlcube_id)
    os.makedirs(full_path, exist_ok=True)

    institutions_df = pd.read_csv(institutions_path)
    if {"institution", "email"} <= set(institutions_df.columns):
        # Look the columns up by name so extra or reordered columns in the
        # CSV do not break (or silently invert) the mapping.
        user2institution = dict(
            zip(institutions_df["email"], institutions_df["institution"])
        )
    else:
        # Fall back to the positional layout: (institution, email) rows.
        user2institution = {u: i for i, u in institutions_df.values.tolist()}

    stages_df = pd.read_csv(stages_path).set_index("Status Code")

    dsets_df = build_dset_df(dsets, user2institution, stages_df)
    write_dsets_df(dsets_df, full_path)
    write_sites(dsets_df, institutions_df, full_path)
Loading
Loading