mlcommons · aristizabal95 · May 10, 2024 · Apr 15, 2024 · Apr 16, 2024 · Apr 17, 2024
@@ -1 +1 @@
-__version__ = "0.1.2"
+__version__ = "0.1.3"
@@ -309,8 +309,13 @@ def prompt_for_report_sending_approval(self):
             + " dataset subjects have reached Stage 1, and that 60% of your dataset subjects"
             + " have reached Stage 3:"
         )
+        warning = (
+            "Note that reports will be visible by the Data Preparation MLCube owner and by the"
+            " Benchmark owner to keep track of progress and provide support if needed."
+        )
         config.ui.print(msg)
         dict_pretty_print(example)
+        config.ui.print_warning(warning)
 
         msg = (
             " \nDo you approve the automatic submission of summaries similar to the one above"

@@ -151,6 +151,11 @@ def upload(self):
         submission_dict = self.dataset.todict()
         dict_pretty_print(submission_dict)
         msg = "Do you approve the registration of the presented data to MedPerf? [Y/n] "
+        warning = (
+            "Upon submission, your email address will be visible to the Data Preparation"
+            + " Owner for traceability and debugging purposes."
+        )
+        self.ui.print_warning(warning)
         self.approved = self.approved or approval_prompt(msg)
 
         if self.approved:

@@ -285,3 +285,17 @@ def update_dataset(self, dataset_id: int, data: dict):
             dataset_id (int): ID of the dataset to update
             data (dict): Updated information of the dataset.
         """
+
+    @abstractmethod
+    def get_user(self, user_id: int) -> dict:
+        """Retrieves the specified user. This will only return if
+        the current user has permission to view the requested user,
+        either by being that same user, an admin, or an owner of a data
+        preparation mlcube used by the requested user.
+
+        Args:
+            user_id (int): User UID
+
+        Returns:
+            dict: Requested user information
+        """
@@ -531,3 +531,23 @@ def get_mlcube_datasets(self, mlcube_id: int) -> dict:
 
         datasets = self.__get_list(f"{self.server_url}/mlcubes/{mlcube_id}/datasets/")
         return datasets
+
+    def get_user(self, user_id: int) -> dict:
+        """Retrieves the specified user. This will only return if
+        the current user has permission to view the requested user,
+        either by being himself, an admin or an owner of a data preparation
+        mlcube used by the requested user
+
+        Args:
+            user_id (int): User UID
+
+        Returns:
+            dict: Requested user information
+        """
+        url = f"{self.server_url}/users/{user_id}/"
+        res = self.__auth_get(url)
+        if res.status_code != 200:
+            log_response_error(res)
+            details = format_errors_dict(res.json())
+            raise CommunicationRequestError(f"Could not retrieve user: {details}")
+        return res.json()
@@ -0,0 +1,26 @@
+# Medperf Data Preparation Dashboard
+
+The medperf data preparation dashboard visualizes the usage of a data preparation mlcube and the stages data owners have reached. This should provide insight into how far along the process is, and whether users are having trouble specific to the execution of the data preparation pipeline.
+
+## Installation
+
+To install, run the following command from this folder:
+
+```
+pip install -e .
+```
+
+## How to use
+
+To use, you need to have a few assets and identifiers beforehand:
+- MLCube ID: The ID of the MLCube that is being used as a data preparation MLCube. To be able to see progress, you must be the owner of this MLCube
+- Stages File: A `CSV` file that contains the human-readable information of each of the stages that the data preparation MLCube contains. The CSV should have the following columns: `Status Code, status_name, comment, docs_url, color`
+- Institutions File: A `CSV` file that maps emails to institutions that are expected to be part of the preparation procedure. The CSV should have the following columns: `institution, email`
+
+Once all requirements are covered, you can execute the following command:
+
+```
+medperf-dashboard -m <MLCube ID> -s <Stages File> -i <Institutions File>
+```
+
+Running this command will fetch the latest reports from the medperf server and start a local server that hosts the visualization of the progress. To access this server, open `http://localhost:8050` in your preferred browser.
@@ -0,0 +1,5 @@
+from medperf.init import initialize
+
+# initialize() runs at import time of this package, before the dashboard
+# module below is imported.
+# NOTE(review): presumably the dashboard code needs the state that
+# initialize() sets up to exist when it is imported - confirm.
+initialize()
+
+# This import comes after the initialize() call instead of at the top of the
+# file; `# noqa` suppresses linter warnings for this line.
+from .preparation_dashboard import t_app # noqa
@@ -0,0 +1,97 @@
+import os
+import pandas as pd
+import datetime
+
+from medperf.entities.dataset import Dataset
+from medperf import config
+
+from .utils import get_institution_from_email, get_reports_path, stage_id2name
+
+
+def get_dsets(mlcube_id):
+    dsets = Dataset.all(filters={"mlcube": mlcube_id})
+    dsets = [dset.todict() for dset in dsets]
+    for dset in dsets:
+        user_id = dset["owner"]
+        dset["user"] = config.comms.get_user(user_id)
+
+    return dsets
+
+
+def build_dset_df(dsets, user2institution, stages_df):
+    formatted_dsets = []
+    for dset in dsets:
+        email = dset["user"]["email"]
+        institution = get_institution_from_email(email, user2institution)
+        formatted_dset = {
+            "name": dset["name"],
+            "owner": dset["owner"],
+            "email": email,
+            "is_valid": dset["is_valid"],
+            "created_at": dset["created_at"],
+            "modified_at": dset["modified_at"],
+            "institution": institution,
+        }
+        if len(dset["report"]):
+            # Contains a readable report
+            report = dset["report"]
+            exec_status = report["execution_status"]
+            formatted_dset["execution_status"] = exec_status
+            formatted_dset["progress"] = report["progress"]
+
+        formatted_dsets.append(formatted_dset)
+        dsets_df = pd.DataFrame(formatted_dsets)
+
+        progress = dsets_df["progress"].fillna({})
+        progress_df = pd.DataFrame(progress.values.tolist())
+        progress_df.rename(columns=lambda x: stage_id2name(x, stages_df), inplace=True)
+        progress_df = progress_df.fillna("0.0%").map(lambda x: float(x[:-1]) / 100)
+        progress_df = progress_df.groupby(level=0, axis=1).sum()
+
+        full_table = dsets_df.join(progress_df)
+        full_table = full_table[full_table["is_valid"]]
+        full_table.drop(columns=["owner", "progress"], inplace=True)
+
+    return full_table
+
+
+def write_dsets_df(dsets_df, full_path):
+    timenow = datetime.datetime.now(datetime.timezone.utc)
+
+    full_table = dsets_df
+    latest_table = full_table.sort_values("modified_at").groupby("institution").last()
+    latest_table = latest_table.loc[:, (latest_table != 0).any(axis=0)]
+
+    full_table_path = os.path.join(full_path, "full_table.csv")
+    latest_table_path = os.path.join(full_path, "latest_table.csv")
+    timestamp_path = os.path.join(full_path, f"{timenow}.csv")
+
+    full_table.to_csv(full_table_path)
+    full_table.to_csv(timestamp_path)
+    latest_table.to_csv(latest_table_path)
+
+
+def write_sites(dsets_df, institutions_df, full_path):
+    sites_path = os.path.join(full_path, "sites.txt")
+
+    expected_sites = institutions_df["institution"].values.tolist()
+    registered_sites = dsets_df["institution"].values.tolist()
+    sites = list(set(expected_sites + registered_sites))
+
+    with open(sites_path, "w") as f:
+        f.write("\n".join(sites))
+
+
+def get_data(mlcube_id, stages_path, institutions_path, out_path):
+    dsets = get_dsets(mlcube_id)
+    full_path = get_reports_path(out_path, mlcube_id)
+    os.makedirs(full_path, exist_ok=True)
+
+    institutions_df = pd.read_csv(institutions_path)
+    user2institution = {u: i for i, u in institutions_df.values.tolist()}
+    stages_df = pd.read_csv(stages_path)
+    stages_df.set_index("Status Code", inplace=True)
+
+    dsets_df = build_dset_df(dsets, user2institution, stages_df)
+    write_dsets_df(dsets_df, full_path)
+    write_sites(dsets_df, institutions_df, full_path)