NCI-CGR · jaamarks · Sep 17, 2024 · Sep 17, 2024 · Sep 18, 2024 · Sep 18, 2024
diff --git a/src/cgr_gwas_qc/reporting/QC_Report_Data_Dictionary.xlsx b/src/cgr_gwas_qc/reporting/QC_Report_Data_Dictionary.xlsx
diff --git a/src/cgr_gwas_qc/reporting/__init__.py b/src/cgr_gwas_qc/reporting/__init__.py
@@ -3,7 +3,13 @@
 These tools include report templates and various constants used during
 reporting.
 """
-from .constants import CASE_CONTROL_COLORS, CASE_CONTROL_DTYPE, REPORT_NAME_MAPPER, SEX_DTYPE
+from .constants import (
+    CASE_CONTROL_COLORS,
+    CASE_CONTROL_DTYPE,
+    REPORT_NAME_MAPPER,
+    SEX_DTYPE,
+    UNEXPECTED_REPLICATE_STATUS_DTYPE,
+)
 from .qc_exclusions import ExclusionTables
 from .sample_qc import SampleQC
 from .subject_qc import SubjectQC
@@ -18,4 +24,5 @@
     "SampleQC",
     "SEX_DTYPE",
     "SubjectQC",
+    "UNEXPECTED_REPLICATE_STATUS_DTYPE",
 ]
diff --git a/src/cgr_gwas_qc/reporting/constants.py b/src/cgr_gwas_qc/reporting/constants.py
@@ -13,6 +13,8 @@
 
 SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])
 
+UNEXPECTED_REPLICATE_STATUS_DTYPE = pd.CategoricalDtype(categories=[0, 1, 2, 3])
+
 # Mapping current column names to names from the legacy workflow to maintain
 # consistency in deliverables.
 REPORT_NAME_MAPPER = {

diff --git a/src/cgr_gwas_qc/workflow/scripts/subject_qc_table.py b/src/cgr_gwas_qc/workflow/scripts/subject_qc_table.py
@@ -31,7 +31,7 @@
 import pandas as pd
 import typer
 
-from cgr_gwas_qc.reporting import CASE_CONTROL_DTYPE, SEX_DTYPE
+from cgr_gwas_qc.reporting import CASE_CONTROL_DTYPE, SEX_DTYPE, UNEXPECTED_REPLICATE_STATUS_DTYPE
 from cgr_gwas_qc.typing import PathLike
 from cgr_gwas_qc.workflow.scripts import sample_concordance, sample_qc_table
 
@@ -43,6 +43,7 @@
     "case_control": CASE_CONTROL_DTYPE,
     "is_unexpected_replicate": "boolean",
     "unexpected_replicate_ids": "string",
+    "unexpected_replicate_status": UNEXPECTED_REPLICATE_STATUS_DTYPE,
     "expected_sex": SEX_DTYPE,
     "predicted_sex": SEX_DTYPE,
     "X_inbreeding_coefficient": "float",
@@ -64,6 +65,7 @@ def read(filename: PathLike) -> pd.DataFrame:
         - case_control
         - is_unexpected_replicate
         - unexpected_replicate_ids
+        - unexpected_replicate_status
         - expected_sex
         - predicted_sex
         - X_inbreeding_coefficient
@@ -85,6 +87,7 @@ def main(
         .pipe(_fix_hyphen_in_ancestry_name)
         .pipe(_sample_qc_to_subject_qc)
         .pipe(_add_unexpected_replicate_ids, sample_concordance_csv)
+        .pipe(_add_unexpected_replicate_status)
         .reindex(DTYPES.keys(), axis=1)
         .to_csv(outfile, index=False)
     )
@@ -97,9 +100,10 @@ def _fix_hyphen_in_ancestry_name(df: pd.DataFrame) -> pd.DataFrame:
 
 
 def _sample_qc_to_subject_qc(df: pd.DataFrame) -> pd.DataFrame:
+    include_contam = list(DTYPES.keys()) + ["is_contaminated"]  # needed for replicate status
     return (
         df.query("is_subject_representative")
-        .reindex(DTYPES.keys(), axis=1)
+        .reindex(include_contam, axis=1)
         .dropna(how="all", axis=1)
     )
 
@@ -139,6 +143,64 @@ def _add_unexpected_replicate_ids(df: pd.DataFrame, sample_concordance_csv: Path
     )
 
 
+def _add_unexpected_replicate_status(df: pd.DataFrame) -> pd.DataFrame:
+    """Add the "unexpected_replicate_status" column to the subject table.
+
+    Values for "unexpected_replicate_status":
+
+    0: Not an unexpected replicate
+    1: Retained unexpected replicate (only the other sample is contaminated)
+    2: Not retained unexpected replicate (this sample is contaminated)
+    3: Not retained unexpected replicate (neither sample is contaminated)
+
+    Subjects with status 1 also get "is_unexpected_replicate" reset to False.
+    A subject is only retained when every other subject listed in its
+    "unexpected_replicate_ids" is present in ``df`` and contaminated. Missing
+    flags are treated as False. ``df`` is modified in place and returned.
+
+    See the following GitHub comments for more information:
+    - https://github.com/NCI-CGR/GwasQcPipeline/issues/269#issuecomment-2273906441
+    - https://github.com/NCI-CGR/GwasQcPipeline/issues/269#issuecomment-2278345857
+    """
+    subjects = df["Group_By_Subject_ID"]
+    # Go through the nullable dtype so missing values become False instead of
+    # raising when they are evaluated as a condition.
+    flagged = df["is_unexpected_replicate"].astype("boolean").fillna(False).astype(bool)
+    contaminated = df["is_contaminated"].astype("boolean").fillna(False).astype(bool)
+    # Look contamination up by subject ID: row labels need not be positional
+    # (the table is filtered upstream), so they must not be passed to ``iloc``.
+    contaminated_by_subject = dict(zip(subjects, contaminated))
+
+    statuses = []
+    for subject, is_replicate, is_contaminated, ids in zip(
+        subjects, flagged, contaminated, df["unexpected_replicate_ids"]
+    ):
+        if not is_replicate:
+            statuses.append(0)
+        elif is_contaminated:
+            statuses.append(2)
+        else:
+            # Every other member of the replicate group (there may be more
+            # than one); a missing ID list yields no members.
+            members = ids.split("|") if isinstance(ids, str) else []
+            others = [member for member in members if member and member != subject]
+            only_others_contaminated = bool(others) and all(
+                contaminated_by_subject.get(other, False) for other in others
+            )
+            statuses.append(1 if only_others_contaminated else 3)
+
+    # Retained subjects are no longer reported as unexpected replicates.
+    retained = [status == 1 for status in statuses]
+    if any(retained):
+        df.loc[retained, "is_unexpected_replicate"] = False
+
+    # Assign by position so the result does not depend on the row labels.
+    df["unexpected_replicate_status"] = pd.Categorical(
+        statuses, dtype=UNEXPECTED_REPLICATE_STATUS_DTYPE
+    )
+    return df
+
+
 def _connected_ids(ids: Iterable[Tuple[str, str]]) -> pd.Series:
     """Create groups of connected IDs.
 

diff --git a/tests/workflow/scripts/test_subject_qc_table.py b/tests/workflow/scripts/test_subject_qc_table.py
@@ -1,3 +1,4 @@
+import pandas as pd
 import pytest
 
 from cgr_gwas_qc.workflow.scripts import subject_qc_table
@@ -11,3 +12,47 @@ def test_subject_qc_table(real_data_cache, sample_qc_df):
     )
     assert 0 == df.is_unexpected_replicate.sum()
     assert 3 == df.is_sex_discordant.sum()
+
+
+@pytest.fixture
+def fake_sample_qc() -> pd.DataFrame:
+    """Six subjects: two unexpected-replicate pairs and two non-replicates.
+
+    In the first pair only SB00001 is contaminated; the second pair is clean.
+    """
+    first_pair, second_pair = "SB00001|SB00002", "SB00005|SB00006"
+    header = ["Sample_ID", "Group_By_Subject_ID", "is_contaminated"]
+    header += ["is_unexpected_replicate", "unexpected_replicate_ids"]
+    records = [
+        ("SP00001", "SB00001", True, True, first_pair),
+        ("SP00002", "SB00002", False, True, first_pair),
+        ("SP00003", "SB00003", False, False, ""),
+        ("SP00004", "SB00004", True, False, ""),
+        ("SP00005", "SB00005", False, True, second_pair),
+        ("SP00006", "SB00006", False, True, second_pair),
+    ]
+    return pd.DataFrame(records, columns=header)
+
+
+# Verify the "unexpected_replicate_status" column and the updated "is_unexpected_replicate" flag.
+@pytest.mark.parametrize(
+    "subject_id, predicted_status, predicted_unexp_rep",
+    [
+        ("SB00001", 2, True),
+        ("SB00002", 1, False),
+        ("SB00003", 0, False),
+        ("SB00004", 0, False),
+        ("SB00005", 3, True),
+        ("SB00006", 3, True),
+    ],
+)
+def test_add_unexpected_replicate_status(
+    fake_sample_qc, subject_id, predicted_status, predicted_unexp_rep
+):
+    """Each subject gets the expected status code and replicate flag."""
+    annotated = subject_qc_table._add_unexpected_replicate_status(fake_sample_qc).copy()
+    by_subject = annotated.set_index("Group_By_Subject_ID")
+
+    status_ok = by_subject.loc[subject_id, "unexpected_replicate_status"] == predicted_status
+    flag_ok = by_subject.loc[subject_id, "is_unexpected_replicate"] == predicted_unexp_rep
+    assert status_ok and flag_ok