Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Improve handling of unexpected replicates (issue 269) #322

Open
wants to merge 4 commits into
base: default
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified src/cgr_gwas_qc/reporting/QC_Report_Data_Dictionary.xlsx
Binary file not shown.
9 changes: 8 additions & 1 deletion src/cgr_gwas_qc/reporting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,13 @@
These tools include report templates and various constants used during
reporting.
"""
from .constants import CASE_CONTROL_COLORS, CASE_CONTROL_DTYPE, REPORT_NAME_MAPPER, SEX_DTYPE
from .constants import (
CASE_CONTROL_COLORS,
CASE_CONTROL_DTYPE,
REPORT_NAME_MAPPER,
SEX_DTYPE,
UNEXPECTED_REPLICATE_STATUS_DTYPE,
)
from .qc_exclusions import ExclusionTables
from .sample_qc import SampleQC
from .subject_qc import SubjectQC
Expand All @@ -18,4 +24,5 @@
"SampleQC",
"SEX_DTYPE",
"SubjectQC",
"UNEXPECTED_REPLICATE_STATUS_DTYPE",
]
2 changes: 2 additions & 0 deletions src/cgr_gwas_qc/reporting/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

SEX_DTYPE = pd.CategoricalDtype(categories=["F", "M", "U"])

# Categorical codes for the subject-level "unexpected_replicate_status" column:
#   0: Not an unexpected replicate
#   1: Retained unexpected replicate (only the other sample is contaminated)
#   2: Not retained unexpected replicate (this sample is contaminated)
#   3: Not retained unexpected replicate (neither sample is contaminated)
UNEXPECTED_REPLICATE_STATUS_DTYPE = pd.CategoricalDtype(categories=[0, 1, 2, 3])

# Mapping current column names to names from the legacy workflow to maintain
# consistency in deliverables.
REPORT_NAME_MAPPER = {
Expand Down
66 changes: 64 additions & 2 deletions src/cgr_gwas_qc/workflow/scripts/subject_qc_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
import pandas as pd
import typer

from cgr_gwas_qc.reporting import CASE_CONTROL_DTYPE, SEX_DTYPE
from cgr_gwas_qc.reporting import CASE_CONTROL_DTYPE, SEX_DTYPE, UNEXPECTED_REPLICATE_STATUS_DTYPE
from cgr_gwas_qc.typing import PathLike
from cgr_gwas_qc.workflow.scripts import sample_concordance, sample_qc_table

Expand All @@ -43,6 +43,7 @@
"case_control": CASE_CONTROL_DTYPE,
"is_unexpected_replicate": "boolean",
"unexpected_replicate_ids": "string",
"unexpected_replicate_status": UNEXPECTED_REPLICATE_STATUS_DTYPE,
"expected_sex": SEX_DTYPE,
"predicted_sex": SEX_DTYPE,
"X_inbreeding_coefficient": "float",
Expand All @@ -64,6 +65,7 @@ def read(filename: PathLike) -> pd.DataFrame:
- case_control
- is_unexpected_replicate
- unexpected_replicate_ids
- unexpected_replicate_status
- expected_sex
- predicted_sex
- X_inbreeding_coefficient
Expand All @@ -85,6 +87,7 @@ def main(
.pipe(_fix_hyphen_in_ancestry_name)
.pipe(_sample_qc_to_subject_qc)
.pipe(_add_unexpected_replicate_ids, sample_concordance_csv)
.pipe(_add_unexpected_replicate_status)
.reindex(DTYPES.keys(), axis=1)
.to_csv(outfile, index=False)
)
Expand All @@ -97,9 +100,10 @@ def _fix_hyphen_in_ancestry_name(df: pd.DataFrame) -> pd.DataFrame:


def _sample_qc_to_subject_qc(df: pd.DataFrame) -> pd.DataFrame:
    """Select the subject-representative rows and the subject-level columns.

    Keeps only rows where ``is_subject_representative`` is truthy, restricts
    the columns to the keys of ``DTYPES`` plus ``is_contaminated``, and then
    drops every column that is entirely missing.

    Args:
        df: Sample-level QC table. Must contain ``is_subject_representative``.

    Returns:
        A new data frame; ``df`` itself is not modified.
    """
    # ``is_contaminated`` is not a key of DTYPES, so it has to be requested
    # explicitly. Reindexing to DTYPES alone first would discard the real
    # values, and a later reindex could only bring the column back as all-NaN,
    # which the ``dropna`` below would then remove again.
    wanted_columns = [*DTYPES, "is_contaminated"]
    return (
        df.query("is_subject_representative")
        .reindex(wanted_columns, axis=1)
        .dropna(how="all", axis=1)
    )

Expand Down Expand Up @@ -139,6 +143,64 @@ def _add_unexpected_replicate_ids(df: pd.DataFrame, sample_concordance_csv: Path
)


def _add_unexpected_replicate_status(df: pd.DataFrame) -> pd.DataFrame:
    """Add the "unexpected_replicate_status" column and un-flag retained subjects.

    Values for "unexpected_replicate_status":

        0: Not an unexpected replicate
        1: Retained unexpected replicate (only the other sample is contaminated)
        2: Not retained unexpected replicate (this sample is contaminated)
        3: Not retained unexpected replicate (neither sample is contaminated)

    Subjects that receive status 1 also have ``is_unexpected_replicate`` set
    to ``False``. A subject is only retained when it has at least one partner
    and *every* partner listed in ``unexpected_replicate_ids`` is present in
    ``df`` and contaminated; a partner that cannot be found never causes a
    subject to be retained.

    See the following GitHub comments for more information:
        - https://github.com/NCI-CGR/GwasQcPipeline/issues/269#issuecomment-2273906441
        - https://github.com/NCI-CGR/GwasQcPipeline/issues/269#issuecomment-2278345857

    Args:
        df: Subject-level table with ``Group_By_Subject_ID``,
            ``is_unexpected_replicate`` and ``unexpected_replicate_ids``
            (subject IDs joined by ``|``). ``is_contaminated`` is optional.

    Returns:
        The same ``df`` object, modified in place.
    """
    subject_ids = [str(subject) for subject in df["Group_By_Subject_ID"]]

    # Go through the nullable "boolean" dtype so that missing flags become
    # False instead of raising when evaluated in a boolean context.
    flagged = df["is_unexpected_replicate"].astype("boolean").fillna(False).astype(bool)

    # The column may be absent (an all-missing column can be dropped upstream)
    # or contain missing calls.
    # NOTE(review): missing contamination calls are treated as "not
    # contaminated" here -- confirm this matches the intended policy.
    if "is_contaminated" in df.columns:
        contaminated = df["is_contaminated"].astype("boolean").fillna(False).astype(bool)
    else:
        contaminated = pd.Series(False, index=df.index, dtype=bool)

    if "unexpected_replicate_ids" in df.columns:
        replicate_ids = df["unexpected_replicate_ids"]
    else:
        replicate_ids = [pd.NA] * len(df)

    # Look partners up by subject ID rather than by position: the frame's
    # index is not guaranteed to be 0..n-1 after upstream filtering. If an ID
    # occurs more than once it only counts as contaminated when all of its
    # rows are.
    contaminated_by_subject = {}
    for subject, is_contaminated in zip(subject_ids, contaminated):
        contaminated_by_subject[subject] = (
            contaminated_by_subject.get(subject, True) and is_contaminated
        )

    statuses = []
    for subject, is_flagged, ids, is_contaminated in zip(
        subject_ids, flagged, replicate_ids, contaminated
    ):
        if not is_flagged:
            statuses.append(0)
            continue

        if is_contaminated:
            statuses.append(2)
            continue

        # A replicate group may list more than two subjects, so keep the
        # partners as separate IDs.
        if pd.isna(ids):
            partners = []
        else:
            partners = [
                partner for partner in str(ids).split("|") if partner and partner != subject
            ]

        all_partners_contaminated = bool(partners) and all(
            contaminated_by_subject.get(partner, False) for partner in partners
        )
        statuses.append(1 if all_partners_contaminated else 3)

    # Assign positionally (as an array) so the result does not depend on the
    # index being unique.
    df["unexpected_replicate_status"] = pd.Categorical(
        statuses, dtype=UNEXPECTED_REPLICATE_STATUS_DTYPE
    )

    retained = [status == 1 for status in statuses]
    if any(retained):
        df.loc[retained, "is_unexpected_replicate"] = False

    return df


def _connected_ids(ids: Iterable[Tuple[str, str]]) -> pd.Series:
"""Create groups of connected IDs.

Expand Down
45 changes: 45 additions & 0 deletions tests/workflow/scripts/test_subject_qc_table.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
import pytest

from cgr_gwas_qc.workflow.scripts import subject_qc_table
Expand All @@ -11,3 +12,47 @@ def test_subject_qc_table(real_data_cache, sample_qc_df):
)
assert 0 == df.is_unexpected_replicate.sum()
assert 3 == df.is_sex_discordant.sum()


@pytest.fixture
def fake_sample_qc() -> pd.DataFrame:
    """Six-subject table covering each unexpected-replicate scenario.

    SB00001/SB00002 form a pair in which only SB00001 is contaminated,
    SB00003 and SB00004 are not unexpected replicates, and SB00005/SB00006
    form a pair in which neither subject is contaminated.
    """
    contaminated_pair = "SB00001|SB00002"
    clean_pair = "SB00005|SB00006"
    return pd.DataFrame(
        {
            "Sample_ID": ["SP00001", "SP00002", "SP00003", "SP00004", "SP00005", "SP00006"],
            "Group_By_Subject_ID": [
                "SB00001",
                "SB00002",
                "SB00003",
                "SB00004",
                "SB00005",
                "SB00006",
            ],
            "is_contaminated": [True, False, False, True, False, False],
            "is_unexpected_replicate": [True, True, False, False, True, True],
            "unexpected_replicate_ids": [
                contaminated_pair,
                contaminated_pair,
                "",
                "",
                clean_pair,
                clean_pair,
            ],
        }
    )


# Check the "unexpected_replicate_status" column and the adjusted
# "is_unexpected_replicate" flag for every subject in the fake table.
@pytest.mark.parametrize(
    "subject_id, predicted_status, predicted_unexp_rep",
    [
        ("SB00001", 2, True),
        ("SB00002", 1, False),
        ("SB00003", 0, False),
        ("SB00004", 0, False),
        ("SB00005", 3, True),
        ("SB00006", 3, True),
    ],
)
def test_add_unexpected_replicate_status(
    fake_sample_qc, subject_id, predicted_status, predicted_unexp_rep
):
    """Each subject gets the expected status code and replicate flag."""
    result = subject_qc_table._add_unexpected_replicate_status(fake_sample_qc).copy()
    by_subject = result.set_index("Group_By_Subject_ID")

    observed_status = by_subject.loc[subject_id, "unexpected_replicate_status"]
    observed_flag = by_subject.loc[subject_id, "is_unexpected_replicate"]

    # Separate assertions so a failure shows which of the two values is off.
    assert observed_status == predicted_status
    assert observed_flag == predicted_unexp_rep
Loading