Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Moved VariantEffects so it's using the Enum #78

Merged
merged 15 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@ We can then view the data using the list commands.
[('NP_09876.5', 26)]
>>> tx_dict = cohort.list_data_by_tx('NM_1234.5')
>>> sorted(tx_dict['NM_1234.5'].items())
[('frameshift_variant', 1), ('missense_variant', 1)]
[('FRAMESHIFT_VARIANT', 1), ('MISSENSE_VARIANT', 1)]

Using the counts, we can choose and run what analyses we want.
For instance, we can partition the patients into two groups based on presence/absence of a *frameshift* variant:

.. doctest:: tutorial

>>> from genophenocorr.analysis import CohortAnalysis
>>> from genophenocorr.constants import VariantEffect
>>> from genophenocorr.model import VariantEffect
>>> cohort_analysis = CohortAnalysis(cohort, 'NM_1234.5', hpo, include_unmeasured=False)
>>> frameshift = cohort_analysis.compare_by_variant_type(VariantEffect.FRAMESHIFT_VARIANT)
>>> frameshift # doctest: +NORMALIZE_WHITESPACE
Expand Down
2,711 changes: 1,427 additions & 1,284 deletions notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions src/genophenocorr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,4 @@
from . import preprocessing
from . import view

from .constants import VariantEffect

__version__ = "0.1.1dev"
3 changes: 1 addition & 2 deletions src/genophenocorr/analysis/_analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@
from pandas import DataFrame, MultiIndex
from collections import Counter, namedtuple

from genophenocorr.constants import VariantEffect
from genophenocorr.model import Cohort, FeatureType
from genophenocorr.model import Cohort, FeatureType, VariantEffect

from .predicate import VariantEffectPredicate, HPOPresentPredicate, VariantPredicate, ExonPredicate, ProtFeatureTypePredicate, ProtFeaturePredicate
from .predicate import HOMOZYGOUS, HETEROZYGOUS, NO_VARIANT
Expand Down
5 changes: 2 additions & 3 deletions src/genophenocorr/analysis/predicate/_all_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

import hpotk

from genophenocorr.constants import VariantEffect
from genophenocorr.model import Patient, FeatureType, Genotype
from genophenocorr.model import Patient, FeatureType, Genotype, VariantEffect
from ._api import PolyPredicate, PatientCategory


Expand Down Expand Up @@ -105,7 +104,7 @@ def test(self, patient: Patient, query: VariantEffect) -> typing.Optional[Patien
for trans in var.tx_annotations:
if trans.transcript_id == self._transcript:
for var_eff in trans.variant_effects:
if var_eff == str(query):
if var_eff == query:
vars.add(var)
if len(vars) == 1:
for v in vars:
Expand Down
1 change: 0 additions & 1 deletion src/genophenocorr/constants/__init__.py

This file was deleted.

8 changes: 4 additions & 4 deletions src/genophenocorr/data/_toy.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def get_toy_cohort() -> Cohort:
prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])

snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene',
'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], [1],
'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1],
[prot], 60, 60,
Genotypes.from_mapping({
'A': Genotype.HETEROZYGOUS, 'B': Genotype.HETEROZYGOUS,
Expand All @@ -54,7 +54,7 @@ def get_toy_cohort() -> Cohort:
)
deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del',
False, ['frameshift_variant'],
False, [VariantEffect.FRAMESHIFT_VARIANT],
[2], [prot], 86, 87,
Genotypes.from_mapping({
'D': Genotype.HETEROZYGOUS, 'F': Genotype.HETEROZYGOUS,
Expand All @@ -69,11 +69,11 @@ def get_toy_cohort() -> Cohort:
)
het_dup = Variant.create_variant_from_scratch(
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5',
'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
Genotypes.empty()) # Not used in the patients below, hence `empty()`.
hom_dup = Variant.create_variant_from_scratch(
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5',
'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
Genotypes.empty()) # Not used in the patients below, hence `empty()`.

patients = (
Expand Down
3 changes: 2 additions & 1 deletion src/genophenocorr/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@
from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
from ._tx import TranscriptCoordinates
from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant
from ._variant_effects import VariantEffect

__all__ = [
'Cohort', 'Patient',
'Phenotype',
'Variant', 'VariantCoordinates', 'Genotype', 'Genotypes', 'Genotyped',
'TranscriptAnnotation', 'TranscriptInfoAware', 'TranscriptCoordinates',
'TranscriptAnnotation', 'VariantEffect', 'TranscriptInfoAware', 'TranscriptCoordinates',
'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
]
2 changes: 1 addition & 1 deletion src/genophenocorr/model/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def list_data_by_tx(self, transcript=None):
for var in self.all_variants:
for trans in var.tx_annotations:
if trans.transcript_id in var_type_dict:
var_type_dict.get(trans.transcript_id).update(trans.variant_effects)
var_type_dict.get(trans.transcript_id).update([var_eff.name for var_eff in trans.variant_effects])
too_small = []
for tx_id, var_effect_counter in var_type_dict.items():
if len(var_effect_counter) <= 1:
Expand Down
9 changes: 5 additions & 4 deletions src/genophenocorr/model/_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .genome import GenomicRegion
from ._gt import Genotyped, Genotypes
from ._protein import ProteinMetadata
from ._variant_effects import VariantEffect


class TranscriptInfoAware(metaclass=abc.ABCMeta):
Expand Down Expand Up @@ -52,7 +53,7 @@ def __init__(self, gene_id: str,
tx_id: str,
hgvsc: typing.Optional[str],
is_preferred: bool,
variant_effects,
variant_effects: typing.Iterable[VariantEffect],
affected_exons: typing.Optional[typing.Sequence[int]],
affected_protein: typing.Sequence[ProteinMetadata],
protein_effect_start: typing.Optional[int],
Expand All @@ -63,7 +64,7 @@ def __init__(self, gene_id: str,
gene_id (string): The gene symbol associated with the transcript
tx_id (string): The transcript ID
hgvsc (string, Optional): The HGVS "coding-DNA" ID if available, else None
variant_effects (Sequence[string]): A sequence of predicted effects given by VEP
variant_effects (Iterable[string]): An iterable of predicted effects given by functional annotator
affected_exons (Sequence[integer], Optional): A sequence of exons affected by the variant. Returns None if none are affected.
affected_protein (Sequence[ProteinMetadata]): A ProteinMetadata object representing the protein affected by this transcript
protein_effect_start (integer, Optional): The start coordinate of the effect on the protein sequence.
Expand Down Expand Up @@ -115,7 +116,7 @@ def hgvsc_id(self) -> typing.Optional[str]:
return self._hgvsc_id

@property
def variant_effects(self) -> typing.Sequence[str]:
def variant_effects(self) -> typing.Sequence[VariantEffect]:
"""
Returns:
Sequence[string]: A sequence of variant effects.
Expand Down Expand Up @@ -377,7 +378,7 @@ def create_variant_from_scratch(variant_coordinates: VariantCoordinates,
trans_id: str,
hgvsc_id: str,
is_preferred: bool,
consequences: typing.Sequence[str],
consequences: typing.Iterable[VariantEffect],
exons_effected: typing.Sequence[int],
protein: typing.Sequence[ProteinMetadata],
protein_effect_start: int,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,26 @@
from enum import Enum


class VariantEffect(Enum):
"""
`VariantEffect` represents consequences of a variant on transcript that are supported by Genophenocorr.

.. doctest::

>>> from genophenocorr.model import VariantEffect
>>> missense = VariantEffect.MISSENSE_VARIANT
>>> print(missense)
missense_variant

The `VariantEffect` has a :prop:`curie` attribute that represents the ontology class from
`Sequence Ontology <http://www.sequenceontology.org/>`_.

.. doctest::

>>> missense.curie
'SO:0001583'
"""

TRANSCRIPT_ABLATION = "SO:0001893"
SPLICE_ACCEPTOR_VARIANT = "SO:0001574"
SPLICE_DONOR_VARIANT = "SO:0001575"
Expand Down Expand Up @@ -42,5 +62,15 @@ class VariantEffect(Enum):
INTERGENIC_VARIANT = "SO:0001628"
SEQUENCE_VARIANT = "SO:0001060"

def __init__(self, curie: str):
self._curie = curie

@property
def curie(self) -> str:
"""
Get a compact URI (CURIE) of the variant effect (e.g. `SO:0001583` for a missense variant).
"""
return self._curie

def __str__(self) -> str:
return self.name.lower()
return self.name.lower()
11 changes: 2 additions & 9 deletions src/genophenocorr/preprocessing/_phenopacket.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,8 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC
alt = '<DUP>'
else:
alt = '<DEL>'
chrom = re.findall(r'NC_0000(\d{2})\.\d*',
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id)[0]
if chrom.startswith('0'):
chrom = str(int(chrom))
elif chrom == '23':
chrom = 'X'
elif chrom == '24':
chrom = 'Y'
contig = self._build.contig_by_name(chrom)
refseq_contig_name = variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1]
contig = self._build.contig_by_name(refseq_contig_name)
elif len(variant_descriptor.vcf_record.chrom) != 0 and len(
variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0:
ref = variant_descriptor.vcf_record.ref
Expand Down
6 changes: 3 additions & 3 deletions src/genophenocorr/preprocessing/_test_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from genophenocorr.model import VariantCoordinates
from genophenocorr.model import VariantCoordinates, VariantEffect
from genophenocorr.model.genome import GenomicRegion, Strand, GRCh38

from ._vep import verify_start_end_coordinates, VepFunctionalAnnotator
Expand Down Expand Up @@ -79,7 +79,7 @@ def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.transcript_id == 'NM_013275.6'
assert preferred.is_preferred is True
assert preferred.hgvsc_id == 'NM_013275.6:c.7407C>G'
assert preferred.variant_effects == ('stop_gained',)
assert preferred.variant_effects == (VariantEffect.STOP_GAINED,)
assert preferred.overlapping_exons == (9,)

def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator):
Expand All @@ -99,7 +99,7 @@ def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator)
assert preferred.transcript_id == 'NM_013275.6'
assert preferred.is_preferred is True
assert preferred.hgvsc_id == 'NM_013275.6:c.2408_2412del'
assert preferred.variant_effects == ('frameshift_variant',)
assert preferred.variant_effects == (VariantEffect.FRAMESHIFT_VARIANT,)
assert preferred.overlapping_exons == (9,)

def _load_response_json(self, test_name: str):
Expand Down
21 changes: 19 additions & 2 deletions src/genophenocorr/preprocessing/_vep.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import requests

from genophenocorr.model import VariantCoordinates, TranscriptAnnotation
from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, VariantEffect
from ._api import FunctionalAnnotator, ProteinMetadataService


Expand Down Expand Up @@ -80,6 +80,18 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[T

return annotations

def _parse_variant_effect(self, effect: str) -> typing.Optional[VariantEffect]:
if effect.upper() == "5_PRIME_UTR_VARIANT":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have two minor suggestions:

  • please refactor such that you do the conversion to uppercase at most once per method body instead of 3 times in the worst case.
  • logging, as currently written, will construct the string unconditionally, regardless of the set log level threshold (e.g. even if the user sets the threshold to see ERRORs only). This is because you're submitting the f-string which should be avoided when logging a statement. Instead, we should do something like this:
    self._logging.warning('Variant effect %s was not found ...', effect)
    Using this form, the logging framework will only create the final string if the logging threshold is low enough (WARN or lower in this case).

effect = "FIVE_PRIME_UTR_VARIANT"
elif effect.upper() == "3_PRIME_UTR_VARIANT":
effect = 'THREE_PRIME_UTR_VARIANT'
try:
var_effect = VariantEffect[effect.upper()]
except KeyError:
self._logging.warning(f"VariantEffect {effect} was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub.")
return None
return var_effect

def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
"""
Parse one transcript annotation from the JSON response.
Expand All @@ -90,7 +102,12 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
return None
is_preferred = True if 'canonical' in item and item['canonical'] else False
hgvsc_id = item.get('hgvsc')
var_effects = []
consequences = item.get('consequence_terms')
for con in consequences:
var_effect = self._parse_variant_effect(con)
if var_effect is not None:
var_effects.append(var_effect)
gene_name = item.get('gene_symbol')
protein_id = item.get('protein_id')
protein = self._protein_annotator.annotate(protein_id)
Expand All @@ -112,7 +129,7 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
trans_id,
hgvsc_id,
is_preferred,
consequences,
var_effects,
exons_effected,
protein,
protein_effect_start,
Expand Down
16 changes: 8 additions & 8 deletions tests/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,44 +36,44 @@ def toy_cohort() -> Cohort:

dup = Variant(VariantCoordinates(make_region("16", 89279849, 89279850), ref='G', alt='GC', change_length=1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, ['frameshift_variant'], [9],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
[prot], 2231, 2231)
],
Genotypes.from_mapping({'HetSingleVar': Genotype.HETEROZYGOUS}))
indel = Variant(VariantCoordinates(make_region("16", 89284600, 89284602), ref='GG', alt='A', change_length=-1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, ['frameshift_variant'],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, [VariantEffect.FRAMESHIFT_VARIANT],
[9], [prot], 647, 647)
],
Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
snv_stop_gain = Variant(VariantCoordinates(make_region("16", 89280751, 89280752), ref='G', alt='T', change_length=0),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, ['stop_gained'], [9], [prot],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, [VariantEffect.STOP_GAINED], [9], [prot],
1930, 1930)],
Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
snv_missense = Variant(VariantCoordinates(make_region("16", 89275127, 89275128), ref='G', alt='A', change_length=0),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, ['missense_variant'], [10],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, [VariantEffect.MISSENSE_VARIANT], [10],
[prot], 2512, 2512)
],
Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
del_frameshift = Variant(VariantCoordinates(make_region("16", 89279707, 89279725), ref='AGTGTTCGGGGCGGGGCC', alt='A', change_length=-17),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, ['frameshift_variant'],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, [VariantEffect.FRAMESHIFT_VARIANT],
[9], [prot], 2273, 2278)
],
Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
del_small = Variant(VariantCoordinates(make_region("16", 89279457, 89279459), ref='TG', alt='T', change_length=-1),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, ['frameshift_variant'], [9],
TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
[prot], 2361, 2362)
],
Genotypes.from_mapping({'HomoVar': Genotype.HOMOZYGOUS_ALTERNATE}))
del_large = Variant(VariantCoordinates(make_region("16", 89_190_070, 89_439_815), ref='N', alt='<DEL>', change_length=-249_745),
[
TranscriptAnnotation('ANKRD11', 'NM_013275.6', None, False,
['stop_lost', 'feature_truncation', 'coding_sequence_variant', '5_prime_UTR_variant',
'3_prime_UTR_variant', 'intron_variant'], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
[VariantEffect.STOP_LOST, VariantEffect.FEATURE_TRUNCATION, VariantEffect.CODING_SEQUENCE_VARIANT, VariantEffect.FIVE_PRIME_UTR_VARIANT,
VariantEffect.THREE_PRIME_UTR_VARIANT, VariantEffect.INTRON_VARIANT], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
[prot], None, None)
],
Genotypes.from_mapping({'LargeCNV': Genotype.HETEROZYGOUS}))
Expand Down
3 changes: 1 addition & 2 deletions tests/genome_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@

import pytest

from genophenocorr import VariantEffect
from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes
from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes, VariantEffect
from genophenocorr.model.genome import Contig, GenomicRegion, Strand


Expand Down
3 changes: 1 addition & 2 deletions tests/test_predicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
import pytest

from genophenocorr.analysis.predicate import *
from genophenocorr.constants import VariantEffect
from genophenocorr.model import Cohort, Patient, FeatureType
from genophenocorr.model import Cohort, Patient, FeatureType, VariantEffect
from .fixtures import toy_hpo, toy_cohort


Expand Down