monarch-initiative · lnrekerle · Oct 19, 2023 · Oct 10, 2023 · Oct 10, 2023 · Oct 10, 2023
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -51,15 +51,15 @@ We can then view the data using the list commands.
   [('NP_09876.5', 26)]
   >>> tx_dict = cohort.list_data_by_tx('NM_1234.5')
   >>> sorted(tx_dict['NM_1234.5'].items())
-  [('frameshift_variant', 1), ('missense_variant', 1)]
+  [('FRAMESHIFT_VARIANT', 1), ('MISSENSE_VARIANT', 1)]
 
 Using the counts, we can choose and run what analyses we want.
 For instance, we can partition the patients into two groups based on presence/absence of a *frameshift* variant:
 
 .. doctest:: tutorial
 
   >>> from genophenocorr.analysis import CohortAnalysis
-  >>> from genophenocorr.constants import VariantEffect
+  >>> from genophenocorr.model import VariantEffect
   >>> cohort_analysis = CohortAnalysis(cohort, 'NM_1234.5', hpo, include_unmeasured=False)
   >>> frameshift = cohort_analysis.compare_by_variant_type(VariantEffect.FRAMESHIFT_VARIANT)
   >>> frameshift # doctest: +NORMALIZE_WHITESPACE

diff --git a/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb b/notebooks/KBG/KBG_Martinez_PMID_36446582_RunGenoPhenoCorr.ipynb
diff --git a/src/genophenocorr/__init__.py b/src/genophenocorr/__init__.py
@@ -8,6 +8,4 @@
 from . import preprocessing
 from . import view
 
-from .constants import VariantEffect
-
 __version__ = "0.1.1dev"
diff --git a/src/genophenocorr/analysis/_analyzers.py b/src/genophenocorr/analysis/_analyzers.py
@@ -11,8 +11,7 @@
 from pandas import DataFrame, MultiIndex
 from collections import Counter, namedtuple
 
-from genophenocorr.constants import VariantEffect
-from genophenocorr.model import Cohort, FeatureType
+from genophenocorr.model import Cohort, FeatureType, VariantEffect
 
 from .predicate import VariantEffectPredicate, HPOPresentPredicate, VariantPredicate, ExonPredicate, ProtFeatureTypePredicate, ProtFeaturePredicate
 from .predicate import HOMOZYGOUS, HETEROZYGOUS, NO_VARIANT

diff --git a/src/genophenocorr/analysis/predicate/_all_predicates.py b/src/genophenocorr/analysis/predicate/_all_predicates.py
@@ -2,8 +2,7 @@
 
 import hpotk
 
-from genophenocorr.constants import VariantEffect
-from genophenocorr.model import Patient, FeatureType, Genotype
+from genophenocorr.model import Patient, FeatureType, Genotype, VariantEffect
 from ._api import PolyPredicate, PatientCategory
 
 
@@ -105,7 +104,7 @@ def test(self, patient: Patient, query: VariantEffect) -> typing.Optional[Patien
             for trans in var.tx_annotations:
                 if trans.transcript_id == self._transcript:
                     for var_eff in trans.variant_effects:
-                        if var_eff == str(query):
+                        if var_eff == query:
                             vars.add(var)
         if len(vars) == 1:
             for v in vars:

diff --git a/src/genophenocorr/constants/__init__.py b/src/genophenocorr/constants/__init__.py
diff --git a/src/genophenocorr/data/_toy.py b/src/genophenocorr/data/_toy.py
@@ -37,7 +37,7 @@ def get_toy_cohort() -> Cohort:
     prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])
 
     snv = Variant.create_variant_from_scratch(VariantCoordinates(make_region(280, 281), 'A', 'G', 0), 'FakeGene',
-                                                  'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'], [1],
+                                                  'NM_1234.5', 'NM_1234.5:c.180A>G', False, [VariantEffect.MISSENSE_VARIANT], [1],
                                                   [prot], 60, 60,
                                                   Genotypes.from_mapping({
                                                       'A': Genotype.HETEROZYGOUS, 'B': Genotype.HETEROZYGOUS,
@@ -54,7 +54,7 @@ def get_toy_cohort() -> Cohort:
                                               )
     deletion = Variant.create_variant_from_scratch(VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2),
                                                   'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del',
-                                                  False, ['frameshift_variant'],
+                                                  False, [VariantEffect.FRAMESHIFT_VARIANT],
                                                   [2], [prot], 86, 87,
                                                   Genotypes.from_mapping({
                                                       'D': Genotype.HETEROZYGOUS, 'F': Genotype.HETEROZYGOUS,
@@ -69,11 +69,11 @@ def get_toy_cohort() -> Cohort:
                                                    )
     het_dup = Variant.create_variant_from_scratch(
         VariantCoordinates(make_region(175, 176), 'T', 'TG', 1), 'FakeGene', 'NM_1234.5',
-        'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
+        'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
         Genotypes.empty())  # Not used in the patients below, hence `empty()`.
     hom_dup = Variant.create_variant_from_scratch(
         VariantCoordinates(make_region(175, 176), 'T', 'TG', 1),'FakeGene', 'NM_1234.5',
-        'NM_1234.5:c.75A>G', False, ['frameshift_variant'], [1], [prot], 25, 25,
+        'NM_1234.5:c.75A>G', False, [VariantEffect.FRAMESHIFT_VARIANT], [1], [prot], 25, 25,
         Genotypes.empty())  # Not used in the patients below, hence `empty()`.
 
     patients = (

diff --git a/src/genophenocorr/model/__init__.py b/src/genophenocorr/model/__init__.py
@@ -11,11 +11,12 @@
 from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
 from ._tx import TranscriptCoordinates
 from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant
+from ._variant_effects import VariantEffect
 
 __all__ = [
     'Cohort', 'Patient',
     'Phenotype',
     'Variant', 'VariantCoordinates', 'Genotype', 'Genotypes', 'Genotyped',
-    'TranscriptAnnotation', 'TranscriptInfoAware', 'TranscriptCoordinates',
+    'TranscriptAnnotation', 'VariantEffect', 'TranscriptInfoAware', 'TranscriptCoordinates',
     'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
 ]
diff --git a/src/genophenocorr/model/_cohort.py b/src/genophenocorr/model/_cohort.py
@@ -252,7 +252,7 @@ def list_data_by_tx(self, transcript=None):
         for var in self.all_variants:
             for trans in var.tx_annotations:
                 if trans.transcript_id in var_type_dict:
-                    var_type_dict.get(trans.transcript_id).update(trans.variant_effects)
+                    var_type_dict.get(trans.transcript_id).update([var_eff.name for var_eff in trans.variant_effects])
         too_small = []
         for tx_id, var_effect_counter in var_type_dict.items():
             if len(var_effect_counter) <= 1:

diff --git a/src/genophenocorr/model/_variant.py b/src/genophenocorr/model/_variant.py
@@ -7,6 +7,7 @@
 from .genome import GenomicRegion
 from ._gt import Genotyped, Genotypes
 from ._protein import ProteinMetadata
+from ._variant_effects import VariantEffect
 
 
 class TranscriptInfoAware(metaclass=abc.ABCMeta):
@@ -52,7 +53,7 @@ def __init__(self, gene_id: str,
                  tx_id: str,
                  hgvsc: typing.Optional[str],
                  is_preferred: bool,
-                 variant_effects,
+                 variant_effects: typing.Iterable[VariantEffect],
                  affected_exons: typing.Optional[typing.Sequence[int]],
                  affected_protein: typing.Sequence[ProteinMetadata],
                  protein_effect_start: typing.Optional[int],
@@ -63,7 +64,7 @@ def __init__(self, gene_id: str,
             gene_id (string): The gene symbol associated with the transcript
             tx_id (string): The transcript ID
             hgvsc (string, Optional): The HGVS "coding-DNA" ID if available, else None
-            variant_effects (Sequence[string]): A sequence of predicted effects given by VEP
+            variant_effects (Iterable[string]): An iterable of predicted effects given by functional annotator
             affected_exons (Sequence[integer], Optional): A sequence of exons affected by the variant. Returns None if none are affected.
             affected_protein (Sequence[ProteinMetadata]): A ProteinMetadata object representing the protein affected by this transcript
             protein_effect_start (integer, Optional): The start coordinate of the effect on the protein sequence.
@@ -115,7 +116,7 @@ def hgvsc_id(self) -> typing.Optional[str]:
         return self._hgvsc_id
 
     @property
-    def variant_effects(self) -> typing.Sequence[str]:
+    def variant_effects(self) -> typing.Sequence[VariantEffect]:
         """
         Returns:
             Sequence[string]: A sequence of variant effects.
@@ -377,7 +378,7 @@ def create_variant_from_scratch(variant_coordinates: VariantCoordinates,
                                     trans_id: str,
                                     hgvsc_id: str,
                                     is_preferred: bool,
-                                    consequences: typing.Sequence[str],
+                                    consequences: typing.Iterable[VariantEffect],
                                     exons_effected: typing.Sequence[int],
                                     protein: typing.Sequence[ProteinMetadata],
                                     protein_effect_start: int,

diff --git a/...enophenocorr/constants/variant_effects.py → src/genophenocorr/model/_variant_effects.py b/...enophenocorr/constants/variant_effects.py → src/genophenocorr/model/_variant_effects.py
@@ -1,6 +1,26 @@
 from enum import Enum
 
+
 class VariantEffect(Enum):
+    """
+    `VariantEffect` represents consequences of a variant on transcript that are supported by Genophenocorr.
+
+    .. doctest::
+
+      >>> from genophenocorr.model import VariantEffect
+      >>> missense = VariantEffect.MISSENSE_VARIANT
+      >>> print(missense)
+      missense_variant
+
+    The `VariantEffect` has a :prop:`curie` attribute that represents the ontology class from
+    `Sequence Ontology <http://www.sequenceontology.org/>`_.
+
+    .. doctest::
+
+      >>> missense.curie
+      'SO:0001583'
+    """
+
     TRANSCRIPT_ABLATION = "SO:0001893"
     SPLICE_ACCEPTOR_VARIANT = "SO:0001574"
     SPLICE_DONOR_VARIANT =  "SO:0001575"
@@ -42,5 +62,15 @@ class VariantEffect(Enum):
     INTERGENIC_VARIANT = "SO:0001628"
     SEQUENCE_VARIANT = "SO:0001060"
 
+    def __init__(self, curie: str):
+        self._curie = curie
+
+    @property
+    def curie(self) -> str:
+        """
+        Get a compact URI (CURIE) of the variant effect (e.g. `SO:0001583` for a missense variant).
+        """
+        return self._curie
+
     def __str__(self) -> str:
-        return self.name.lower()
+        return self.name.lower()
diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py
@@ -53,15 +53,8 @@ def find_coordinates(self, item: GenomicInterpretation) -> typing.Tuple[VariantC
                 alt = '<DUP>'
             else:
                 alt = '<DEL>'
-            chrom = re.findall(r'NC_0000(\d{2})\.\d*',
-                               variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id)[0]
-            if chrom.startswith('0'):
-                chrom = str(int(chrom))
-            elif chrom == '23':
-                chrom = 'X'
-            elif chrom == '24':
-                chrom = 'Y'
-            contig = self._build.contig_by_name(chrom)
+            refseq_contig_name = variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id.split(':')[1]
+            contig = self._build.contig_by_name(refseq_contig_name)
         elif len(variant_descriptor.vcf_record.chrom) != 0 and len(
                 variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id) == 0:
             ref = variant_descriptor.vcf_record.ref

diff --git a/src/genophenocorr/preprocessing/_test_vep.py b/src/genophenocorr/preprocessing/_test_vep.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from genophenocorr.model import VariantCoordinates
+from genophenocorr.model import VariantCoordinates, VariantEffect
 from genophenocorr.model.genome import GenomicRegion, Strand, GRCh38
 
 from ._vep import verify_start_end_coordinates, VepFunctionalAnnotator
@@ -79,7 +79,7 @@ def test__process_item_missense(self, variant_annotator: VepFunctionalAnnotator)
         assert preferred.transcript_id == 'NM_013275.6'
         assert preferred.is_preferred is True
         assert preferred.hgvsc_id == 'NM_013275.6:c.7407C>G'
-        assert preferred.variant_effects == ('stop_gained',)
+        assert preferred.variant_effects == (VariantEffect.STOP_GAINED,)
         assert preferred.overlapping_exons == (9,)
 
     def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator):
@@ -99,7 +99,7 @@ def test__process_item_deletion(self, variant_annotator: VepFunctionalAnnotator)
         assert preferred.transcript_id == 'NM_013275.6'
         assert preferred.is_preferred is True
         assert preferred.hgvsc_id == 'NM_013275.6:c.2408_2412del'
-        assert preferred.variant_effects == ('frameshift_variant',)
+        assert preferred.variant_effects == (VariantEffect.FRAMESHIFT_VARIANT,)
         assert preferred.overlapping_exons == (9,)
 
     def _load_response_json(self, test_name: str):

diff --git a/src/genophenocorr/preprocessing/_vep.py b/src/genophenocorr/preprocessing/_vep.py
@@ -3,7 +3,7 @@
 
 import requests
 
-from genophenocorr.model import VariantCoordinates, TranscriptAnnotation
+from genophenocorr.model import VariantCoordinates, TranscriptAnnotation, VariantEffect
 from ._api import FunctionalAnnotator, ProteinMetadataService
 
 
@@ -80,6 +80,18 @@ def annotate(self, variant_coordinates: VariantCoordinates) -> typing.Sequence[T
 
         return annotations
 
+    def _parse_variant_effect(self, effect: str) -> typing.Optional[VariantEffect]:
+        if effect.upper() == "5_PRIME_UTR_VARIANT":
+            effect = "FIVE_PRIME_UTR_VARIANT"
+        elif effect.upper() == "3_PRIME_UTR_VARIANT":
+            effect = 'THREE_PRIME_UTR_VARIANT'
+        try:
+            var_effect = VariantEffect[effect.upper()]
+        except KeyError:
+            self._logging.warning(f"VariantEffect {effect} was not found in our record of possible effects. Please report this issue to the genophenocorr GitHub.")
+            return None
+        return var_effect
+
     def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
         """
         Parse one transcript annotation from the JSON response.
@@ -90,7 +102,12 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
             return None
         is_preferred = True if 'canonical' in item and item['canonical'] else False
         hgvsc_id = item.get('hgvsc')
+        var_effects = []
         consequences = item.get('consequence_terms')
+        for con in consequences:
+            var_effect = self._parse_variant_effect(con)
+            if var_effect is not None:
+                var_effects.append(var_effect)
         gene_name = item.get('gene_symbol')
         protein_id = item.get('protein_id')
         protein = self._protein_annotator.annotate(protein_id)
@@ -112,7 +129,7 @@ def _process_item(self, item) -> typing.Optional[TranscriptAnnotation]:
                                     trans_id,
                                     hgvsc_id,
                                     is_preferred,
-                                    consequences,
+                                    var_effects,
                                     exons_effected,
                                     protein,
                                     protein_effect_start,

diff --git a/tests/fixtures.py b/tests/fixtures.py
@@ -36,44 +36,44 @@ def toy_cohort() -> Cohort:
 
     dup = Variant(VariantCoordinates(make_region("16", 89279849, 89279850), ref='G', alt='GC', change_length=1),
                   [
-                      TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, ['frameshift_variant'], [9],
+                      TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6691dup', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
                                            [prot], 2231, 2231)
                   ],
                   Genotypes.from_mapping({'HetSingleVar': Genotype.HETEROZYGOUS}))
     indel = Variant(VariantCoordinates(make_region("16", 89284600, 89284602), ref='GG', alt='A', change_length=-1),
                     [
-                        TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, ['frameshift_variant'],
+                        TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.1940_1941delinsT', False, [VariantEffect.FRAMESHIFT_VARIANT],
                                              [9], [prot], 647, 647)
                     ],
                     Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
     snv_stop_gain = Variant(VariantCoordinates(make_region("16", 89280751, 89280752), ref='G', alt='T', change_length=0),
                             [
-                                TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, ['stop_gained'], [9], [prot],
+                                TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.5790C>A', False, [VariantEffect.STOP_GAINED], [9], [prot],
                              1930, 1930)],
                             Genotypes.from_mapping({'HetDoubleVar1': Genotype.HETEROZYGOUS}))
     snv_missense = Variant(VariantCoordinates(make_region("16", 89275127, 89275128), ref='G', alt='A', change_length=0),
                            [
-                               TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, ['missense_variant'], [10],
+                               TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7534C>T', False, [VariantEffect.MISSENSE_VARIANT], [10],
                              [prot], 2512, 2512)
                            ],
                            Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
     del_frameshift = Variant(VariantCoordinates(make_region("16", 89279707, 89279725), ref='AGTGTTCGGGGCGGGGCC', alt='A', change_length=-17),
                              [
-                                 TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, ['frameshift_variant'],
+                                 TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.6817_6833del', False, [VariantEffect.FRAMESHIFT_VARIANT],
                               [9], [prot], 2273, 2278)
                              ],
                              Genotypes.from_mapping({'HetDoubleVar2': Genotype.HETEROZYGOUS}))
     del_small = Variant(VariantCoordinates(make_region("16", 89279457, 89279459), ref='TG', alt='T', change_length=-1),
                         [
-                            TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, ['frameshift_variant'], [9],
+                            TranscriptAnnotation('ANKRD11', 'NM_013275.6', 'NM_013275.6:c.7083del', False, [VariantEffect.FRAMESHIFT_VARIANT], [9],
                              [prot], 2361, 2362)
                         ],
                         Genotypes.from_mapping({'HomoVar': Genotype.HOMOZYGOUS_ALTERNATE}))
     del_large = Variant(VariantCoordinates(make_region("16", 89_190_070, 89_439_815), ref='N', alt='<DEL>', change_length=-249_745),
                         [
                             TranscriptAnnotation('ANKRD11', 'NM_013275.6', None, False,
-                                 ['stop_lost', 'feature_truncation', 'coding_sequence_variant', '5_prime_UTR_variant',
-                                  '3_prime_UTR_variant', 'intron_variant'], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
+                                 [VariantEffect.STOP_LOST, VariantEffect.FEATURE_TRUNCATION, VariantEffect.CODING_SEQUENCE_VARIANT, VariantEffect.FIVE_PRIME_UTR_VARIANT,
+                                  VariantEffect.THREE_PRIME_UTR_VARIANT, VariantEffect.INTRON_VARIANT], [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
                                  [prot], None, None)
                         ],
                         Genotypes.from_mapping({'LargeCNV': Genotype.HETEROZYGOUS}))

diff --git a/tests/genome_data.py b/tests/genome_data.py
@@ -2,8 +2,7 @@
 
 import pytest
 
-from genophenocorr import VariantEffect
-from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes
+from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Variant, VariantCoordinates, TranscriptAnnotation, Genotypes, VariantEffect
 from genophenocorr.model.genome import Contig, GenomicRegion, Strand
 
 

diff --git a/tests/test_predicates.py b/tests/test_predicates.py
@@ -4,8 +4,7 @@
 import pytest
 
 from genophenocorr.analysis.predicate import *
-from genophenocorr.constants import VariantEffect
-from genophenocorr.model import Cohort, Patient, FeatureType
+from genophenocorr.model import Cohort, Patient, FeatureType, VariantEffect
 from .fixtures import toy_hpo, toy_cohort