Merge pull request #67 from monarch-initiative/add-transcript-exon-model

Prepare logic for fetching transcript regions for visualization
monarch-initiative · Sep 29, 2023 · 6d3d146 · 6d3d146
2 parents ebf0619 + 7a9f698
commit 6d3d146
Show file tree

Hide file tree

Showing 27 changed files with 1,818 additions and 128 deletions.
diff --git a/.github/workflows/python_ci.yml b/.github/workflows/python_ci.yml
@@ -9,13 +9,16 @@ on:
 jobs:
     build:
       runs-on: ubuntu-latest
+      strategy:
+        matrix:
+          python-version: ['3.8', '3.9', '3.10', '3.11']
 
       steps:
         - uses: actions/checkout@v4
-        - name: Initialize Python 3.11
+        - name: Initialize Python
           uses: actions/setup-python@v4.3.1
           with:
-            python-version: "3.11"
+            python-version: ${{ matrix.python-version }}
         - name: Install package
           run: |
             python3 -m pip install .[test,docs]

diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include src/genophenocorr/model/genome/GCF_*.tsv
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ authors = [
      ]
 description = "Search for genotype-phenotype correlations with GA4GH phenopackets"
 readme = "README.md"
-requires-python = ">=3.5"
+requires-python = ">=3.8"
 keywords = [
     "Global Alliance for Genomics and Health",
     "GA4GH Phenopacket Schema",
@@ -24,7 +24,10 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
     "Development Status :: 3 - Alpha",
-    "Programming Language :: Python :: 3.5",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
     "Intended Audience :: Science/Research",
     "Topic :: Scientific/Engineering :: Bio-Informatics"
 ]
@@ -34,7 +37,7 @@ dependencies = [
     "pandas>=2.0.0",
     "phenopackets>=2.0.2",
     "requests>=2.25.0",
-    "scipy>=1.11",
+    "scipy>=1.10",
     "statsmodels>=0.13.0",
     "numpy>=1.23"
 ]

diff --git a/src/genophenocorr/data/_toy.py b/src/genophenocorr/data/_toy.py
@@ -1,6 +1,13 @@
 from hpotk import TermId
 
 from genophenocorr.model import *
+from genophenocorr.model.genome import Contig, GenomicRegion, Region, Strand
+
+CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000)
+
+
+def make_region(start: int, end: int) -> GenomicRegion:
+    return GenomicRegion(CONTIG, start, end, Strand.POSITIVE)
 
 
 def get_toy_cohort() -> Cohort:
@@ -25,39 +32,39 @@ def get_toy_cohort() -> Cohort:
     spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), 'Spasticity', False)
 
 
-    prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', 1, 75), FeatureType.DOMAIN)
-    prot_feat_2 = ProteinFeature.create(FeatureInfo('region', 50, 100), FeatureType.REGION)
+    prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', Region(1, 75)), FeatureType.DOMAIN)
+    prot_feat_2 = ProteinFeature.create(FeatureInfo('region', Region(50, 100)), FeatureType.REGION)
     prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])
 
     het_snv = Variant.create_variant_from_scratch(
         'HetVar1', 'SNV',
-        VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'heterozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
+        VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'heterozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
         [1], [prot], 60, 60)
     het_del = Variant.create_variant_from_scratch(
         'HetVar2', 'indel',
-        VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2,  'heterozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
+        VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2,  'heterozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
         [2], [prot], 86, 87)
     het_dup = Variant.create_variant_from_scratch(
         'HetVar3', 'insertion',
-        VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1, 'heterozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
+        VariantCoordinates(make_region(175, 176), 'T', 'TG', 1, 'heterozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
         [1], [prot], 25, 25)
     hom_snv = Variant.create_variant_from_scratch(
         'HomVar1', 'SNV',
-        VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'homozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
+        VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'homozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
         [1], [prot], 60, 60)
     hom_del = Variant.create_variant_from_scratch(
         'HomVar2', 'indel',
-        VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2, 'homozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
+        VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'homozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
         [2], [prot], 86, 87)
     hom_dup = Variant.create_variant_from_scratch(
         'HomVar3', 'insertion',
-        VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1,'homozygous'),
-        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
+        VariantCoordinates(make_region(175, 176), 'T', 'TG', 1,'homozygous'),
+        'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
         [1], [prot], 25, 25)
 
     patients = (

diff --git a/src/genophenocorr/model/__init__.py b/src/genophenocorr/model/__init__.py
@@ -2,15 +2,16 @@
 The `genophenocorr.model` package defines data model classes used in genophenocorr. We start with the top-level elements,
 such as :class:`Cohort` and :class:`Patient`, and we follow with data classes for phenotype, genotype, and protein info.
 """
+from . import genome
 
 from ._cohort import Cohort, Patient
 from ._phenotype import Phenotype
 from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
-from ._variant import VariantCoordinates, TranscriptAnnotation, Variant
+from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant
 
 __all__ = [
     'Cohort', 'Patient',
     'Phenotype',
-    'Variant', 'TranscriptAnnotation', 'VariantCoordinates',
+    'Variant', 'VariantCoordinates', 'TranscriptAnnotation', 'TranscriptInfoAware',
     'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
 ]
diff --git a/src/genophenocorr/model/_protein.py b/src/genophenocorr/model/_protein.py
@@ -2,37 +2,25 @@
 import enum
 import typing
 
+import hpotk
+
+from .genome import Region
+
 
 class FeatureInfo:
     """A class that represents a protein feature
     (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
 
     Attributes:
         name (string): The given name or description of the protein feature
-        start (integer): The starting position of the feature on the protein sequence
-        end (integer): The ending position of the feature on the protein sequence
+        region (Region): The protein feature region coordinates
     """
 
-    def __init__(self, name: str, start: int, end: int):
-        """Constructs all necessary attributes for a FeatureInfo object
-
-        Args:
-            name (string): The given name or description of the protein feature
-            start (integer): The starting position of the feature on the protein sequence
-            end (integer): The ending position of the feature on the protein sequence
-        """
+    def __init__(self, name: str, region: Region):
         if not isinstance(name, str):
             raise ValueError(f"name must be type string but was type {type(name)}")
-        self._name = name
-        if not isinstance(start, int):
-            raise ValueError(f"start must be an integer but was type {type(start)}")
-        self._start = start
-        if not isinstance(end, int):
-            raise ValueError(f"end must be an integer but was type {type(end)}")
-        self._end = end
-
-        if self._start > self._end:
-            raise ValueError(f"The start value must come before end but {self._start} is greater than {self._end}")
+        self._name = hpotk.util.validate_instance(name, str, 'name')
+        self._region = hpotk.util.validate_instance(region, Region, 'region')
 
     @property
     def name(self) -> str:
@@ -42,33 +30,40 @@ def name(self) -> str:
         """
         return self._name
 
+    @property
+    def region(self) -> Region:
+        """
+        Returns:
+            Region: a protein region spanned by the feature.
+        """
+        return self._region
+
     @property
     def start(self) -> int:
         """
         Returns:
             integer: A 0-based (excluded) start coordinate of the protein feature.
         """
-        return self._start
+        return self._region.start
 
     @property
     def end(self) -> int:
         """
         Returns:
             integer: A 0-based (included) end coordinate of the protein feature.
         """
-        return self._end
+        return self._region.end
 
     def __len__(self):
-        return self._end - self._start
+        return len(self._region)
 
     def __eq__(self, other) -> bool:
         return isinstance(other, FeatureInfo) \
             and self.name == other.name \
-            and self.start == other.start \
-            and self.end == other.end
+            and self.region == other.region
 
     def __hash__(self):
-        return hash((self._name, self._start, self._end))
+        return hash((self._name, self._region))
 
     def __str__(self) -> str:
         return f"FeatureInfo(name={self.name}, start={self.start}, end={self.end})"
@@ -264,4 +259,4 @@ def __hash__(self) -> int:
         return hash((self.protein_id, self.label, self._features))
 
     def __repr__(self) -> str:
-        return str(self)
+        return str(self)