Skip to content

Commit

Permalink
Merge pull request #67 from monarch-initiative/add-transcript-exon-model
Browse files Browse the repository at this point in the history
Prepare logic for fetching transcript regions for visualization
  • Loading branch information
lnrekerle committed Sep 29, 2023
2 parents ebf0619 + 7a9f698 commit 6d3d146
Show file tree
Hide file tree
Showing 27 changed files with 1,818 additions and 128 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/python_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@ on:
jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8', '3.9', '3.10', '3.11']

steps:
- uses: actions/checkout@v4
- name: Initialize Python 3.11
- name: Initialize Python
uses: actions/setup-python@v4
with:
python-version: "3.11"
python-version: ${{ matrix.python-version }}
- name: Install package
run: |
python3 -m pip install .[test,docs]
Expand Down
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include src/genophenocorr/model/genome/GCF_*.tsv
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ authors = [
]
description = "Search for genotype-phenotype correlations with GA4GH phenopackets"
readme = "README.md"
requires-python = ">=3.5"
requires-python = ">=3.8"
keywords = [
"Global Alliance for Genomics and Health",
"GA4GH Phenopacket Schema",
Expand All @@ -24,7 +24,10 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Bio-Informatics"
]
Expand All @@ -34,7 +37,7 @@ dependencies = [
"pandas>=2.0.0",
"phenopackets>=2.0.2",
"requests>=2.25.0",
"scipy>=1.11",
"scipy>=1.10",
"statsmodels>=0.13.0",
"numpy>=1.23"
]
Expand Down
35 changes: 21 additions & 14 deletions src/genophenocorr/data/_toy.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from hpotk import TermId

from genophenocorr.model import *
from genophenocorr.model.genome import Contig, GenomicRegion, Region, Strand

CONTIG = Contig('1', 'GB_ACC', 'REFSEQ_NAME', 'UCSC_NAME', 1_000)


def make_region(start: int, end: int) -> GenomicRegion:
    """
    Create a region of the toy `CONTIG` located on the positive strand.

    Args:
        start (integer): The start coordinate, passed to `GenomicRegion` unchanged
        end (integer): The end coordinate, passed to `GenomicRegion` unchanged
    Returns:
        GenomicRegion: The region spanning `start` to `end` on `CONTIG`
    """
    strand = Strand.POSITIVE
    return GenomicRegion(CONTIG, start, end, strand)


def get_toy_cohort() -> Cohort:
Expand All @@ -25,39 +32,39 @@ def get_toy_cohort() -> Cohort:
spasticity_F = Phenotype(TermId.from_curie('HP:0001257'), 'Spasticity', False)


prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', 1, 75), FeatureType.DOMAIN)
prot_feat_2 = ProteinFeature.create(FeatureInfo('region', 50, 100), FeatureType.REGION)
prot_feat_1 = ProteinFeature.create(FeatureInfo('domain', Region(1, 75)), FeatureType.DOMAIN)
prot_feat_2 = ProteinFeature.create(FeatureInfo('region', Region(50, 100)), FeatureType.REGION)
prot = ProteinMetadata('NP_09876.5', 'FakeProtein', [prot_feat_1, prot_feat_2])

het_snv = Variant.create_variant_from_scratch(
'HetVar1', 'SNV',
VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
[1], [prot], 60, 60)
het_del = Variant.create_variant_from_scratch(
'HetVar2', 'indel',
VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
[2], [prot], 86, 87)
het_dup = Variant.create_variant_from_scratch(
'HetVar3', 'insertion',
VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1, 'heterozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
[1], [prot], 25, 25)
hom_snv = Variant.create_variant_from_scratch(
'HomVar1', 'SNV',
VariantCoordinates('chr1', 280, 281, 'A', 'G', 0, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', ['missense_variant'],
VariantCoordinates(make_region(280, 281), 'A', 'G', 0, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.180A>G', False, ['missense_variant'],
[1], [prot], 60, 60)
hom_del = Variant.create_variant_from_scratch(
'HomVar2', 'indel',
VariantCoordinates('chr1', 360, 363, 'TTC', 'T', -2, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', ['frameshift_variant'],
VariantCoordinates(make_region(360, 363), 'TTC', 'T', -2, 'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.261_263del', False, ['frameshift_variant'],
[2], [prot], 86, 87)
hom_dup = Variant.create_variant_from_scratch(
'HomVar3', 'insertion',
VariantCoordinates('chr1', 175, 176, 'T', 'TG', 1,'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', ['frameshift_variant'],
VariantCoordinates(make_region(175, 176), 'T', 'TG', 1,'homozygous'),
'FakeGene', 'NM_1234.5', 'NM_1234.5:c.75A>G', False, ['frameshift_variant'],
[1], [prot], 25, 25)

patients = (
Expand Down
5 changes: 3 additions & 2 deletions src/genophenocorr/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
The `genophenocorr.model` package defines data model classes used in genophenocorr. We start with the top-level elements,
such as :class:`Cohort` and :class:`Patient`, and we follow with data classes for phenotype, genotype, and protein info.
"""
from . import genome

from ._cohort import Cohort, Patient
from ._phenotype import Phenotype
from ._protein import FeatureInfo, FeatureType, ProteinFeature, ProteinMetadata
from ._variant import VariantCoordinates, TranscriptAnnotation, Variant
from ._variant import VariantCoordinates, TranscriptAnnotation, TranscriptInfoAware, Variant

__all__ = [
'Cohort', 'Patient',
'Phenotype',
'Variant', 'TranscriptAnnotation', 'VariantCoordinates',
'Variant', 'VariantCoordinates', 'TranscriptAnnotation', 'TranscriptInfoAware',
'ProteinMetadata', 'ProteinFeature', 'FeatureInfo', 'FeatureType',
]
49 changes: 22 additions & 27 deletions src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,25 @@
import enum
import typing

import hpotk

from .genome import Region


class FeatureInfo:
"""A class that represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
Attributes:
name (string): The given name or description of the protein feature
start (integer): The starting position of the feature on the protein sequence
end (integer): The ending position of the feature on the protein sequence
region (Region): The protein feature region coordinates
"""

def __init__(self, name: str, start: int, end: int):
"""Constructs all necessary attributes for a FeatureInfo object
Args:
name (string): The given name or description of the protein feature
start (integer): The starting position of the feature on the protein sequence
end (integer): The ending position of the feature on the protein sequence
"""
def __init__(self, name: str, region: Region):
if not isinstance(name, str):
raise ValueError(f"name must be type string but was type {type(name)}")
self._name = name
if not isinstance(start, int):
raise ValueError(f"start must be an integer but was type {type(start)}")
self._start = start
if not isinstance(end, int):
raise ValueError(f"end must be an integer but was type {type(end)}")
self._end = end

if self._start > self._end:
raise ValueError(f"The start value must come before end but {self._start} is greater than {self._end}")
self._name = hpotk.util.validate_instance(name, str, 'name')
self._region = hpotk.util.validate_instance(region, Region, 'region')

@property
def name(self) -> str:
Expand All @@ -42,33 +30,40 @@ def name(self) -> str:
"""
return self._name

@property
def region(self) -> Region:
"""
Returns:
Region: a protein region spanned by the feature.
"""
return self._region

@property
def start(self) -> int:
"""
Returns:
integer: A 0-based (excluded) start coordinate of the protein feature.
"""
return self._start
return self._region.start

@property
def end(self) -> int:
"""
Returns:
integer: A 0-based (included) end coordinate of the protein feature.
"""
return self._end
return self._region.end

def __len__(self):
return self._end - self._start
return len(self._region)

def __eq__(self, other) -> bool:
return isinstance(other, FeatureInfo) \
and self.name == other.name \
and self.start == other.start \
and self.end == other.end
and self.region == other.region

def __hash__(self):
return hash((self._name, self._start, self._end))
return hash((self._name, self._region))

def __str__(self) -> str:
return f"FeatureInfo(name={self.name}, start={self.start}, end={self.end})"
Expand Down Expand Up @@ -264,4 +259,4 @@ def __hash__(self) -> int:
return hash((self.protein_id, self.label, self._features))

def __repr__(self) -> str:
return str(self)
return str(self)
Loading

0 comments on commit 6d3d146

Please sign in to comment.