diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index b01c95d1..7f47e228 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -192,76 +192,96 @@ We use static constructor :func:`~gpsea.analysis.mtc_filter.HpoMtcFilter.default for creating :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. The constructor takes a threshold as an argument (e.g. 20% in the example above) and the method's logic is made up of 8 individual heuristics -designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: - -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| Code | Name | Description | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF01` | Skip terms that | The ``term_frequency_threshold`` determines the mininum proportion of individuals | -| | occur very rarely | with direct or indirect annotation by the HPO term to test. | -| | | We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), | -| | | and we only retain a term for testing if the proportion of individuals | -| | | in at least one genotype group is greater than | -| | | or equal to ``term_frequency_threshold``. | -| | | This is because of our assumption that even if there is statistical significance, | -| | | if a term is only seen in (for example) 7% of individuals | -| | | in the MISSENSE group and 2% in the not-MISSENSE group, | -| | | the term is unlikely to be of great interest because it is rare. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF02` | Skip terms if | In a related heuristic, we skip terms if no genotype group has more | -| | no cell has more | than one count. This is not completely redundant with the previous condition, | -| | than one count | because some terms may have a small number of total observations. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF03` | Skip terms if | Let's say a term such as | -| | all counts are | `Posterior polar cataract (HP:0001115) `_ | -| | identical | was observed in 7 of 11 individuals with MISSENSE variants | -| | to counts | and in 3 of 8 individuals with NONSENSE variants. | -| | for a child | If we find the same patient counts (7 of 11 and 3 of 8) in the parent term | -| | term | `Polar cataract HP:0010696 `_, | -| | | then we choose to not test the parent term. | -| | | | -| | | This is because the more specific an HPO term is, | -| | | the more information it has (the more interesting the correlation would be if it exists), | -| | | and the result of a test, such as the Fisher Exact test, would be exactly the same | -| | | for *Polar cataract* as for *Posterior polar cataract*. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF04` | Skip terms if | If both (or all) of the genotype groups have the same proportion of individuals | -| | genotypes have | observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, | -| | same HPO | because it is not possible that the Fisher exact test will return a significant result. | -| | proportions | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF05` | Skip terms if | If one of the genotype groups has neither observed nor excluded observations | -| | there are no | for an HPO term, skip it. | -| | HPO observations | | -| | in a group | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF06` | Skip term if | If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) | -| | underpowered | and the total count of patients in all genotype-phenotype groups is less than 7, | -| | for 2x2 or 2x3 | or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients | -| | analysis | is less than 6, then there is a lack even of the nominal statistical power | -| | | and the counts can never be significant. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF07` | Skipping terms | The HPO has a number of other branches that describe modes of inheritance, | -| | that are not | past medical history, and clinical modifiers. | -| | descendents of | We do not think it makes much sense to test for enrichment of these terms, | -| | *Phenotypic* | so, all terms that are not descendants of | -| | *abnormality* | `Phenotypic abnormality `_ are filtered out. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF08` | Skipping | All the direct children of the root phenotype term | -| | "general" | `Phenotypic abnormality (HP:0000118) `_ | -| | level terms | are skipped, because of the assumption that if there is a valid signal, | -| | | it will derive from one of the more specific descendents. | -| | | | -| | | For instance, | -| | |`Abnormality of the nervous system `_ | -| | | (HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies | -| | | that if there is a signal from the nervous system, | -| | | it will lead to at least one of the descendents of | -| | | *Abnormality of the nervous system* being significant. | -| | | | -| | | See :ref:`general-hpo-terms` section for details. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ +designed to skip testing the HPO terms that are unlikely to yield significant or interesting results. +`HMF01` - Skip terms that occur very rarely +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``term_frequency_threshold`` determines the mininum proportion of individuals +with direct or indirect annotation by the HPO term to test. +We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), +and we only retain a term for testing if the proportion of individuals +in at least one genotype group is greater than +or equal to ``term_frequency_threshold``. +This is because of our assumption that even if there is statistical significance, +if a term is only seen in (for example) 7% of individuals +in the MISSENSE group and 2% in the not-MISSENSE group, +the term is unlikely to be of great interest because it is rare. + + +`HMF02` - Skip terms if no genotype group has more than one count +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a related heuristic, we skip terms if no genotype group has more +than one count. This is not completely redundant with the previous condition, +because some terms may have a small number of total observations. + + +`HMF03` - Skip terms if all counts are identical to counts for a child term +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Let's say a term such as +`Posterior polar cataract (HP:0001115) `_ +was observed in 7 of 11 individuals with MISSENSE variants +and in 3 of 8 individuals with NONSENSE variants. +If we find the same patient counts (7 of 11 and 3 of 8) in the parent term +`Polar cataract HP:0010696 `_, +then we choose to not test the parent term. + +This is because the more specific an HPO term is, +the more information it has (the more interesting the correlation would be if it exists), +and the result of a test, such as the Fisher Exact test, would be exactly the same +for *Polar cataract* as for *Posterior polar cataract*. + + +`HMF04` - Skip terms if genotypes have same HPO proportions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If both (or all) of the genotype groups have the same proportion of individuals +observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, +because it is not possible that the Fisher exact test will return a significant result. + + +`HMF05` - Skip term if one of the genotype groups has neither observed nor excluded observations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Skip terms if there are no HPO observations in a group. + + +`HMF06` - Skip term if underpowered for 2x2 or 2x3 analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) +and the total count of patients in all genotype-phenotype groups is less than 7, +or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients +is less than 6, then there is a lack even of the nominal statistical power +and the counts can never be significant. + + +`HMF07` - Skipping terms that are not descendents of *Phenotypic abnormality* +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The HPO has a number of other branches that describe modes of inheritance, +past medical history, and clinical modifiers. +We do not think it makes much sense to test for enrichment of these terms, +so, all terms that are not descendants of +`Phenotypic abnormality `_ are filtered out. + + +`HMF08` - Skipping "general" level terms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All the direct children of the root phenotype term +`Phenotypic abnormality (HP:0000118) `_ +are skipped, because of the assumption that if there is a valid signal, +it will derive from one of the more specific descendents. + +For instance, +`Abnormality of the nervous system `_ +(HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies +that if there is a signal from the nervous system, +it will lead to at least one of the descendents of +*Abnormality of the nervous system* being significant. + +See :ref:`general-hpo-terms` section for details. diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index 2efb4d6b..6c4178e8 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -4,94 +4,90 @@ Mode of Inheritance Predicates ============================== -There are five basic modes of inheritance for single-gene diseases: autosomal dominant, autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See -`Understanding Genetics, Appendix B `_). - - -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): - - -+-----------------------+-----------------------------------+------------------+------------------------+ -| Mode of inheritance | Sex | Allele count | Genotype category | -+=======================+===================================+==================+========================+ -| Autosomal dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| Autosomal recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 2 | `BIALLELIC_ALT` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 3` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked dominant | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | 1 | `HET` | -+ +-----------------------------------+------------------+------------------------+ -| | `*` | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ -| X-linked recessive | `*` | 0 | `HOM_REF` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.FEMALE` | 1 | `HET` | -+ + +------------------+------------------------+ -| | | 2 | `BIALLELIC_ALT` | -+ + +------------------+------------------------+ -| | | :math:`\ge 3` | ``None`` | -+ +-----------------------------------+------------------+------------------------+ -| | :class:`~gpsea.model.Sex.MALE` | 1 | `HEMI` | -+ + +------------------+------------------------+ -| | | :math:`\ge 2` | ``None`` | -+-----------------------+-----------------------------------+------------------+------------------------+ +There are five basic modes of inheritance for single-gene diseases: autosomal dominant, +autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial +(See `Understanding Genetics, Appendix B `_). + + +The :class:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +and :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +assigns the individual into a group based on the number of the alleles +observed in the individual. +GPSEA supports the following Mendelian modes of inheritance (MoI): + + ++-----------------------+------------------+------------------------+ +| Mode of inheritance | Allele count | Genotype category | ++=======================+==================+========================+ +| Autosomal dominant | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | :math:`\ge 2` | ``None`` | ++-----------------------+------------------+------------------------+ +| Autosomal recessive | 0 | `HOM_REF` | ++ +------------------+------------------------+ +| | 1 | `HET` | ++ +------------------+------------------------+ +| | 2 | `BIALLELIC_ALT` | ++ +------------------+------------------------+ +| | :math:`\ge 3` | ``None`` | ++-----------------------+------------------+------------------------+ + .. note:: `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. Clinical judgment should be used to choose the MoI for the cohort analysis. -Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: +Then a predicate for the desired MoI can be created by calling one +of the following methods: + +* :func:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_recessive` + +By default, the MoI predicates will use *all* variants recorded in the individual. +However, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be provided to select a variant subset, if necessary. -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.x_recessive` -All constructors take an instance -of :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` as an argument. +Assign individuals into genotype groups +--------------------------------------- +Here we show seting up a predicate for grouping individuals for differences +between genotypes of a disease with an autosomal recessive MoI. -Example -------- +We use :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +to create the predicate: -Here we show seting up a predicate for grouping individuals based on -having a variant that leads to a frameshift or to a stop gain to a fictional transcript ``NM_1234.5`` -to test differences between the genotypes of a disease with an autosomal recessive MoI. +>>> from gpsea.analysis.predicate.genotype import autosomal_recessive +>>> gt_predicate = autosomal_recessive() +>>> gt_predicate.display_question() +'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' -First, we set up a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` -for testing if a variant meets the condition: +The predicate will use *all* recorded variants to determine if the individual belongs into +homozygous reference (`HOM_REF`), heterozygous (`HET`), or biallelic alternative (`BIALLELIC_ALT`) +category. + + +Use a subset of variants for choosing the genotype group +-------------------------------------------------------- + +To select specific variants, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` +can be registered with the MoI predicate. +For instance, the following can be done to only consider the variants that lead +to a missense change on a fictional transcript ``NM_1234.5`` +when assigning the genotype group. We set up the variant predicate: >>> from gpsea.model import VariantEffect >>> from gpsea.analysis.predicate.genotype import VariantPredicates >>> tx_id = 'NM_1234.5' ->>> is_frameshift_or_stop_gain = VariantPredicates.variant_effect(VariantEffect.FRAMESHIFT_VARIANT, tx_id) \ -... | VariantPredicates.variant_effect(VariantEffect.STOP_GAINED, tx_id) ->>> is_frameshift_or_stop_gain.get_question() -'(FRAMESHIFT_VARIANT on NM_1234.5 OR STOP_GAINED on NM_1234.5)' +>>> is_missense = VariantPredicates.variant_effect(VariantEffect.MISSENSE_VARIANT, tx_id) +>>> is_missense.get_question() +'MISSENSE_VARIANT on NM_1234.5' -Next, we use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` -for assigning a patient into a genotype group: +and we use it to create the MoI predicate: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_frameshift_or_stop_gain) +>>> gt_predicate = autosomal_recessive(is_missense) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' - -The `gt_predicate` can be used in downstream analysis, such as in :class:`~gpsea.analysis.pcats.HpoTermAnalysis`. diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 9b2aa6cb..18882892 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -136,8 +136,8 @@ to test if the variant leads to a frameshift (in this case): and then we choose the expected mode of inheritance to test. In case of *TBX5*, we expect the autosomal dominant mode of inheritance: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) +>>> from gpsea.analysis.predicate.genotype import autosomal_dominant +>>> gt_predicate = autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET' diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index b9bd8e8a..9bf0b26a 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,13 +2,15 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import autosomal_dominant, autosomal_recessive from ._gt_predicates import monoallelic_predicate, biallelic_predicate -from ._gt_predicates import ModeOfInheritancePredicate +from ._gt_predicates import ModeOfInheritancePredicate # TODO: remove before 1.0.0 from ._variant import VariantPredicates, ProteinPredicates __all__ = [ 'GenotypePolyPredicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'autosomal_dominant', 'autosomal_recessive', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', diff --git a/src/gpsea/analysis/predicate/genotype/_counter.py b/src/gpsea/analysis/predicate/genotype/_counter.py index c74be909..8ad77d55 100644 --- a/src/gpsea/analysis/predicate/genotype/_counter.py +++ b/src/gpsea/analysis/predicate/genotype/_counter.py @@ -8,13 +8,14 @@ class AlleleCounter: :param predicate: a :class:`VariantPredicate` for selecting the target variants. """ - # TODO: this class should probably be an implementation detail, + # TODO: this class should probably be an implementation detail, # and not a public member of the package. def __init__( self, predicate: VariantPredicate, ): + assert isinstance(predicate, VariantPredicate) self._predicate = predicate def get_question(self) -> str: diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 9bc8c6bd..c5dbe07f 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,5 +1,6 @@ import dataclasses import typing +import warnings from collections import defaultdict @@ -11,8 +12,7 @@ from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter - -# TODO: implement __hash__, __eq__ on predicates +from ._variant import VariantPredicates class AlleleCountingGroupsPredicate(GenotypePolyPredicate): @@ -188,7 +188,21 @@ def __init__( self._categorizations = tuple(count2cat.values()) self._a_counter = a_counter self._b_counter = b_counter + self._hash = self._compute_hash() + def _compute_hash(self) -> int: + hash_value = 17 + + self._groups = defaultdict(list) + for count, cat in self._count2cat.items(): + hash_value += 13 * hash(count) + hash_value += 13 * hash(cat) + + hash_value += 23 * hash(self._a_counter) + hash_value += 23 * hash(self._b_counter) + + return hash_value + def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -204,7 +218,14 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return self._count2cat.get(counts, None) - # TODO: implement __hash__, __eq__ + def __eq__(self, value: object) -> bool: + return isinstance(value, PolyCountingGenotypePredicate) \ + and self._count2cat == value._count2cat \ + and self._a_counter == value._a_counter \ + and self._b_counter == value._b_counter + + def __hash__(self) -> int: + return self._hash def monoallelic_predicate( @@ -213,19 +234,32 @@ def monoallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ + The predicate bins patient into one of two groups, `A` and `B`, + based on presence of *exactly* one allele of a variant + that meets the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: - The predicate bins patient into one of two groups: `A` and `B`: - - +-----------+------------+------------+ - | Group | `A` count | `B` count | - +===========+============+============+ - | A | 1 | 0 | - +-----------+------------+------------+ - | B | 0 | 1 | - +-----------+------------+------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) - are assigned the ``None`` group and, thus, omitted from the analysis. + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | A | 1 | 0 | + +-----------+-------------------+-------------------+ + | B | 0 | 1 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and, thus, omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.monoallelic( a_predicate=a_predicate, @@ -240,24 +274,36 @@ def biallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - Get a predicate for binning the individuals into groups, - with respect to allele counts of variants selected by `a_predicate` and `b_predicate`. - - The predicate bins patient into one of three groups: `AA`, `AB` and `BB`: - - +-----------+------------------+------------------+ - | Group | `A` allele count | `B` allele count | - +===========+==================+==================+ - | AA | 2 | 0 | - +-----------+------------------+------------------+ - | AB | 1 | 1 | - +-----------+------------------+------------------+ - | AA | 0 | 2 | - +-----------+------------------+------------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) - are assigned the ``None`` group and, thus, omitted from the analysis. - + The predicate bins patient into one of the three groups, + `AA`, `AB`, and `BB`, + based on presence of one or two variant alleles + that meet the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: + + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | AA | 2 | 0 | + +-----------+-------------------+-------------------+ + | AB | 1 | 1 | + +-----------+-------------------+-------------------+ + | AA | 0 | 2 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 1` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and will be, thus, + omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, @@ -266,6 +312,47 @@ def biallelic_predicate( ) +def autosomal_dominant( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either + into homozygous reference or heterozygous + group in line with the autosomal dominant mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used. + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + ) + + +def autosomal_recessive( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, or biallelic alternative allele + (homozygous alternative or compound heterozygous) + group in line with the autosomal recessive mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + ) + + @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int @@ -397,8 +484,8 @@ class ModeOfInheritancePredicate(GenotypePolyPredicate): @staticmethod def autosomal_dominant( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": + variant_predicate: typing.Optional[VariantPredicate] = None, + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference or heterozygous @@ -406,15 +493,18 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_dominant` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_dominant(variant_predicate) + @staticmethod def autosomal_recessive( - variant_predicate: VariantPredicate, - ) -> "ModeOfInheritancePredicate": + variant_predicate: typing.Optional[VariantPredicate] = None, + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference, heterozygous, or biallelic alternative allele @@ -423,11 +513,14 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. """ - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_recessive` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_recessive(variant_predicate) + @staticmethod def _from_moi_info( variant_predicate: VariantPredicate, @@ -546,6 +639,12 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: else: return None + def __eq__(self, value: object) -> bool: + return isinstance(value, SexGenotypePredicate) + + def __hash__(self) -> int: + return 31 + INSTANCE = SexGenotypePredicate() @@ -609,6 +708,7 @@ def __init__( self._categorizations = tuple( sorted(categorizations.values(), key=lambda c: c.category.cat_id) ) + self._hash = hash(tuple(categorizations.items())) def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -633,6 +733,13 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None return categorization + + def __eq__(self, value: object) -> bool: + return isinstance(value, DiagnosisPredicate) \ + and self._id2cat == value._id2cat + + def __hash__(self) -> int: + return self._hash def diagnosis_predicate( diff --git a/src/gpsea/analysis/predicate/genotype/_predicates.py b/src/gpsea/analysis/predicate/genotype/_predicates.py index 03fae44c..b1ac93e6 100644 --- a/src/gpsea/analysis/predicate/genotype/_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_predicates.py @@ -10,10 +10,41 @@ from ._api import VariantPredicate +class AlwaysTrueVariantPredicate(VariantPredicate): + """ + `AlwaysTrueVariantPredicate` returns `True` for any variant. + """ + + @staticmethod + def get_instance() -> "AlwaysTrueVariantPredicate": + return ALWAYS_TRUE + + def get_question(self) -> str: + return "" + + def test(self, variant: Variant) -> bool: + return True + + def __eq__(self, value: object) -> bool: + return isinstance(value, AlwaysTrueVariantPredicate) + + def __hash__(self) -> int: + return 17 + + def __str__(self): + return repr(self) + + def __repr__(self): + return 'AlwaysTrueVariantPredicate()' + + +ALWAYS_TRUE = AlwaysTrueVariantPredicate() + + class VariantEffectPredicate(VariantPredicate): """ `VariantEffectPredicate` is a `VariantPredicate` that sets up testing - for a specific `VariantEffect` on a given transcript ID. + for a specific `VariantEffect` on a given transcript ID. Args: effect (VariantEffect): the variant effect to search for @@ -28,16 +59,6 @@ def get_question(self) -> str: return f'{self._effect.name} on {self._tx_id}' def test(self, variant: Variant) -> bool: - """Tests if the `Variant` causes the specified `VariantEffect` - on the given transcript. - - Args: - variant (Variant): a - - Returns: - bool: _description_ - """ - tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) if tx_anno is None: return False @@ -98,7 +119,7 @@ def __repr__(self): class VariantGenePredicate(VariantPredicate): """ - `VariantGenePredicate` tests if the variant + `VariantGenePredicate` tests if the variant is annotated to affect a gene denoted by a `symbol`. Args: @@ -134,7 +155,7 @@ def __repr__(self): class VariantTranscriptPredicate(VariantPredicate): """ - `VariantTranscriptPredicate` tests if the variant + `VariantTranscriptPredicate` tests if the variant is annotated to affect a transcript with `tx_id` accession. Args: @@ -170,24 +191,23 @@ def __repr__(self): class VariantExonPredicate(VariantPredicate): """ - `VariantExonPredicate` tests if the variant affects + `VariantExonPredicate` tests if the variant affects *n*-th exon of the transcript of interest. .. warning:: - We use 1-based numbering to number the exons, + We use 1-based numbering to number the exons, not the usual 0-based numbering of the computer science. - Therefore, the first exon of the transcript + Therefore, the first exon of the transcript has ``exon_number==1``, the second exon is ``2``, and so on ... .. warning:: - We do not check if the `exon_number` spans + We do not check if the `exon_number` spans beyond the number of exons of the given `transcript_id`! - Therefore, ``exon_number==10,000`` will effectively - return :attr:`GenotypeBooleanPredicate.FALSE` - for *all* patients!!! 😱 - Well, at least the patients of the *Homo sapiens sapiens* taxon... + Therefore, ``exon_number==10,000`` will effectively + return `False` for *all* variants!!! 😱 + Well, at least the genomic variants of the *Homo sapiens sapiens* taxon... :param exon: a positive `int` of the target exon :param tx_id: the accession of the transcript of interest, e.g. `NM_123456.7` @@ -195,10 +215,12 @@ class VariantExonPredicate(VariantPredicate): def __init__( self, - exon: int, + exon: int, tx_id: str, ): + assert isinstance(exon, int) and exon > 0, '`exon` must be a positive `int`' self._exon = exon + assert isinstance(tx_id, str) self._tx_id = tx_id def get_question(self) -> str: @@ -337,20 +359,20 @@ def __repr__(self): def _decode_operator(op: str) -> typing.Callable[[int, int], bool]: - if op == '<': - return operator.lt - elif op == '<=': - return operator.le - elif op == '==': - return operator.eq - elif op == '!=': - return operator.ne - elif op == '>=': - return operator.ge - elif op == '>': - return operator.gt - else: - raise ValueError(f'Unsupported operator {op}') + if op == '<': + return operator.lt + elif op == '<=': + return operator.le + elif op == '==': + return operator.eq + elif op == '!=': + return operator.ne + elif op == '>=': + return operator.ge + elif op == '>': + return operator.gt + else: + raise ValueError(f'Unsupported operator {op}') class ChangeLengthPredicate(VariantPredicate): @@ -394,7 +416,7 @@ def __repr__(self): class RefAlleleLengthPredicate(VariantPredicate): """ - `RefAlleleLengthPredicate` tests if the length of the variant's reference allele + `RefAlleleLengthPredicate` tests if the length of the variant's reference allele is greater than, equal, or less than certain value. """ @@ -455,7 +477,8 @@ def __init__( self._tx_id = tx_id def get_question(self) -> str: - return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} on protein encoded by transcript {self._tx_id}' + return f'variant affects aminoacid(s) between {self._region.start} and {self._region.end} ' \ + f'on protein encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -494,9 +517,9 @@ class ProteinFeatureTypePredicate(VariantPredicate): """ def __init__( - self, - feature_type: FeatureType, - tx_id: str, + self, + feature_type: FeatureType, + tx_id: str, protein_metadata_service: ProteinMetadataService, ): self._feature_type = feature_type @@ -504,7 +527,8 @@ def __init__( self._prot_service = protein_metadata_service def get_question(self) -> str: - return f'variant affects {self._feature_type.name} feature type on the protein encoded by transcript {self._tx_id}' + return f'variant affects {self._feature_type.name} feature type on the protein ' \ + f'encoded by transcript {self._tx_id}' def test(self, variant: Variant) -> bool: tx_anno = variant.get_tx_anno_by_tx_id(self._tx_id) @@ -519,7 +543,7 @@ def test(self, variant: Variant) -> bool: return False protein_meta = self._prot_service.annotate(protein_id) - for feat in protein_meta.protein_features: + for feat in protein_meta.protein_features: if feat.feature_type == self._feature_type and location.overlaps_with(feat.info.region): return True return False @@ -538,7 +562,11 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeatureTypePredicate(feature_type={self._feature_type}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return 'ProteinFeatureTypePredicate(' \ + f'feature_type={self._feature_type}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service}' \ + ')' class ProteinFeaturePredicate(VariantPredicate): @@ -574,7 +602,7 @@ def test(self, variant: Variant) -> bool: return False protein = self._prot_service.annotate(protein_id) - for feat in protein.protein_features: + for feat in protein.protein_features: if feat.info.name == self._feature_name and location.overlaps_with(feat.info.region): return True return False @@ -593,4 +621,7 @@ def __str__(self): return repr(self) def __repr__(self): - return f'ProteinFeaturePredicate(feature_name={self._feature_name}, tx_id={self._tx_id}, protein_metadata_service={self._prot_service})' + return f'ProteinFeaturePredicate(' \ + f'feature_name={self._feature_name}, ' \ + f'tx_id={self._tx_id}, ' \ + f'protein_metadata_service={self._prot_service})' diff --git a/src/gpsea/analysis/predicate/genotype/_variant.py b/src/gpsea/analysis/predicate/genotype/_variant.py index 54d39735..99f23638 100644 --- a/src/gpsea/analysis/predicate/genotype/_variant.py +++ b/src/gpsea/analysis/predicate/genotype/_variant.py @@ -18,6 +18,14 @@ class VariantPredicates: that are relatively simple to configure. """ + @staticmethod + def true() -> VariantPredicate: + """ + Prepare an absolutely inclusive :class:`VariantPredicate` - a predicate that returns `True` + for any variant whatsoever. + """ + return AlwaysTrueVariantPredicate.get_instance() + @staticmethod def all(predicates: typing.Iterable[VariantPredicate]) -> VariantPredicate: """ @@ -140,6 +148,21 @@ def exon( """ Prepare a :class:`VariantPredicate` that tests if the variant overlaps with an exon of a specific transcript. + .. warning:: + + We use 1-based numbering to number the exons, + not the usual 0-based numbering of the computer science. + Therefore, the first exon of the transcript + has ``exon_number==1``, the second exon is ``2``, and so on ... + + .. warning:: + + We do not check if the `exon_number` spans + beyond the number of exons of the given `transcript_id`! + Therefore, ``exon_number==10,000`` will effectively return `False` + for *all* variants!!! 😱 + Well, at least the genome variants of the *Homo sapiens sapiens* taxon... + Args: exon: a non-negative `int` with the index of the target exon (e.g. `0` for the 1st exon, `1` for the 2nd, ...) diff --git a/src/gpsea/preprocessing/_uniprot.py b/src/gpsea/preprocessing/_uniprot.py index d1adb98b..ec17ac0d 100644 --- a/src/gpsea/preprocessing/_uniprot.py +++ b/src/gpsea/preprocessing/_uniprot.py @@ -10,10 +10,8 @@ class UniprotProteinMetadataService(ProteinMetadataService): """A class that creates ProteinMetadata objects from data found with the Uniprot REST API. - More info on the Uniprot REST API here - https://www.uniprot.org/help/programmatic_access - - Methods: - annotate(protein_id:str): Gets metadata and creates ProteinMetadata for given protein ID + More info on the Uniprot REST API are + in the `Programmatic access `_ section. """ def __init__( @@ -27,8 +25,8 @@ def __init__( @staticmethod def parse_uniprot_json( - payload: typing.Mapping[str, typing.Any], - protein_id: str, + payload: typing.Mapping[str, typing.Any], + protein_id: str, ) -> ProteinMetadata: """ Try to extract `ProteinMetadata` corresponding to `protein_id` from the Uniprot JSON `payload`. @@ -44,39 +42,52 @@ def parse_uniprot_json( results = payload['results'] if len(results) == 0: raise ValueError(f"No proteins found for ID {protein_id}. Please verify refseq ID.") + elif len(results) == 1: + # In case we get only one result, let's use it! + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=results[0], + ) + else: + for protein in results: + if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): + return UniprotProteinMetadataService._extract_metadata( + protein_id=protein_id, + data=protein, + ) - for protein in results: - if any(uni['id'] == protein_id for uni in protein['uniProtKBCrossReferences']): - # `protein` has a cross-reference to the `protein_id` of interest - try: - protein_name = protein['proteinDescription']['recommendedName']['fullName']['value'] - except KeyError: - protein_name = protein['proteinDescription']['submissionNames'][0]['fullName']['value'] + raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') - all_features_list = [] - try: - for feature in protein['features']: - feature_start = int(feature['location']['start']['value']) - feature_end = int(feature['location']['end']['value']) - feature_name = feature['description'] - feature_info = FeatureInfo( - feature_name, - Region(start=feature_start, end=feature_end), - ) - feature_type = FeatureType[feature['type'].upper()] - protein_feature = ProteinFeature.create(feature_info, feature_type) - all_features_list.append(protein_feature) - except KeyError: - raise ValueError(f"No features for {protein_id}") + @staticmethod + def _extract_metadata(protein_id: str, data: typing.Mapping[str, typing.Any]) -> ProteinMetadata: + # `protein` has a cross-reference to the `protein_id` of interest + try: + protein_name = data['proteinDescription']['recommendedName']['fullName']['value'] + except KeyError: + protein_name = data['proteinDescription']['submissionNames'][0]['fullName']['value'] - try: - protein_length = protein["sequence"]["length"] - except KeyError as e: - raise ValueError(e) + all_features_list = [] + try: + for feature in data['features']: + feature_start = int(feature['location']['start']['value']) + feature_end = int(feature['location']['end']['value']) + feature_name = feature['description'] + feature_info = FeatureInfo( + feature_name, + Region(start=feature_start, end=feature_end), + ) + feature_type = FeatureType[feature['type'].upper()] + protein_feature = ProteinFeature.create(feature_info, feature_type) + all_features_list.append(protein_feature) + except KeyError: + raise ValueError(f"No features for {protein_id}") - return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) + try: + protein_length = data["sequence"]["length"] + except KeyError as e: + raise ValueError(e) - raise ValueError(f'Could not find an entry for {protein_id} in Uniprot response') + return ProteinMetadata(protein_id, protein_name, all_features_list, protein_length) def annotate(self, protein_id: str) -> ProteinMetadata: """ @@ -96,4 +107,5 @@ def annotate(self, protein_id: str) -> ProteinMetadata: raise ValueError(f"only works with a RefSeq database ID (e.g. NP_037407.4), but we got {protein_id}") api_url = self._url % protein_id response = requests.get(api_url, timeout=self._timeout).json() + return UniprotProteinMetadataService.parse_uniprot_json(response, protein_id) diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index 6d1b0988..425c63f5 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -7,9 +7,10 @@ sex_predicate, monoallelic_predicate, biallelic_predicate, + autosomal_dominant, + autosomal_recessive, VariantPredicates, VariantPredicate, - ModeOfInheritancePredicate, ) @@ -102,7 +103,30 @@ def test_autosomal_dominant( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate) + predicate = autosomal_dominant(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.name == name + + @pytest.mark.parametrize( + "patient_name,name", + [ + ("adam", "HET"), # 0/0 & 0/1 + ("eve", "HET"), # 0/1 & 0/0 + ("cain", "HET"), # 0/1 & 0/0 + ], + ) + def test_autosomal_dominant__with_default_predicate( + self, + patient_name: str, + name: str, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = autosomal_dominant() categorization = predicate.test(patient) @@ -127,7 +151,32 @@ def test_autosomal_recessive( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate) + predicate = autosomal_recessive(variant_predicate) + + categorization = predicate.test(patient) + + assert categorization is not None + + assert categorization.category.name == name + + @pytest.mark.parametrize( + "patient_name,name", + [ + # The White family has two variants: + ("walt", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("skyler", "BIALLELIC_ALT"), # 0/1 & 0/1 + ("flynn", "BIALLELIC_ALT"), # 1/1 & 0/0 + ("holly", "BIALLELIC_ALT"), # 0/0 & 1/1 + ], + ) + def test_autosomal_recessive__with_default_predicate( + self, + patient_name: str, + name: str, + request: pytest.FixtureRequest, + ): + patient = request.getfixturevalue(patient_name) + predicate = autosomal_recessive() categorization = predicate.test(patient) diff --git a/tests/analysis/predicate/genotype/test_predicates.py b/tests/analysis/predicate/genotype/test_predicates.py index a076adac..e85e3a3f 100644 --- a/tests/analysis/predicate/genotype/test_predicates.py +++ b/tests/analysis/predicate/genotype/test_predicates.py @@ -1,12 +1,19 @@ import pytest -from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates, VariantPredicate +from gpsea.analysis.predicate.genotype import VariantPredicates, ProteinPredicates from gpsea.model import * from gpsea.model.genome import * class TestVariantPredicates: + def test_always_true_predicate( + self, + suox_cohort: Cohort, + ): + predicate = VariantPredicates.true() + assert all(predicate.test(v) for v in suox_cohort.all_variants()) + @pytest.mark.parametrize( 'effect, expected', [ @@ -46,7 +53,7 @@ def test_variant_key_predicate( @pytest.mark.parametrize( 'exon, expected', [ - (0, False), + (1, False), (4, True), (5, False), ] @@ -61,6 +68,11 @@ def test_exon_predicate( assert predicate.test(missense_variant) == expected + def test_exon_predicate_fails_on_invalid_exon(self): + with pytest.raises(AssertionError) as e: + VariantPredicates.exon(0, tx_id='tx:xyz') + assert e.value.args[0] == '`exon` must be a positive `int`' + @pytest.mark.parametrize( 'tx_id, expected', [ diff --git a/tests/conftest.py b/tests/conftest.py index 5459ce3f..900e7f45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.analysis.pcats import HpoTermAnalysisResult -from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, ModeOfInheritancePredicate, VariantPredicates +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, autosomal_dominant from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONDecoder from gpsea.model import * @@ -119,7 +119,7 @@ def suox_mane_tx_id() -> str: def suox_gt_predicate( suox_mane_tx_id: str, ) -> GenotypePolyPredicate: - return ModeOfInheritancePredicate.autosomal_dominant( + return autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id diff --git a/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json new file mode 100644 index 00000000..4e5fdb0c --- /dev/null +++ b/tests/preprocessing/data/uniprot_response/ITPR1_HUMAN.json @@ -0,0 +1,446 @@ +{ + "results": [ + { + "entryType": "UniProtKB reviewed (Swiss-Prot)", + "primaryAccession": "Q14643", + "uniProtkbId": "ITPR1_HUMAN", + "proteinDescription": { + "recommendedName": { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000305" + } + ], + "value": "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + } + }, + "alternativeNames": [ + { + "fullName": { + "value": "IP3 receptor isoform 1" + }, + "shortNames": [ + { + "value": "IP3R 1" + }, + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "InsP3R1" + } + ] + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "8648241" + } + ], + "value": "Inositol 1,4,5 trisphosphate receptor" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7500840" + } + ], + "value": "Inositol 1,4,5-trisphosphate receptor type 1" + } + }, + { + "fullName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "Type 1 inositol 1,4,5-trisphosphate receptor" + }, + "shortNames": [ + { + "value": "Type 1 InsP3 receptor" + } + ] + } + ] + }, + "genes": [ + { + "geneName": { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7852357" + }, + { + "evidenceCode": "ECO:0000312", + "source": "HGNC", + "id": "HGNC:6180" + } + ], + "value": "ITPR1" + }, + "synonyms": [ + { + "evidences": [ + { + "evidenceCode": "ECO:0000303", + "source": "PubMed", + "id": "7945203" + } + ], + "value": "INSP3R1" + } + ] + } + ], + "features": [ + { + "type": "Domain", + "location": { + "start": { + "value": 112, + "modifier": "EXACT" + }, + "end": { + "value": 166, + "modifier": "EXACT" + } + }, + "description": "MIR 1", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 173, + "modifier": "EXACT" + }, + "end": { + "value": 223, + "modifier": "EXACT" + } + }, + "description": "MIR 2", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 231, + "modifier": "EXACT" + }, + "end": { + "value": 287, + "modifier": "EXACT" + } + }, + "description": "MIR 3", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 294, + "modifier": "EXACT" + }, + "end": { + "value": 373, + "modifier": "EXACT" + } + }, + "description": "MIR 4", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Domain", + "location": { + "start": { + "value": 379, + "modifier": "EXACT" + }, + "end": { + "value": 435, + "modifier": "EXACT" + } + }, + "description": "MIR 5", + "evidences": [ + { + "evidenceCode": "ECO:0000255", + "source": "PROSITE-ProRule", + "id": "PRU00131" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1015, + "modifier": "EXACT" + }, + "end": { + "value": 1036, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1146, + "modifier": "EXACT" + }, + "end": { + "value": 1178, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1708, + "modifier": "EXACT" + }, + "end": { + "value": 1740, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1760, + "modifier": "EXACT" + }, + "end": { + "value": 1796, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1890, + "modifier": "EXACT" + }, + "end": { + "value": 1915, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 1939, + "modifier": "EXACT" + }, + "end": { + "value": 1960, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2472, + "modifier": "EXACT" + }, + "end": { + "value": 2537, + "modifier": "EXACT" + } + }, + "description": "Interaction with ERP44", + "evidences": [ + { + "evidenceCode": "ECO:0000250", + "source": "UniProtKB", + "id": "P11881" + } + ] + }, + { + "type": "Region", + "location": { + "start": { + "value": 2729, + "modifier": "EXACT" + }, + "end": { + "value": 2758, + "modifier": "EXACT" + } + }, + "description": "Disordered", + "evidences": [ + { + "evidenceCode": "ECO:0000256", + "source": "SAM", + "id": "MobiDB-lite" + } + ] + } + ], + "uniProtKBCrossReferences": [ + { + "database": "RefSeq", + "id": "NP_001093422.2", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001099952.2" + } + ], + "isoformId": "Q14643-3" + }, + { + "database": "RefSeq", + "id": "NP_001161744.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_001168272.1" + } + ], + "isoformId": "Q14643-2" + }, + { + "database": "RefSeq", + "id": "NP_002213.5", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "NM_002222.5" + } + ], + "isoformId": "Q14643-4" + }, + { + "database": "RefSeq", + "id": "XP_011531985.1", + "properties": [ + { + "key": "NucleotideSequenceId", + "value": "XM_011533683.2" + } + ] + } + ], + "sequence": { + "length": 2758 + }, + "extraAttributes": { + "uniParcId": "UPI00015E0851" + } + } + ] +} \ No newline at end of file diff --git a/tests/preprocessing/test_uniprot.py b/tests/preprocessing/test_uniprot.py index f87e478e..461ca863 100644 --- a/tests/preprocessing/test_uniprot.py +++ b/tests/preprocessing/test_uniprot.py @@ -15,6 +15,11 @@ def fpath_uniprot_response_dir( return os.path.join(fpath_preprocessing_data_dir, "uniprot_response") +def read_json_payload(path: str) -> typing.Mapping[str, typing.Any]: + with open(path) as fh: + return json.load(fh) + + class TestUniprotProteinMetadataService: @pytest.fixture @@ -27,8 +32,7 @@ def zn462_human_uniprot_json( fpath_uniprot_response_dir: str, ) -> typing.Mapping[str, typing.Any]: fpath_zn462_human = os.path.join(fpath_uniprot_response_dir, "ZN462_HUMAN.json") - with open(fpath_zn462_human) as f: - return json.load(f) + return read_json_payload(fpath_zn462_human) def test_zn462( self, @@ -44,6 +48,24 @@ def test_zn462( assert protein_metadata.label == "Ankyrin repeat domain-containing protein 11" assert len(protein_metadata.protein_features) == 16 assert protein_metadata.protein_length == 2663 + + def test_itpr1( + self, + fpath_uniprot_response_dir: str, + ): + response_json_path = os.path.join(fpath_uniprot_response_dir, 'ITPR1_HUMAN.json') + response_json = read_json_payload(response_json_path) + + protein_id = "NP_001365381.1" + protein_metadata = UniprotProteinMetadataService.parse_uniprot_json( + payload=response_json, + protein_id=protein_id, + ) + + assert protein_metadata.protein_id == protein_id + assert protein_metadata.label == "Inositol 1,4,5-trisphosphate-gated calcium channel ITPR1" + assert len(protein_metadata.protein_features) == 13 + assert protein_metadata.protein_length == 2758 @pytest.mark.skip("Run manually to regenerate SUOX `NP_001027558.1` metadata") def test_fetch_suox_protein_metadata(