From 22bc9dfc6c666e25a54e1f80a119c91234580f82 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 14:51:27 +0200 Subject: [PATCH 1/5] Improve Pydoc for monoallelic and biallelic predicates. --- .../predicate/genotype/_gt_predicates.py | 85 ++++++++++++------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index c1209330..f5fc9284 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -214,19 +214,32 @@ def monoallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ + The predicate bins patient into one of two groups, `A` and `B`, + based on presence of *exactly* one allele of a variant + that meets the predicate criteria. + + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: - The predicate bins patient into one of two groups: `A` and `B`: - - +-----------+------------+------------+ - | Group | `A` count | `B` count | - +===========+============+============+ - | A | 1 | 0 | - +-----------+------------+------------+ - | B | 0 | 1 | - +-----------+------------+------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 2`) - are assigned the ``None`` group and, thus, omitted from the analysis. + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | A | 1 | 0 | + +-----------+-------------------+-------------------+ + | B | 0 | 1 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. 
:math:`count_{A} = 0` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and, thus, omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.monoallelic( a_predicate=a_predicate, @@ -241,24 +254,36 @@ def biallelic_predicate( names: typing.Tuple[str, str] = ('A', 'B'), ) -> GenotypePolyPredicate: """ - Get a predicate for binning the individuals into groups, - with respect to allele counts of variants selected by `a_predicate` and `b_predicate`. - - The predicate bins patient into one of three groups: `AA`, `AB` and `BB`: - - +-----------+------------------+------------------+ - | Group | `A` allele count | `B` allele count | - +===========+==================+==================+ - | AA | 2 | 0 | - +-----------+------------------+------------------+ - | AB | 1 | 1 | - +-----------+------------------+------------------+ - | AA | 0 | 2 | - +-----------+------------------+------------------+ - - Individuals with different allele counts (e.g. :math:`count_{A} = 0` and :math:`count_{B} = 1`) - are assigned the ``None`` group and, thus, omitted from the analysis. - + The predicate bins patient into one of the three groups, + `AA`, `AB`, and `BB`, + based on presence of one or two variant alleles + that meet the predicate criteria. 
+ + The number of alleles :math:`count_{A}` and :math:`count_{B}` + is computed using `a_predicate` and `b_predicate` + and the individual is assigned into a group + based on the following table: + + +-----------+-------------------+-------------------+ + | Group | :math:`count_{A}` | :math:`count_{B}` | + +===========+===================+===================+ + | AA | 2 | 0 | + +-----------+-------------------+-------------------+ + | AB | 1 | 1 | + +-----------+-------------------+-------------------+ + | BB | 0 | 2 | + +-----------+-------------------+-------------------+ + + The individuals with different allele counts + (e.g. :math:`count_{A} = 1` and :math:`count_{B} = 2`) + are assigned into the ``None`` group and will be, thus, + omitted from the analysis. + + :param a_predicate: predicate to test if the variants + meet the criteria of the first group (named `A` by default). + :param b_predicate: predicate to test if the variants + meet the criteria of the second group (named `B` by default). + :param names: group names (default ``('A', 'B')``). """ return PolyCountingGenotypePredicate.biallelic( a_predicate=a_predicate, From 7aa36f9887c9dfde8ce121ec2a7ea6404d09e103 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 15:05:25 +0200 Subject: [PATCH 2/5] Compute __eq__ and __hash__ on allele predicate. 
--- .../predicate/genotype/_gt_predicates.py | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index f5fc9284..0ff4bf4c 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -189,7 +189,21 @@ def __init__( self._categorizations = tuple(count2cat.values()) self._a_counter = a_counter self._b_counter = b_counter + self._hash = self._compute_hash() + def _compute_hash(self) -> int: + hash_value = 17 + + self._groups = defaultdict(list) + for count, cat in self._count2cat.items(): + hash_value += 13 * hash(count) + hash_value += 13 * hash(cat) + + hash_value += 23 * hash(self._a_counter) + hash_value += 23 * hash(self._b_counter) + + return hash_value + def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -205,7 +219,14 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return self._count2cat.get(counts, None) - # TODO: implement __hash__, __eq__ + def __eq__(self, value: object) -> bool: + return isinstance(value, PolyCountingGenotypePredicate) \ + and self._count2cat == value._count2cat \ + and self._a_counter == value._a_counter \ + and self._b_counter == value._b_counter + + def __hash__(self) -> int: + return self._hash def monoallelic_predicate( From 634b43cd47b96df43a754962936473f478fa07be Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:26:15 +0200 Subject: [PATCH 3/5] Do not export `ModeOfInheritancePredicate`. 
--- .../mode_of_inheritance_predicate.rst | 26 +++--- docs/user-guide/stats.rst | 4 +- .../analysis/predicate/genotype/__init__.py | 2 + .../predicate/genotype/_gt_predicates.py | 88 +++++++++++++++---- .../predicate/genotype/test_gt_predicates.py | 11 +-- 5 files changed, 94 insertions(+), 37 deletions(-) diff --git a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst index f213e9a7..6c4178e8 100644 --- a/docs/user-guide/predicates/mode_of_inheritance_predicate.rst +++ b/docs/user-guide/predicates/mode_of_inheritance_predicate.rst @@ -9,11 +9,11 @@ autosomal recessive, X-linked dominant, X-linked recessive, and mitochondrial (See `Understanding Genetics, Appendix B `_). -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` -assigns the individual into a group based on the number of alleles -that match a condition specified by a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate`. -The :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` supports -the following Mendelian modes of inheritance (MoI): +The :func:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +and :func:`~gpsea.analysis.predicate.genotype.autosomal_recessive` +assign the individual into a group based on the number of the alleles +observed in the individual. +GPSEA supports the following Mendelian modes of inheritance (MoI): +-----------------------+------------------+------------------------+ @@ -40,11 +40,11 @@ the following Mendelian modes of inheritance (MoI): `BIALLELIC_ALT` includes both homozygous and compound heterozygous genotypes. Clinical judgment should be used to choose the MoI for the cohort analysis. 
-Then a predicate for the desired MoI can be created by one of -:class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate` static constructors: +Then a predicate for the desired MoI can be created by calling one +of the following methods: -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_dominant` -* :func:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_dominant` +* :func:`~gpsea.analysis.predicate.genotype.autosomal_recessive` By default, the MoI predicates will use *all* variants recorded in the individual. However, a :class:`~gpsea.analysis.predicate.genotype.VariantPredicate` @@ -57,11 +57,11 @@ Assign individuals into genotype groups Here we show seting up a predicate for grouping individuals for differences between genotypes of a disease with an autosomal recessive MoI. -We use :class:`~gpsea.analysis.predicate.genotype.ModeOfInheritancePredicate.autosomal_recessive` +We use :class:`~gpsea.analysis.predicate.genotype.autosomal_recessive` to create the predicate: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive() +>>> from gpsea.analysis.predicate.genotype import autosomal_recessive +>>> gt_predicate = autosomal_recessive() >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' @@ -88,6 +88,6 @@ when assigning the genotype group. 
We set up the variant predicate: and we use it to create the MoI predicate: ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_recessive(is_missense) +>>> gt_predicate = autosomal_recessive(is_missense) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET, BIALLELIC_ALT' diff --git a/docs/user-guide/stats.rst b/docs/user-guide/stats.rst index 3247a121..cb622526 100644 --- a/docs/user-guide/stats.rst +++ b/docs/user-guide/stats.rst @@ -136,8 +136,8 @@ to test if the variant leads to a frameshift (in this case): and then we choose the expected mode of inheritance to test. In case of *TBX5*, we expect the autosomal dominant mode of inheritance: ->>> from gpsea.analysis.predicate.genotype import ModeOfInheritancePredicate ->>> gt_predicate = ModeOfInheritancePredicate.autosomal_dominant(is_frameshift) +>>> from gpsea.analysis.predicate.genotype import autosomal_dominant +>>> gt_predicate = autosomal_dominant(is_frameshift) >>> gt_predicate.display_question() 'What is the genotype group: HOM_REF, HET' diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index b9bd8e8a..a27bc0fb 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -2,6 +2,7 @@ from ._api import VariantPredicate from ._counter import AlleleCounter from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate +from ._gt_predicates import autosomal_dominant, autosomal_recessive from ._gt_predicates import monoallelic_predicate, biallelic_predicate from ._gt_predicates import ModeOfInheritancePredicate from ._variant import VariantPredicates, ProteinPredicates @@ -9,6 +10,7 @@ __all__ = [ 'GenotypePolyPredicate', 'groups_predicate', 'sex_predicate', 'diagnosis_predicate', + 'autosomal_dominant', 'autosomal_recessive', 'monoallelic_predicate', 'biallelic_predicate', 'ModeOfInheritancePredicate', 'AlleleCounter', 'VariantPredicate', 
diff --git a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py index 0ff4bf4c..c5dbe07f 100644 --- a/src/gpsea/analysis/predicate/genotype/_gt_predicates.py +++ b/src/gpsea/analysis/predicate/genotype/_gt_predicates.py @@ -1,19 +1,18 @@ import dataclasses import typing +import warnings from collections import defaultdict import hpotk -from gpsea.analysis.predicate.genotype._variant import VariantPredicates from gpsea.model import Patient, Sex from .._api import Categorization, PatientCategory from ._api import GenotypePolyPredicate from ._api import VariantPredicate from ._counter import AlleleCounter - -# TODO: implement __hash__, __eq__ on predicates +from ._variant import VariantPredicates class AlleleCountingGroupsPredicate(GenotypePolyPredicate): @@ -313,6 +312,47 @@ def biallelic_predicate( ) +def autosomal_dominant( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either + into homozygous reference or heterozygous + group in line with the autosomal dominant mode of inheritance. + + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used. + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + ) + + +def autosomal_recessive( + variant_predicate: typing.Optional[VariantPredicate] = None, +) -> GenotypePolyPredicate: + """ + Create a predicate that assigns the patient either into + homozygous reference, heterozygous, or biallelic alternative allele + (homozygous alternative or compound heterozygous) + group in line with the autosomal recessive mode of inheritance. 
+ + :param variant_predicate: a predicate for choosing the variants for testing + or `None` if all variants should be used + """ + if variant_predicate is None: + variant_predicate = VariantPredicates.true() + + return ModeOfInheritancePredicate._from_moi_info( + variant_predicate=variant_predicate, + mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + ) + + @dataclasses.dataclass(eq=True, frozen=True) class GenotypeGroup: allele_count: int @@ -445,7 +485,7 @@ class ModeOfInheritancePredicate(GenotypePolyPredicate): @staticmethod def autosomal_dominant( variant_predicate: typing.Optional[VariantPredicate] = None, - ) -> "ModeOfInheritancePredicate": + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference or heterozygous @@ -453,18 +493,18 @@ def autosomal_dominant( :param variant_predicate: a predicate for choosing the variants for testing. """ - if variant_predicate is None: - variant_predicate = VariantPredicates.true() - - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_dominant(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_dominant` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_dominant(variant_predicate) + @staticmethod def autosomal_recessive( variant_predicate: typing.Optional[VariantPredicate] = None, - ) -> "ModeOfInheritancePredicate": + ) -> GenotypePolyPredicate: """ Create a predicate that assigns the patient either into homozygous reference, heterozygous, or biallelic alternative allele @@ -473,14 +513,14 @@ def autosomal_recessive( :param variant_predicate: a predicate for choosing the variants for testing. 
""" - if variant_predicate is None: - variant_predicate = VariantPredicates.true() - - return ModeOfInheritancePredicate._from_moi_info( - variant_predicate=variant_predicate, - mode_of_inheritance_data=ModeOfInheritanceInfo.autosomal_recessive(), + # TODO: remove before 1.0.0 + warnings.warn( + "Use `gpsea.analysis.predicate.genotype.autosomal_recessive` instead", + DeprecationWarning, stacklevel=2, ) + return autosomal_recessive(variant_predicate) + @staticmethod def _from_moi_info( variant_predicate: VariantPredicate, @@ -599,6 +639,12 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: else: return None + def __eq__(self, value: object) -> bool: + return isinstance(value, SexGenotypePredicate) + + def __hash__(self) -> int: + return 31 + INSTANCE = SexGenotypePredicate() @@ -662,6 +708,7 @@ def __init__( self._categorizations = tuple( sorted(categorizations.values(), key=lambda c: c.category.cat_id) ) + self._hash = hash(tuple(categorizations.items())) def get_categorizations(self) -> typing.Sequence[Categorization]: return self._categorizations @@ -686,6 +733,13 @@ def test(self, patient: Patient) -> typing.Optional[Categorization]: return None return categorization + + def __eq__(self, value: object) -> bool: + return isinstance(value, DiagnosisPredicate) \ + and self._id2cat == value._id2cat + + def __hash__(self) -> int: + return self._hash def diagnosis_predicate( diff --git a/tests/analysis/predicate/genotype/test_gt_predicates.py b/tests/analysis/predicate/genotype/test_gt_predicates.py index cd8e7ac6..425c63f5 100644 --- a/tests/analysis/predicate/genotype/test_gt_predicates.py +++ b/tests/analysis/predicate/genotype/test_gt_predicates.py @@ -7,9 +7,10 @@ sex_predicate, monoallelic_predicate, biallelic_predicate, + autosomal_dominant, + autosomal_recessive, VariantPredicates, VariantPredicate, - ModeOfInheritancePredicate, ) @@ -102,7 +103,7 @@ def test_autosomal_dominant( request: pytest.FixtureRequest, ): patient = 
request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant(variant_predicate) + predicate = autosomal_dominant(variant_predicate) categorization = predicate.test(patient) @@ -125,7 +126,7 @@ def test_autosomal_dominant__with_default_predicate( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_dominant() + predicate = autosomal_dominant() categorization = predicate.test(patient) @@ -150,7 +151,7 @@ def test_autosomal_recessive( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive(variant_predicate) + predicate = autosomal_recessive(variant_predicate) categorization = predicate.test(patient) @@ -175,7 +176,7 @@ def test_autosomal_recessive__with_default_predicate( request: pytest.FixtureRequest, ): patient = request.getfixturevalue(patient_name) - predicate = ModeOfInheritancePredicate.autosomal_recessive() + predicate = autosomal_recessive() categorization = predicate.test(patient) From d7109b43cbe7428a1bd1787204e68c118030e96f Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:44:02 +0200 Subject: [PATCH 4/5] Improve legibility of the HPO MTC filter docs section. --- docs/user-guide/mtc.rst | 162 ++++++++++++++++++++++------------------ 1 file changed, 91 insertions(+), 71 deletions(-) diff --git a/docs/user-guide/mtc.rst b/docs/user-guide/mtc.rst index b01c95d1..7f47e228 100644 --- a/docs/user-guide/mtc.rst +++ b/docs/user-guide/mtc.rst @@ -192,76 +192,96 @@ We use static constructor :func:`~gpsea.analysis.mtc_filter.HpoMtcFilter.default for creating :class:`~gpsea.analysis.mtc_filter.HpoMtcFilter`. The constructor takes a threshold as an argument (e.g. 
20% in the example above) and the method's logic is made up of 8 individual heuristics -designed to skip testing the HPO terms that are unlikely to yield significant or interesting results: - -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| Code | Name | Description | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF01` | Skip terms that | The ``term_frequency_threshold`` determines the mininum proportion of individuals | -| | occur very rarely | with direct or indirect annotation by the HPO term to test. | -| | | We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), | -| | | and we only retain a term for testing if the proportion of individuals | -| | | in at least one genotype group is greater than | -| | | or equal to ``term_frequency_threshold``. | -| | | This is because of our assumption that even if there is statistical significance, | -| | | if a term is only seen in (for example) 7% of individuals | -| | | in the MISSENSE group and 2% in the not-MISSENSE group, | -| | | the term is unlikely to be of great interest because it is rare. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF02` | Skip terms if | In a related heuristic, we skip terms if no genotype group has more | -| | no cell has more | than one count. This is not completely redundant with the previous condition, | -| | than one count | because some terms may have a small number of total observations. 
| -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF03` | Skip terms if | Let's say a term such as | -| | all counts are | `Posterior polar cataract (HP:0001115) `_ | -| | identical | was observed in 7 of 11 individuals with MISSENSE variants | -| | to counts | and in 3 of 8 individuals with NONSENSE variants. | -| | for a child | If we find the same patient counts (7 of 11 and 3 of 8) in the parent term | -| | term | `Polar cataract HP:0010696 `_, | -| | | then we choose to not test the parent term. | -| | | | -| | | This is because the more specific an HPO term is, | -| | | the more information it has (the more interesting the correlation would be if it exists), | -| | | and the result of a test, such as the Fisher Exact test, would be exactly the same | -| | | for *Polar cataract* as for *Posterior polar cataract*. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF04` | Skip terms if | If both (or all) of the genotype groups have the same proportion of individuals | -| | genotypes have | observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, | -| | same HPO | because it is not possible that the Fisher exact test will return a significant result. | -| | proportions | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF05` | Skip terms if | If one of the genotype groups has neither observed nor excluded observations | -| | there are no | for an HPO term, skip it. 
| -| | HPO observations | | -| | in a group | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF06` | Skip term if | If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) | -| | underpowered | and the total count of patients in all genotype-phenotype groups is less than 7, | -| | for 2x2 or 2x3 | or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients | -| | analysis | is less than 6, then there is a lack even of the nominal statistical power | -| | | and the counts can never be significant. | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF07` | Skipping terms | The HPO has a number of other branches that describe modes of inheritance, | -| | that are not | past medical history, and clinical modifiers. | -| | descendents of | We do not think it makes much sense to test for enrichment of these terms, | -| | *Phenotypic* | so, all terms that are not descendants of | -| | *abnormality* | `Phenotypic abnormality `_ are filtered out. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ -| `HMF08` | Skipping | All the direct children of the root phenotype term | -| | "general" | `Phenotypic abnormality (HP:0000118) `_ | -| | level terms | are skipped, because of the assumption that if there is a valid signal, | -| | | it will derive from one of the more specific descendents. | -| | | | -| | | For instance, | -| | |`Abnormality of the nervous system `_ | -| | | (HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies | -| | | that if there is a signal from the nervous system, | -| | | it will lead to at least one of the descendents of | -| | | *Abnormality of the nervous system* being significant. 
| -| | | | -| | | See :ref:`general-hpo-terms` section for details. | -| | | | -+------------+-------------------+--------------------------------------------------------------------------------------------+ +designed to skip testing the HPO terms that are unlikely to yield significant or interesting results. +`HMF01` - Skip terms that occur very rarely +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The ``term_frequency_threshold`` determines the minimum proportion of individuals +with direct or indirect annotation by the HPO term to test. +We check each of the genotype groups (e.g., MISSENSE vs. not-MISSENSE), +and we only retain a term for testing if the proportion of individuals +in at least one genotype group is greater than +or equal to ``term_frequency_threshold``. +This is because of our assumption that even if there is statistical significance, +if a term is only seen in (for example) 7% of individuals +in the MISSENSE group and 2% in the not-MISSENSE group, +the term is unlikely to be of great interest because it is rare. + + +`HMF02` - Skip terms if no genotype group has more than one count +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In a related heuristic, we skip terms if no genotype group has more +than one count. This is not completely redundant with the previous condition, +because some terms may have a small number of total observations. + + +`HMF03` - Skip terms if all counts are identical to counts for a child term +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Let's say a term such as +`Posterior polar cataract (HP:0001115) `_ +was observed in 7 of 11 individuals with MISSENSE variants +and in 3 of 8 individuals with NONSENSE variants. +If we find the same patient counts (7 of 11 and 3 of 8) in the parent term +`Polar cataract HP:0010696 `_, +then we choose to not test the parent term. 
+ +This is because the more specific an HPO term is, +the more information it has (the more interesting the correlation would be if it exists), +and the result of a test, such as the Fisher Exact test, would be exactly the same +for *Polar cataract* as for *Posterior polar cataract*. + + +`HMF04` - Skip terms if genotypes have same HPO proportions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If both (or all) of the genotype groups have the same proportion of individuals +observed to be annotated to an HPO term, e.g., both are 50%, then skip the term, +because it is not possible that the Fisher exact test will return a significant result. + + +`HMF05` - Skip term if one of the genotype groups has neither observed nor excluded observations +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Skip terms if there are no HPO observations in a group. + + +`HMF06` - Skip term if underpowered for 2x2 or 2x3 analysis +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If the individuals are binned into 2 phenotype groups and 2 genotype groups (2x2) +and the total count of patients in all genotype-phenotype groups is less than 7, +or into 2 phenotype groups and 3 genotype groups (2x3) and the total count of patients +is less than 6, then there is a lack even of the nominal statistical power +and the counts can never be significant. + + +`HMF07` - Skipping terms that are not descendents of *Phenotypic abnormality* +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The HPO has a number of other branches that describe modes of inheritance, +past medical history, and clinical modifiers. +We do not think it makes much sense to test for enrichment of these terms, +so, all terms that are not descendants of +`Phenotypic abnormality `_ are filtered out. 
+ + +`HMF08` - Skipping "general" level terms +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All the direct children of the root phenotype term +`Phenotypic abnormality (HP:0000118) `_ +are skipped, because of the assumption that if there is a valid signal, +it will derive from one of the more specific descendents. + +For instance, +`Abnormality of the nervous system `_ +(HP:0000707) is a child of *Phenotypic abnormality*, and this assumption implies +that if there is a signal from the nervous system, +it will lead to at least one of the descendents of +*Abnormality of the nervous system* being significant. + +See :ref:`general-hpo-terms` section for details. From 1b3c848ffb7ae54f031f9377bc0c2b579b7adf02 Mon Sep 17 00:00:00 2001 From: Daniel Danis Date: Thu, 12 Sep 2024 16:48:44 +0200 Subject: [PATCH 5/5] Prepare to unexport `ModeOfInheritancePredicate`. --- src/gpsea/analysis/predicate/genotype/__init__.py | 2 +- tests/conftest.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpsea/analysis/predicate/genotype/__init__.py b/src/gpsea/analysis/predicate/genotype/__init__.py index a27bc0fb..9bf0b26a 100644 --- a/src/gpsea/analysis/predicate/genotype/__init__.py +++ b/src/gpsea/analysis/predicate/genotype/__init__.py @@ -4,7 +4,7 @@ from ._gt_predicates import groups_predicate, sex_predicate, diagnosis_predicate from ._gt_predicates import autosomal_dominant, autosomal_recessive from ._gt_predicates import monoallelic_predicate, biallelic_predicate -from ._gt_predicates import ModeOfInheritancePredicate +from ._gt_predicates import ModeOfInheritancePredicate # TODO: remove before 1.0.0 from ._variant import VariantPredicates, ProteinPredicates __all__ = [ diff --git a/tests/conftest.py b/tests/conftest.py index 5459ce3f..900e7f45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,7 @@ from gpsea.analysis.mtc_filter import PhenotypeMtcResult from gpsea.analysis.pcats import HpoTermAnalysisResult -from 
gpsea.analysis.predicate.genotype import GenotypePolyPredicate, ModeOfInheritancePredicate, VariantPredicates +from gpsea.analysis.predicate.genotype import GenotypePolyPredicate, VariantPredicates, autosomal_dominant from gpsea.analysis.predicate.phenotype import PhenotypePolyPredicate, HpoPredicate from gpsea.io import GpseaJSONDecoder from gpsea.model import * @@ -119,7 +119,7 @@ def suox_mane_tx_id() -> str: def suox_gt_predicate( suox_mane_tx_id: str, ) -> GenotypePolyPredicate: - return ModeOfInheritancePredicate.autosomal_dominant( + return autosomal_dominant( variant_predicate=VariantPredicates.variant_effect( effect=VariantEffect.MISSENSE_VARIANT, tx_id=suox_mane_tx_id