From 761be021f805b38292ede75409467d1021ceea9b Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Fri, 29 Sep 2023 14:47:22 -0500 Subject: [PATCH 1/2] Fixed bugs that came up with a subjects in RPGRIP --- ...oskin_PMID_34722527_RunGenoPhenoCorr.ipynb | 801 ++++++++++++++++++ src/genophenocorr/analysis/_analyzers.py | 13 +- .../preprocessing/_phenopacket.py | 5 +- src/genophenocorr/preprocessing/_variant.py | 5 +- 4 files changed, 816 insertions(+), 8 deletions(-) create mode 100644 notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb diff --git a/notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb b/notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb new file mode 100644 index 00000000..c225791e --- /dev/null +++ b/notebooks/RPGRIP1/RPGRIP1_Beryoskin_PMID_34722527_RunGenoPhenoCorr.ipynb @@ -0,0 +1,801 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bed7b521", + "metadata": {}, + "source": [ + "

Retinal Degeneration Associated With RPGRIP1

\n", + "

Data from Beryozkin A, et al. Retinal Degeneration Associated With RPGRIP1: A Review of Natural History, Mutation Spectrum, and Genotype-Phenotype Correlation in 228 Patients. Front Cell Dev Biol. 2021 Oct 14;9:746781. doi: 10.3389/fcell.2021.746781. PMID: 34722527; PMCID: PMC8551679.

" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d58a69d7", + "metadata": {}, + "outputs": [], + "source": [ + "import typing\n", + "import os\n", + "\n", + "import hpotk\n", + "from phenopackets import Phenopacket" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d4c247db", + "metadata": {}, + "outputs": [], + "source": [ + "fpath_hpo = 'hpo_data/hp.json'\n", + "cache_dir = 'annotations'\n", + "fpath_phenopackets = 'phenopackets'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4089b707", + "metadata": {}, + "outputs": [], + "source": [ + "from genophenocorr.preprocessing import configure_caching_patient_creator" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d692db53", + "metadata": {}, + "outputs": [], + "source": [ + "hpo: hpotk.ontology.Ontology = hpotk.ontology.load.obographs.load_ontology(fpath_hpo)\n", + " \n", + "phenotype_creator = configure_caching_patient_creator(hpo, cache_dir=cache_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3416a4b3", + "metadata": {}, + "outputs": [], + "source": [ + "from genophenocorr.preprocessing import load_phenopacket_folder" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b0533219", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Patient 237–523 has unknown alternative variant GNNNN and will not be included.\n", + "Patient 79–194 has unknown alternative variant GNNNN and will not be included.\n", + "Expected at least one variant per patient, but received none for patient 79–194\n" + ] + } + ], + "source": [ + "patientCohort = load_phenopacket_folder(fpath_phenopackets, phenotype_creator)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0d4132a4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('14_21312458_A/-', 25),\n", + " ('14_21312434_deletion', 9),\n", + " ('14_21302531_G/-', 8),\n", + " 
('14_21345145_C/T', 8),\n", + " ('14_21325252_G/A', 7),\n", + " ('14_21345140_AAGGCCG/-', 7),\n", + " ('14_21327672_-/T', 6),\n", + " ('14_21303542_C/T', 5),\n", + " ('14_21325265_A/G', 5),\n", + " ('14_21348211_AAAG/-', 4),\n", + " ('14_21326131_C/T', 4),\n", + " ('14_21326545_TTTTTAGTAGAGATGGGATTTCTCCATGTTGGTCAGGCTGGTCTTCAACTCCCGACCTCAGGTGAACCTCCCACCTGAGCCTCCCAAAGTGCTGGGATTACAGACGTGAGCCACCGCGCCTGGCTGAACAAACTTTTTCAAGCTCTGTAATGCTGTCTAGTATCTGTCTTTACTAAAGGCCTGTTGTTTCTTAGTGCATGACTACATAGATATCTGATTATAAACTGAGACCTTAACACTCCCCCATCATTCTCTCACTTCTTTTAAACACTGGACACAAGTTAGAGAGATTTCCACACCAGATCATGACAAACACAAATTTCTTGGATTTTTTTTTTCCTCCCAATGTGGAGCTGAGCTCCATACTGTCTTTCCTAACTTTTATACCTAGGATTGTGGGGGTGTACCAAGAGGGGTCAACTCTTTGACTACAGTCCTGGGAGGGTGAGGTGGGGGTATCCATGTTTTCCTTAGGAAGTGGGGATAGCTGCAGTCAGAAACAACCATATTTAACAAGATTCTGGATGCTCCAGGACATGTATGCAGCTCTCTCCTCAATACAACTGCTTAAAAAAAGGCTGACACTTCTGGACACAACTCCTTTGCCAAACAGGGGAGGCAGTATAAGCCACCTGTTAATCAGTGTTACAAATCAGACATCTGGCATTTCGAAAGAGCCATTTTGCTTAAGTTTTCTTGGGACCACTTGAGGGTAGAGGTAACAGTTTTCTTGGTACAACTAAGGCACAGTAAGCATTTGATAATAGTAATAATAATGCAGCCTTGTTCATTGGCTGGATGACTGATGACAAGCAAGCTGTACTCCTTTTCATACACCCTTCACTATCTCTTCCTGAATCCTAGAGATAACCCATCTTCCCTGATTAATATTCTAACTGCACTGCTGTTTGATTTTACTTCTGAGTGTATCATCATCGTAATTATTTAATGGATGTTAATTAATTGCTGATAAAATATGTTGAAATTAAAAATGGGAAGGAAGTAGATAAGGTGCTGACAAATGCTCACTTGCTTATTTCATGTGATCAGGTCTTATTAATATCTGTTTGTTTCTCAGGTGATTTTAACCTCACTGACCCTGCAGAGAAACCCAACGGATCTATTCAAGTGCAACTGGATTGGAAGTTTCCCTACATACCCCCTGAGAGCTTCCTGAAACCAGAAGCTCAGACTAAGGGGAAGGATACCAAGGACAGTTCAAAGATCTCATCTGAAGAGGAAAAGGCTTCATTTCCTTCCCAGGTAACTCTCCAGGACTCCACAGGTAGCAGATCTCTGCCAATCCTATGGAGCAGATTTGAAGGAGACAGTATTATAG/-',\n", + " 4),\n", + " ('14_21327801_T/-', 4),\n", + " ('14_21326055_T/G', 3),\n", + " ('14_21328463_C/T', 3),\n", + " ('14_21326072_-/A', 3),\n", + " ('14_21317850_G/T', 3),\n", + " ('14_21294785_G/A', 3),\n", + " ('14_21326125_C/T', 3),\n", + " ('14_21321856_GGAACTGGAG/-', 3),\n", + " ('14_21328469_C/T', 3),\n", + " ('14_21301105_C/T', 3),\n", + " ('14_21325318_C/T', 3),\n", 
+ " ('14_21294745_C/T', 3),\n", + " ('14_21325311_C/A', 2),\n", + " ('14_21317760_C/-', 2),\n", + " ('14_21301114_C/T', 2),\n", + " ('14_21324867_G/A', 2),\n", + " ('14_21325880_C/T', 2),\n", + " ('14_21307824_AG/-', 2),\n", + " ('14_21325405_G/-', 2),\n", + " ('14_21312439_GAGA/-', 2),\n", + " ('14_21303510_C/G', 2),\n", + " ('14_21327708_-/T', 2),\n", + " ('14_21326119_C/T', 2),\n", + " ('14_21325241_GA/-', 2),\n", + " ('14_21301238_G/T', 2),\n", + " ('14_21317769_C/T', 2),\n", + " ('14_21317724_C/T', 2),\n", + " ('14_21324876_C/A', 2),\n", + " ('14_21312466_C/T', 2),\n", + " ('14_21302508_T/-', 2),\n", + " ('14_21320155_T/A', 2),\n", + " ('14_21325330_C/T', 2),\n", + " ('14_21320178_G/T', 2),\n", + " ('14_21324747_A/T', 2),\n", + " ('14_21321257_A/G', 2),\n", + " ('14_21325943_G/T', 2),\n", + " ('14_21326017_C/T', 2),\n", + " ('14_21334696_T/A', 2),\n", + " ('14_21312440_-/A', 2),\n", + " ('14_21351103_G/T', 2),\n", + " ('14_21303544_G/A', 2),\n", + " ('14_21348186_T/A', 2),\n", + " ('14_21317778_C/T', 2),\n", + " ('14_21321929_C/T', 1),\n", + " ('14_21326031_-/T', 1),\n", + " ('14_21348184_-/G', 1),\n", + " ('14_21303543_G/A', 1),\n", + " ('14_21324747_A/C', 1),\n", + " ('14_21325931_A/G', 1),\n", + " ('14_21343084_G/C', 1),\n", + " ('14_21325832_deletion', 1),\n", + " ('14_21321316_C/T', 1),\n", + " ('14_21303373_A/-', 1),\n", + " ('14_21321865_G/-', 1),\n", + " ('14_21312468_A/-', 1),\n", + " ('14_21348167_TTTAG/-', 1),\n", + " ('14_21324616_A/G', 1),\n", + " ('14_21325903_C/T', 1),\n", + " ('14_21280084_deletion', 1),\n", + " ('14_21317847_A/T', 1),\n", + " ('14_21307704_AGAATAATTTAGCGCCTTTCTCTGCAGAGCTTCCATTAAAGAGAAGGTAGAGCTGATTCGACTTAAGA/-',\n", + " 1),\n", + " ('14_21343054_A/G', 1),\n", + " ('14_21326107_C/T', 1),\n", + " ('14_21310608_G/A', 1),\n", + " ('14_129365_deletion', 1),\n", + " ('14_21348174_T/G', 1),\n", + " ('14_21326018_G/A', 1),\n", + " ('14_21280084_duplication', 1),\n", + " ('14_21325277_T/-', 1),\n", + " ('14_21325861_G/A', 1),\n", + " 
('14_21324913_-/A', 1),\n", + " ('14_21330248_GCAGGTGAATTACACTGAGTGGAAGTTCTCAGAGACTAACAGCTTCATAGGTGATGGCTTTAAAAATCAGCACGAGGAAGAGGAAATGACATTATCCCATTCAGCACTGAAACAGAAGGAACCTCTACATCCTGTAAAT/-',\n", + " 1),\n", + " ('14_21327670_C/T', 1),\n", + " ('14_21294674_T/G', 1),\n", + " ('14_21345189_T/-', 1),\n", + " ('14_21320157_C/T', 1),\n", + " ('14_21325372_C/T', 1),\n", + " ('14_21326174_G/A', 1),\n", + " ('14_21300989_T/A', 1),\n", + " ('14_21301167_G/-', 1),\n", + " ('14_21324875_C/T', 1),\n", + " ('14_21312488_ATG/-', 1),\n", + " ('14_21312501_AGAA/-', 1),\n", + " ('14_21325931_-/A', 1),\n", + " ('14_21326041_-/T', 1),\n", + " ('14_21345198_G/A', 1),\n", + " ('14_21311824_A/-', 1),\n", + " ('14_21327627_-/T', 1),\n", + " ('14_21287940_duplication', 1),\n", + " ('14_21303454_A/-', 1),\n", + " ('14_21324865_-/G', 1),\n", + " ('14_21325253_G/A', 1),\n", + " ('14_21324934_C/G', 1),\n", + " ('14_21325064_CTGATTGGTAAGTGCCGTTGGCTTC/-', 1),\n", + " ('14_21312500_T/A', 1),\n", + " ('14_21317754_G/T', 1),\n", + " ('14_21311858_-/T', 1),\n", + " ('14_21348303_G/A', 1),\n", + " ('14_21351188_AGG/-', 1),\n", + " ('14_21321295_-/T', 1),\n", + " ('14_21321402_G/A', 1),\n", + " ('14_21302585_G/C', 1),\n", + " ('14_21325898_G/A', 1),\n", + " ('14_21328500_A/-', 1)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "patientCohort.list_all_variants()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "892a2c4b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'NM_001377949.1': Counter({'intron_variant': 47,\n", + " 'frameshift_variant': 16,\n", + " 'splice_acceptor_variant': 7,\n", + " 'splice_region_variant': 5,\n", + " 'stop_gained': 18,\n", + " 'missense_variant': 4,\n", + " 'splice_donor_variant': 5,\n", + " 'feature_truncation': 4,\n", + " 'coding_sequence_variant': 7,\n", + " '5_prime_UTR_variant': 3,\n", + " 'inframe_deletion': 1,\n", + " 'splice_donor_5th_base_variant': 2,\n", + 
" 'synonymous_variant': 1,\n", + " 'splice_polypyrimidine_tract_variant': 1}),\n", + " 'NM_001377948.1': Counter({'frameshift_variant': 24,\n", + " 'splice_acceptor_variant': 7,\n", + " 'missense_variant': 22,\n", + " 'splice_region_variant': 5,\n", + " 'stop_gained': 30,\n", + " 'intron_variant': 9,\n", + " 'splice_donor_variant': 7,\n", + " 'feature_truncation': 4,\n", + " 'coding_sequence_variant': 8,\n", + " '5_prime_UTR_variant': 3,\n", + " 'inframe_deletion': 1,\n", + " 'splice_donor_5th_base_variant': 3,\n", + " 'synonymous_variant': 1,\n", + " 'splice_polypyrimidine_tract_variant': 1}),\n", + " 'NM_001377950.1': Counter({'intron_variant': 49,\n", + " 'frameshift_variant': 16,\n", + " 'splice_acceptor_variant': 6,\n", + " 'splice_region_variant': 5,\n", + " 'stop_gained': 18,\n", + " 'missense_variant': 4,\n", + " 'splice_donor_variant': 5,\n", + " 'feature_truncation': 4,\n", + " 'coding_sequence_variant': 7,\n", + " '5_prime_UTR_variant': 3,\n", + " 'inframe_deletion': 1,\n", + " 'splice_donor_5th_base_variant': 2,\n", + " 'synonymous_variant': 1,\n", + " 'splice_polypyrimidine_tract_variant': 1}),\n", + " 'NM_001377523.1': Counter({'intron_variant': 48,\n", + " 'frameshift_variant': 16,\n", + " 'splice_acceptor_variant': 6,\n", + " 'splice_region_variant': 5,\n", + " 'stop_gained': 18,\n", + " 'missense_variant': 4,\n", + " 'splice_donor_variant': 5,\n", + " 'feature_truncation': 4,\n", + " 'coding_sequence_variant': 7,\n", + " '5_prime_UTR_variant': 3,\n", + " 'inframe_deletion': 1,\n", + " 'splice_donor_5th_base_variant': 2,\n", + " 'synonymous_variant': 1,\n", + " 'splice_polypyrimidine_tract_variant': 1}),\n", + " 'NM_001377951.1': Counter({'intron_variant': 48,\n", + " 'frameshift_variant': 14,\n", + " 'splice_acceptor_variant': 6,\n", + " 'splice_region_variant': 4,\n", + " '5_prime_UTR_variant': 7,\n", + " 'upstream_gene_variant': 9,\n", + " 'stop_gained': 8,\n", + " 'missense_variant': 3,\n", + " 'splice_donor_variant': 5,\n", + " 
'feature_truncation': 4,\n", + " 'coding_sequence_variant': 7,\n", + " 'inframe_deletion': 1,\n", + " 'splice_donor_5th_base_variant': 2,\n", + " 'synonymous_variant': 1,\n", + " 'splice_polypyrimidine_tract_variant': 1}),\n", + " 'NM_020366.4': Counter({'frameshift_variant': 38,\n", + " 'splice_acceptor_variant': 8,\n", + " 'missense_variant': 23,\n", + " 'splice_region_variant': 11,\n", + " 'stop_gained': 43,\n", + " 'intron_variant': 12,\n", + " 'inframe_deletion': 2,\n", + " 'splice_donor_variant': 12,\n", + " 'splice_polypyrimidine_tract_variant': 2,\n", + " 'coding_sequence_variant': 11,\n", + " 'start_lost': 3,\n", + " 'feature_truncation': 4,\n", + " 'start_retained_variant': 3,\n", + " '5_prime_UTR_variant': 3,\n", + " 'splice_donor_5th_base_variant': 3,\n", + " 'feature_elongation': 2,\n", + " 'synonymous_variant': 1})}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "patientCohort.list_data_by_tx()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "82b28521", + "metadata": {}, + "outputs": [], + "source": [ + "from genophenocorr.analysis import CohortAnalysis" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ef5b47de", + "metadata": {}, + "outputs": [], + "source": [ + "rec_analysis = CohortAnalysis(patientCohort, 'NM_020366.4', hpo, recessive=True, include_unmeasured=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3a3ddcdc", + "metadata": {}, + "outputs": [], + "source": [ + "from genophenocorr.constants import VariantEffect\n", + "from genophenocorr.model import FeatureType" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "19112877", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Divide by 0 error with HPO HP:0000662, not included in this analysis.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Homozygous frameshift_variantHeterozygous frameshift_variantNo frameshift_variant
CountPercentCountPercentCountPercentp-valueCorrected p-values
HP:0001483 (Eye poking)2184.00%2100.00%541.67%0.0242290.096914
HP:0012758 (Neurodevelopmental delay)811.94%00.00%43.31%0.0383270.153306
HP:0000613 (Photophobia)12100.00%2100.00%16100.00%1.0000001.000000
HP:0000496 (Abnormality of eye movement)32100.00%9100.00%50100.00%1.0000001.000000
\n", + "
" + ], + "text/plain": [ + " Homozygous frameshift_variant \\\n", + " Count \n", + "HP:0001483 (Eye poking) 21 \n", + "HP:0012758 (Neurodevelopmental delay) 8 \n", + "HP:0000613 (Photophobia) 12 \n", + "HP:0000496 (Abnormality of eye movement) 32 \n", + "\n", + " \\\n", + " Percent \n", + "HP:0001483 (Eye poking) 84.00% \n", + "HP:0012758 (Neurodevelopmental delay) 11.94% \n", + "HP:0000613 (Photophobia) 100.00% \n", + "HP:0000496 (Abnormality of eye movement) 100.00% \n", + "\n", + " Heterozygous frameshift_variant \\\n", + " Count \n", + "HP:0001483 (Eye poking) 2 \n", + "HP:0012758 (Neurodevelopmental delay) 0 \n", + "HP:0000613 (Photophobia) 2 \n", + "HP:0000496 (Abnormality of eye movement) 9 \n", + "\n", + " No frameshift_variant \\\n", + " Percent Count \n", + "HP:0001483 (Eye poking) 100.00% 5 \n", + "HP:0012758 (Neurodevelopmental delay) 0.00% 4 \n", + "HP:0000613 (Photophobia) 100.00% 16 \n", + "HP:0000496 (Abnormality of eye movement) 100.00% 50 \n", + "\n", + " \n", + " Percent p-value Corrected p-values \n", + "HP:0001483 (Eye poking) 41.67% 0.024229 0.096914 \n", + "HP:0012758 (Neurodevelopmental delay) 3.31% 0.038327 0.153306 \n", + "HP:0000613 (Photophobia) 100.00% 1.000000 1.000000 \n", + "HP:0000496 (Abnormality of eye movement) 100.00% 1.000000 1.000000 " + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rec_analysis.compare_by_variant_type(VariantEffect.FRAMESHIFT_VARIANT)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "b9bf9ccf", + "metadata": {}, + "outputs": [], + "source": [ + "dom_analysis = CohortAnalysis(patientCohort,'NM_020366.4', hpo, include_unmeasured=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d835f075", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
With frameshift_variantWithout frameshift_variant
CountPercentCountPercentp-valueCorrected p-values
HP:0001483 (Eye poking)2385.19%541.67%0.0169830.084915
HP:0012758 (Neurodevelopmental delay)88.99%43.31%0.1296440.648221
HP:0000613 (Photophobia)14100.00%16100.00%1.0000001.000000
HP:0000662 (Nyctalopia)3100.00%3100.00%1.0000001.000000
HP:0000496 (Abnormality of eye movement)41100.00%50100.00%1.0000001.000000
\n", + "
" + ], + "text/plain": [ + " With frameshift_variant \\\n", + " Count Percent \n", + "HP:0001483 (Eye poking) 23 85.19% \n", + "HP:0012758 (Neurodevelopmental delay) 8 8.99% \n", + "HP:0000613 (Photophobia) 14 100.00% \n", + "HP:0000662 (Nyctalopia) 3 100.00% \n", + "HP:0000496 (Abnormality of eye movement) 41 100.00% \n", + "\n", + " Without frameshift_variant \\\n", + " Count Percent \n", + "HP:0001483 (Eye poking) 5 41.67% \n", + "HP:0012758 (Neurodevelopmental delay) 4 3.31% \n", + "HP:0000613 (Photophobia) 16 100.00% \n", + "HP:0000662 (Nyctalopia) 3 100.00% \n", + "HP:0000496 (Abnormality of eye movement) 50 100.00% \n", + "\n", + " \n", + " p-value Corrected p-values \n", + "HP:0001483 (Eye poking) 0.016983 0.084915 \n", + "HP:0012758 (Neurodevelopmental delay) 0.129644 0.648221 \n", + "HP:0000613 (Photophobia) 1.000000 1.000000 \n", + "HP:0000662 (Nyctalopia) 1.000000 1.000000 \n", + "HP:0000496 (Abnormality of eye movement) 1.000000 1.000000 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dom_analysis.compare_by_variant_type(VariantEffect.FRAMESHIFT_VARIANT)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "44006715", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Divide by 0 error with HPO HP:0000662, not included in this analysis.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
With 14_21312458_A/-Without 14_21312458_A/-
CountPercentCountPercentp-valueCorrected p-values
HP:0001483 (Eye poking)16100.00%1252.17%0.0009190.003677
HP:0012758 (Neurodevelopmental delay)416.00%84.32%0.0402630.161052
HP:0000613 (Photophobia)5100.00%25100.00%1.0000001.000000
HP:0000496 (Abnormality of eye movement)16100.00%75100.00%1.0000001.000000
\n", + "
" + ], + "text/plain": [ + " With 14_21312458_A/- \\\n", + " Count Percent \n", + "HP:0001483 (Eye poking) 16 100.00% \n", + "HP:0012758 (Neurodevelopmental delay) 4 16.00% \n", + "HP:0000613 (Photophobia) 5 100.00% \n", + "HP:0000496 (Abnormality of eye movement) 16 100.00% \n", + "\n", + " Without 14_21312458_A/- \\\n", + " Count Percent \n", + "HP:0001483 (Eye poking) 12 52.17% \n", + "HP:0012758 (Neurodevelopmental delay) 8 4.32% \n", + "HP:0000613 (Photophobia) 25 100.00% \n", + "HP:0000496 (Abnormality of eye movement) 75 100.00% \n", + "\n", + " \n", + " p-value Corrected p-values \n", + "HP:0001483 (Eye poking) 0.000919 0.003677 \n", + "HP:0012758 (Neurodevelopmental delay) 0.040263 0.161052 \n", + "HP:0000613 (Photophobia) 1.000000 1.000000 \n", + "HP:0000496 (Abnormality of eye movement) 1.000000 1.000000 " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dom_analysis.compare_by_variant('14_21312458_A/-')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57cbbe67", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "enviro", + "language": "python", + "name": "enviro" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/genophenocorr/analysis/_analyzers.py b/src/genophenocorr/analysis/_analyzers.py index fc5c8bb2..f484cf4e 100644 --- a/src/genophenocorr/analysis/_analyzers.py +++ b/src/genophenocorr/analysis/_analyzers.py @@ -2,6 +2,7 @@ import logging import math import typing +from decimal import Decimal import numpy as np @@ -446,16 +447,16 @@ def _fisher_exact(self, table): n = 0 for x in row_sum: n += x - p_0 /= math.factorial(n) + p_0 /= 
Decimal(math.factorial(n)) for i in range(len(table)): for j in range(len(table[0])): - p_0 /= math.factorial(table[i][j]) + p_0 /= Decimal(math.factorial(table[i][j])) p = [0] self._dfs(mat, pos, row_sum, col_sum, p_0, p) - return p[0] + return float(p[0]) def _dfs(self, mat, pos, r_sum, c_sum, p_0, p): @@ -497,12 +498,12 @@ def _dfs(self, mat, pos, r_sum, c_sum, p_0, p): n = 0 for x in r_sum: n += x - p_1 /= math.factorial(n) + p_1 /= Decimal(math.factorial(n)) for i in range(len(mat_new)): for j in range(len(mat_new[0])): - p_1 /= math.factorial(mat_new[i][j]) - if p_1 <= p_0 + 0.00000001: + p_1 /= Decimal(math.factorial(mat_new[i][j])) + if p_1 <= p_0 + Decimal(0.00000001): # print(mat_new) # print(p_1) p[0] += p_1 diff --git a/src/genophenocorr/preprocessing/_phenopacket.py b/src/genophenocorr/preprocessing/_phenopacket.py index df8917c5..43ce455c 100644 --- a/src/genophenocorr/preprocessing/_phenopacket.py +++ b/src/genophenocorr/preprocessing/_phenopacket.py @@ -52,7 +52,7 @@ def find_coordinates(self, item: GenomicInterpretation) -> VariantCoordinates: alt = '' else: alt = '' - chrom = re.findall(r'NC_0000(\d{2}).\d\d', + chrom = re.findall(r'NC_0000(\d{2})\.\d*', variant_descriptor.variation.copy_number.allele.sequence_location.sequence_id)[0] if chrom.startswith('0'): chrom = str(int(chrom)) @@ -122,6 +122,9 @@ def _add_variants(self, pp: Phenopacket) -> typing.Sequence[Variant]: if hasattr(interp, 'diagnosis') and interp.diagnosis is not None: for genomic_interp in interp.diagnosis.genomic_interpretations: vc = self._coord_finder.find_coordinates(genomic_interp) + if "N" in vc.alt: + self._logger.warning(f'Patient {pp.id} has unknown alternative variant {vc.alt} and will not be included.') + continue variant = self._func_ann.annotate(vc) variants_list.append(variant) else: diff --git a/src/genophenocorr/preprocessing/_variant.py b/src/genophenocorr/preprocessing/_variant.py index 6cc1988e..cc7dab97 100644 --- a/src/genophenocorr/preprocessing/_variant.py 
+++ b/src/genophenocorr/preprocessing/_variant.py @@ -182,7 +182,10 @@ def _create_file_name(self, variant_coordinates: VariantCoordinates) -> str: Args: variant_coordinates (VariantCoordinates): The variant coordinates associated with the Variant """ - fname = f'{variant_coordinates.as_string()}.pickle' + if len(variant_coordinates.as_string()) <= 50: + fname = f'{variant_coordinates.as_string()}.pickle' + else: + fname = f'{variant_coordinates.chrom}_{variant_coordinates.start}_{variant_coordinates.end}_{variant_coordinates.genotype}.pickle' return os.path.join(self._datadir, fname) From e966492448fcf704cc4d42a84ee8be65be1ade0f Mon Sep 17 00:00:00 2001 From: Lauren Rekerle Date: Fri, 29 Sep 2023 15:10:35 -0500 Subject: [PATCH 2/2] Removed overflow error test --- src/genophenocorr/analysis/_test_fisherExact.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/genophenocorr/analysis/_test_fisherExact.py b/src/genophenocorr/analysis/_test_fisherExact.py index f5d4c97a..6a059608 100644 --- a/src/genophenocorr/analysis/_test_fisherExact.py +++ b/src/genophenocorr/analysis/_test_fisherExact.py @@ -11,7 +11,7 @@ def MultiExact() -> PythonMultiFisherExact: @pytest.mark.parametrize('table, raise_error, pVal', ([[[0,0], [0,0], [0,0]], pytest.raises(ValueError), None], [[[2,3], [1,0], [0,2]], does_not_raise(), 0.6429], - [[[100, 150], [500, 460], [420, 400]], pytest.raises(OverflowError), None], + #[[[100, 150], [500, 460], [420, 400]], pytest.raises(OverflowError), None], [[[5,5],[5,5],[5,5]], does_not_raise(), 1], [[[10,15], [5], [20,5]], pytest.raises(ValueError), None], [[[10, 1], [2,3], [3,4]], does_not_raise(), 0.0395],