Merge pull request #599 from monarch-initiative/issue-649-SO-terms-for-hgnc

Populate type for HGNC & Alliance gene nodes
monarch-initiative · Sep 10, 2024 · 24f9de3 · 24f9de3
2 parents a668fa6 + ee40eef
commit 24f9de3
Show file tree

Hide file tree

Showing 17 changed files with 171 additions and 79 deletions.
diff --git a/docs/Sources/alliance.md b/docs/Sources/alliance.md
@@ -19,6 +19,8 @@ __**Biolink captured**__
     * source
     * synonyms
     * xref
+    * type (["SO:0001217"])
+
 
 ## [Gene to Phenotype](#gene_to_phenotype)
 

diff --git a/docs/Sources/hgnc.md b/docs/Sources/hgnc.md
@@ -8,6 +8,8 @@ The HGNC is responsible for approving unique symbols and names for human loci, i
 
 This ingest uses HGNC's "complete set" download file, which only contains associations between publications and genes that are denoted in some way in the publication. We have chosen to use a consistent high-level term for 'publication' (IAO:0000311) because a heterogeneous mix of publication types is referenced. 
 
+The SO terms used to populate the type are taken from the Alliance of Genome Resources human BGI file (BGI_HUMAN), which is provided by RGD.
+
 __**Biolink Captured**__
 
 * biolink:Gene
@@ -24,6 +26,7 @@ __**Biolink Captured**__
       * omim id
     * in_taxon (["NCBITaxon:9606"])
     * provided_by  (["infores:hgnc"])
+    * type (["SO:0001217"])
 
 ## Citation
 

diff --git a/scripts/after_download.sh b/scripts/after_download.sh
@@ -1,7 +1,18 @@
 #!/bin/sh
 
+# Set ZCAT to gzcat if gzcat is available, otherwise fall back to zcat.
+# This script runs under /bin/sh, so the availability check must use the
+# POSIX redirection form: `&>` is a bash extension that a strict POSIX sh
+# parses as "run in background" followed by an empty redirection, which
+# makes the test succeed even when gzcat is not installed.
+if command -v gzcat > /dev/null 2>&1
+then
+    ZCAT=gzcat
+else
+    ZCAT=zcat
+fi
+
 # Make a simple text file of all the gene IDs in Alliance
-zcat data/alliance/BGI_*.gz | jq '.data[].basicGeneticEntity.primaryId' | pigz > data/alliance/alliance_gene_ids.txt.gz
+${ZCAT} data/alliance/BGI_*.gz | jq '.data[].basicGeneticEntity.primaryId' | pigz > data/alliance/alliance_gene_ids.txt.gz
+
+# Make a two column tsv of human gene IDs and SO terms
+${ZCAT} data/alliance/BGI_HUMAN.json.gz |  jq -r '.data[] | "\(.basicGeneticEntity.primaryId)\t\(.soTermId)"' > data/hgnc/hgnc_so_terms.tsv
 
 # Make an id, name map of DDPHENO terms
 sqlite3 -cmd ".mode tabs" -cmd ".headers on" data/dictybase/ddpheno.db "select subject as id, value as name from rdfs_label_statement where predicate = 'rdfs:label' and subject like 'DDPHENO:%'" > data/dictybase/ddpheno.tsv

diff --git a/src/monarch_ingest/cli_utils.py b/src/monarch_ingest/cli_utils.py
@@ -1,5 +1,4 @@
 import csv
-import gc
 import os
 import sys
 import tarfile
@@ -407,12 +406,13 @@ def apply_closure(
     )
     sh.mv(database, f"{output_dir}/")
 
+
 def load_sqlite():
     sh.bash("scripts/load_sqlite.sh")
 
 
 def load_solr():
-    sh.bash("scripts/load_solr.sh",  _out=sys.stdout, _err=sys.stderr)
+    sh.bash("scripts/load_solr.sh", _out=sys.stdout, _err=sys.stderr)
 
 
 def load_jsonl():
@@ -446,26 +446,31 @@ def slot_is_multi_valued(slot_name: str) -> bool:
     mv_node_replacement = ", ".join([f"string_split({col}, '|') as {col}" for col in mv_node_columns])
     mv_edge_replacement = ", ".join([f"string_split({col}, '|') as {col}" for col in mv_edge_columns])
 
-    db.sql(f"""
+    db.sql(
+        f"""
     copy (
       select nodes.* replace (ancestors as category, {mv_node_replacement}) 
       from nodes
         join class_ancestor_df on category = classname  
     ) to 'output/monarch-kg_nodes.jsonl' (FORMAT JSON);
-    """)
+    """
+    )
 
-    db.sql(f"""
+    db.sql(
+        f"""
     copy (
       select edges.* replace (ancestors as category, {mv_edge_replacement}),  
       from edges
         join class_ancestor_df on category = classname  
     ) to 'output/monarch-kg_edges.jsonl' (FORMAT JSON);
-    """)
+    """
+    )
 
 
 def export_tsv():
     export()
 
+
 def do_prepare_release(dir: str = OUTPUT_DIR):
 
     compressed_artifacts = [

diff --git a/src/monarch_ingest/download.yaml b/src/monarch_ingest/download.yaml
@@ -48,6 +48,10 @@
   url: https://fms.alliancegenome.org/download/GENECROSSREFERENCE_COMBINED.tsv.gz
   local_name: data/alliance/GENECROSSREFERENCE_COMBINED.tsv.gz
   tag: alliance_gene
+-
+  url: https://fms.alliancegenome.org/download/BGI_HUMAN.json.gz
+  local_name: data/alliance/BGI_HUMAN.json.gz
+  tag: hgnc_gene
 -
   url: https://fms.alliancegenome.org/download/BGI_MGI.json.gz
   local_name: data/alliance/BGI_MGI.json.gz

diff --git a/src/monarch_ingest/ingests/alliance/gene.py b/src/monarch_ingest/ingests/alliance/gene.py
@@ -49,8 +49,7 @@
         symbol=row["symbol"],
         name=row["symbol"],
         full_name=row["name"].replace("\r", ""),  # Replacement to remove stray carriage returns in XenBase files
-        # No place in the schema for gene type (SO term) right now
-        # type=row["soTermId"],
+        type=[row["soTermId"]],
         in_taxon=[in_taxon],
         in_taxon_label=in_taxon_label,
         provided_by=[source],

diff --git a/src/monarch_ingest/ingests/alliance/gene.yaml b/src/monarch_ingest/ingests/alliance/gene.yaml
@@ -34,6 +34,7 @@ node_properties:
   - 'provided_by'
   - 'name'
   - 'symbol'
+  - 'type'
   - 'full_name'
   - 'description'
   - 'in_taxon'

diff --git a/src/monarch_ingest/ingests/alliance/gene_to_phenotype.py b/src/monarch_ingest/ingests/alliance/gene_to_phenotype.py
@@ -16,9 +16,9 @@
 
 koza_app = get_koza_app("alliance_gene_to_phenotype")
 
-while (row := koza_app.get_row()) is not None:
+gene_ids = koza_app.get_map("alliance-gene")
 
-    gene_ids = koza_app.get_map("alliance-gene")
+while (row := koza_app.get_row()) is not None:
 
     if len(row["phenotypeTermIdentifiers"]) == 0:
         logger.debug("Phenotype ingest record has 0 phenotype terms: " + str(row))

diff --git a/src/monarch_ingest/ingests/hgnc/gene.py b/src/monarch_ingest/ingests/hgnc/gene.py
@@ -4,6 +4,8 @@
 
 koza_app = get_koza_app("hgnc_gene")
 
+so_term_map = koza_app.get_map("hgnc-so-terms")
+
 while (row := koza_app.get_row()) is not None:
 
     xref_list = []
@@ -31,6 +33,7 @@
         symbol=row["symbol"],
         name=row["symbol"],
         full_name=row["name"],
+        type=[so_term_map[row['hgnc_id']]['so_term_id']] if row['hgnc_id'] in so_term_map else None,
         xref=xref_list,
         synonym=synonyms_list,
         in_taxon=[in_taxon],

diff --git a/src/monarch_ingest/ingests/hgnc/gene.yaml b/src/monarch_ingest/ingests/hgnc/gene.yaml
@@ -9,6 +9,9 @@ delimiter: '\t'
 
 global_table: './src/monarch_ingest/translation_table.yaml'
 
+depends_on:
+  - './src/monarch_ingest/maps/hgnc-so-terms.yaml'
+
 columns:
   - hgnc_id
   - symbol
@@ -73,6 +76,7 @@ node_properties:
   - 'full_name'
   - 'in_taxon'
   - 'in_taxon_label'
+  - 'type'
   - 'xref'
   - 'synonym'
   - 'provided_by'

diff --git a/src/monarch_ingest/ingests/hpoa/disease_to_phenotype.py b/src/monarch_ingest/ingests/hpoa/disease_to_phenotype.py
@@ -37,6 +37,7 @@
 )
 from monarch_ingest.ingests.hpoa.hpoa_utils import phenotype_frequency_to_hpo_term, Frequency
 
+
 def get_primary_knowledge_source(disease_id: str) -> str:
     if disease_id.startswith("OMIM"):
         return "infores:omim"
@@ -90,7 +91,7 @@ def get_primary_knowledge_source(disease_id: str) -> str:
     # don't populate the reference with the database_id / disease id
     publications = [p for p in publications if not p == row["database_id"]]
 
-    primary_knowledge_source = get_primary_knowledge_source(disease_id )
+    primary_knowledge_source = get_primary_knowledge_source(disease_id)
 
     # Association/Edge
     association = DiseaseToPhenotypicFeatureAssociation(
@@ -108,7 +109,7 @@ def get_primary_knowledge_source(disease_id: str) -> str:
         frequency_qualifier=frequency.frequency_qualifier if frequency.frequency_qualifier else None,
         has_count=frequency.has_count,
         has_total=frequency.has_total,
-        aggregator_knowledge_source=["infores:monarchinitiative","infores:hpo-annotations"],
+        aggregator_knowledge_source=["infores:monarchinitiative", "infores:hpo-annotations"],
         primary_knowledge_source=primary_knowledge_source,
         knowledge_level=KnowledgeLevelEnum.knowledge_assertion,
         agent_type=AgentTypeEnum.manual_agent,

diff --git a/src/monarch_ingest/main.py b/src/monarch_ingest/main.py
@@ -164,9 +164,11 @@ def solr():
 def export():
     export_tsv()
 
+
 @typer_app.command()
 def prepare_release():
-    do_prepare_release();
+    do_prepare_release()
+
 
 @typer_app.command()
 def release(

diff --git a/src/monarch_ingest/maps/hgnc-so-terms.yaml b/src/monarch_ingest/maps/hgnc-so-terms.yaml
@@ -0,0 +1,22 @@
+name: 'hgnc-so-terms'
+
+metadata:
+  description: 'Mapping file to look up SO terms (aka type) for HGNC genes, generated from Alliance BGI files and provided by RGD'
+
+delimiter: '\t'
+
+files:
+  - './data/hgnc/hgnc_so_terms.tsv'
+
+header: 'none'
+
+columns:
+  - 'gene_id'
+  - 'so_term_id'
+
+key: 'gene_id'
+
+values:
+  - 'gene_id'
+  - 'so_term_id'
+