Skip to content

Commit

Permalink
Uniprot JSON prototype
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Sep 21, 2024
1 parent 4e9f7ab commit 6b375ca
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 0 deletions.
55 changes: 55 additions & 0 deletions src/gpsea/model/_protein.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import abc
import enum
import json
import typing

import hpotk
Expand Down Expand Up @@ -305,6 +306,60 @@ def from_feature_frame(
protein_features=region_list,
protein_length=protein_length,
)




@staticmethod
def from_uniprot_json(
    protein_id: str,
    label: str,
    uniprot_json: str,
    protein_length: int,
) -> "ProteinMetadata":
    """
    Create `ProteinMetadata` from a JSON file that has been downloaded from UniProt.

    Go to the UniProt page for the protein of interest, then go to the section "Family & Domains", and the
    subsection "Features". Click on the Download symbol. You will be presented with a JSON file for download.

    Only the protein features are read from the file (the entries of its `"features"` array); the protein
    identifier, label and length are taken from the arguments.
    This method is intended to be a backup if the API call to UniProt fails; the same information should be
    retrieved.

    A feature that cannot be parsed is logged as a warning and skipped, so the result may contain
    fewer features than the file.

    See the test file "test_uniprot_json.py" for details about the JSON parsing etc.

    :param protein_id: the accession id of the protein, e.g. `'NP_000129.3'`.
    :param label: human-readable label, e.g. `'fibrillin-1 isoform a preproprotein'`.
    :param uniprot_json: path to a local JSON file downloaded from UniProt with information about protein features.
    :param protein_length: a positive `int` representing the number of amino acids included in the protein sequence.
    :returns: `ProteinMetadata` with the features that could be parsed from the file.
    :raises OSError: if the file cannot be opened.
    :raises ValueError: if the file content is not valid JSON (`json.JSONDecodeError` is a `ValueError`).
    :raises KeyError: if the JSON document has no `"features"` field.
    """
    import logging

    logger = logging.getLogger(__name__)

    # Read with an explicit encoding so that parsing does not depend on the locale default.
    with open(uniprot_json, encoding="utf-8") as json_file:
        data = json.load(json_file)

    region_list = list()
    for feature in data["features"]:
        try:
            location = feature["location"]
            # convert to 0-based coordinates: the start is shifted by one, the end is used as is
            region_start = int(location["start"]["value"]) - 1
            region_end = int(location["end"]["value"])
            feature_type = FeatureType.from_string(feature["type"])
            finfo = FeatureInfo(
                name=feature["description"],
                region=Region(start=region_start, end=region_end),
            )
            region_list.append(
                ProteinFeature.create(info=finfo, feature_type=feature_type)
            )
        except Exception as feature_exception:
            # Deliberately best-effort: one unparsable feature must not discard the others.
            logger.warning("Could not parse feature: %s (skipping)", feature_exception)

    return ProteinMetadata(
        protein_id=protein_id,
        label=label,
        protein_features=region_list,
        protein_length=protein_length,
    )


def __init__(
self,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"entryType":"UniProtKB reviewed (Swiss-Prot)","primaryAccession":"Q8IZT6","features":[{"type":"Domain","location":{"start":{"value":920,"modifier":"EXACT"},"end":{"value":1056,"modifier":"EXACT"}},"description":"Calponin-homology (CH) 1","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00044"}]},{"type":"Domain","location":{"start":{"value":1110,"modifier":"EXACT"},"end":{"value":1261,"modifier":"EXACT"}},"description":"Calponin-homology (CH) 2","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00044"}]},{"type":"Domain","location":{"start":{"value":1347,"modifier":"EXACT"},"end":{"value":1378,"modifier":"EXACT"}},"description":"IQ 1","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1393,"modifier":"EXACT"},"end":{"value":1422,"modifier":"EXACT"}},"description":"IQ 2","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1582,"modifier":"EXACT"},"end":{"value":1613,"modifier":"EXACT"}},"description":"IQ 3","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1632,"modifier":"EXACT"},"end":{"value":1661,"modifier":"EXACT"}},"description":"IQ 4","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1655,"modifier":"EXACT"},"end":{"value":1684,"modifier":"EXACT"}},"description":"IQ 5","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1728,"modifier":"EXACT"},"end":{"value":1757,"modifier":"EXACT"}},"description":"IQ 
6","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1751,"modifier":"EXACT"},"end":{"value":1782,"modifier":"EXACT"}},"description":"IQ 7","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1801,"modifier":"EXACT"},"end":{"value":1830,"modifier":"EXACT"}},"description":"IQ 8","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1824,"modifier":"EXACT"},"end":{"value":1853,"modifier":"EXACT"}},"description":"IQ 9","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1874,"modifier":"EXACT"},"end":{"value":1903,"modifier":"EXACT"}},"description":"IQ 10","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1897,"modifier":"EXACT"},"end":{"value":1928,"modifier":"EXACT"}},"description":"IQ 11","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1947,"modifier":"EXACT"},"end":{"value":1978,"modifier":"EXACT"}},"description":"IQ 12","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":1970,"modifier":"EXACT"},"end":{"value":2001,"modifier":"EXACT"}},"description":"IQ 13","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2020,"modifier":"EXACT"},"end":{"value":2049,"modifier":"EXACT"}},"description":"IQ 
14","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2043,"modifier":"EXACT"},"end":{"value":2074,"modifier":"EXACT"}},"description":"IQ 15","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2093,"modifier":"EXACT"},"end":{"value":2124,"modifier":"EXACT"}},"description":"IQ 16","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2116,"modifier":"EXACT"},"end":{"value":2147,"modifier":"EXACT"}},"description":"IQ 17","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2166,"modifier":"EXACT"},"end":{"value":2197,"modifier":"EXACT"}},"description":"IQ 18","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2189,"modifier":"EXACT"},"end":{"value":2218,"modifier":"EXACT"}},"description":"IQ 19","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2239,"modifier":"EXACT"},"end":{"value":2270,"modifier":"EXACT"}},"description":"IQ 20","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2262,"modifier":"EXACT"},"end":{"value":2293,"modifier":"EXACT"}},"description":"IQ 21","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2311,"modifier":"EXACT"},"end":{"value":2342,"modifier":"EXACT"}},"description":"IQ 
22","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2334,"modifier":"EXACT"},"end":{"value":2365,"modifier":"EXACT"}},"description":"IQ 23","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2384,"modifier":"EXACT"},"end":{"value":2415,"modifier":"EXACT"}},"description":"IQ 24","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2407,"modifier":"EXACT"},"end":{"value":2438,"modifier":"EXACT"}},"description":"IQ 25","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2457,"modifier":"EXACT"},"end":{"value":2488,"modifier":"EXACT"}},"description":"IQ 26","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2530,"modifier":"EXACT"},"end":{"value":2561,"modifier":"EXACT"}},"description":"IQ 27","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2624,"modifier":"EXACT"},"end":{"value":2653,"modifier":"EXACT"}},"description":"IQ 28","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2665,"modifier":"EXACT"},"end":{"value":2696,"modifier":"EXACT"}},"description":"IQ 29","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2688,"modifier":"EXACT"},"end":{"value":2719,"modifier":"EXACT"}},"description":"IQ 
30","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2738,"modifier":"EXACT"},"end":{"value":2767,"modifier":"EXACT"}},"description":"IQ 31","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2859,"modifier":"EXACT"},"end":{"value":2890,"modifier":"EXACT"}},"description":"IQ 32","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2909,"modifier":"EXACT"},"end":{"value":2938,"modifier":"EXACT"}},"description":"IQ 33","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2932,"modifier":"EXACT"},"end":{"value":2963,"modifier":"EXACT"}},"description":"IQ 34","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":2954,"modifier":"EXACT"},"end":{"value":2985,"modifier":"EXACT"}},"description":"IQ 35","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":3029,"modifier":"EXACT"},"end":{"value":3060,"modifier":"EXACT"}},"description":"IQ 36","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":3079,"modifier":"EXACT"},"end":{"value":3110,"modifier":"EXACT"}},"description":"IQ 37","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":3181,"modifier":"EXACT"},"end":{"value":3210,"modifier":"EXACT"}},"description":"IQ 
38","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Domain","location":{"start":{"value":3204,"modifier":"EXACT"},"end":{"value":3235,"modifier":"EXACT"}},"description":"IQ 39","evidences":[{"evidenceCode":"ECO:0000255","source":"PROSITE-ProRule","id":"PRU00116"}]},{"type":"Region","location":{"start":{"value":1,"modifier":"EXACT"},"end":{"value":30,"modifier":"EXACT"}},"description":"Disordered","evidences":[{"evidenceCode":"ECO:0000256","source":"SAM","id":"MobiDB-lite"}]},{"type":"Region","location":{"start":{"value":308,"modifier":"EXACT"},"end":{"value":409,"modifier":"EXACT"}},"description":"Sufficient for interaction with KATNA1:KATNB1","evidences":[{"evidenceCode":"ECO:0000250","source":"UniProtKB","id":"Q8CJ27"}]},{"type":"Region","location":{"start":{"value":415,"modifier":"EXACT"},"end":{"value":443,"modifier":"EXACT"}},"description":"Disordered","evidences":[{"evidenceCode":"ECO:0000256","source":"SAM","id":"MobiDB-lite"}]},{"type":"Region","location":{"start":{"value":559,"modifier":"EXACT"},"end":{"value":581,"modifier":"EXACT"}},"description":"Disordered","evidences":[{"evidenceCode":"ECO:0000256","source":"SAM","id":"MobiDB-lite"}]},{"type":"Coiled coil","location":{"start":{"value":1057,"modifier":"EXACT"},"end":{"value":1078,"modifier":"EXACT"}},"description":"","evidences":[{"evidenceCode":"ECO:0000255"}]},{"type":"Compositional bias","location":{"start":{"value":559,"modifier":"EXACT"},"end":{"value":574,"modifier":"EXACT"}},"description":"Polar residues","evidences":[{"evidenceCode":"ECO:0000256","source":"SAM","id":"MobiDB-lite"}]}],"extraAttributes":{"uniParcId":"UPI0000458904"}}
42 changes: 42 additions & 0 deletions tests/preprocessing/test_uniprot_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import pytest
import pandas as pd
import os

from gpsea.model import ProteinMetadata, FeatureType


# Protein accession and length passed to `ProteinMetadata.from_uniprot_json` by the tests in this module.
# Homo sapiens inositol 1,4,5-trisphosphate receptor type 1 (ITPR1), transcript variant 4, mRNA
# NOTE(review): the tests combine these ITPR1 values with the `Q8IZT6_manual_download.json` file;
# confirm that the accession and the length belong to the protein described by that file.
ITPR1_protein_id = "NP_001365381.1"
ITPR1_protein_len = 2758



class TestUniprotJsonToMetadata:
    """
    Test function that ingests UniProt JSON and transforms it to a ProteinMetadata object
    """

    @pytest.fixture
    def Q8IZT6_json_file_path(
        self,
        fpath_preprocessing_data_dir: str,
    ) -> str:
        """
        :returns: path to the manually downloaded UniProt JSON file for Q8IZT6
        """
        # `self` must be declared explicitly: on a fixture method the first parameter is bound
        # to the test class instance, so without it the directory fixture is never requested.
        # `fpath_preprocessing_data_dir` is presumably provided by a conftest - not visible here.
        return os.path.join(fpath_preprocessing_data_dir, "Q8IZT6_manual_download.json")

    @pytest.fixture
    def q8izt6_protein_metadata(
        self,
        Q8IZT6_json_file_path: str,
    ) -> ProteinMetadata:
        """
        :returns: ProteinMetadata created from a downloaded UniProt JSON file
        """
        return ProteinMetadata.from_uniprot_json(
            protein_id=ITPR1_protein_id,
            label=ITPR1_protein_id,
            uniprot_json=Q8IZT6_json_file_path,
            protein_length=ITPR1_protein_len,
        )

    def test_general_info(
        self,
        q8izt6_protein_metadata: ProteinMetadata,
    ):
        """
        Check that the id and the length are passed through and that the features are parsed.
        """
        assert ITPR1_protein_len == q8izt6_protein_metadata.protein_length
        assert ITPR1_protein_id == q8izt6_protein_metadata.protein_id
        # NOTE(review): the expected count depends on which feature types
        # `FeatureType.from_string` accepts - confirm against the JSON file.
        assert len(q8izt6_protein_metadata.protein_features) == 2

0 comments on commit 6b375ca

Please sign in to comment.