Skip to content

Commit

Permalink
Merge pull request #771 from ImageMarkup/structured-patient-ids
Browse files Browse the repository at this point in the history
  • Loading branch information
danlamanna committed Oct 11, 2023
2 parents 68b5d3b + 93c8342 commit 35b880a
Show file tree
Hide file tree
Showing 35 changed files with 716 additions and 64 deletions.
2 changes: 2 additions & 0 deletions isic/core/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
MONGO_ID_REGEX = "[0-9a-f]{24}"
ISIC_ID_REGEX = "ISIC_[0-9]{7}"
LESION_ID_REGEX = "IL_[0-9]{7}"
PATIENT_ID_REGEX = "IP_[0-9]{7}"
4 changes: 4 additions & 0 deletions isic/core/dsl.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ def convert_term(s, loc, toks):
# isic_id can't be used with wildcards since it's a foreign key, so join the table and
# refer to the __id.
return "isic__id"
elif toks[0] == "lesion_id":
return "accession__lesion__id"
elif toks[0] == "patient_id":
return "accession__patient__id"
elif toks[0] == "age_approx":
return "accession__metadata__age__approx"
elif toks[0] == "copyright_license":
Expand Down
18 changes: 18 additions & 0 deletions isic/core/models/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,24 @@ def doi_url(self):
if self.doi:
return f"https://doi.org/{self.doi}"

@property
def num_lesions(self):
return (
self.images.exclude(accession__lesion_id=None)
.values("accession__lesion_id")
.distinct()
.count()
)

@property
def num_patients(self):
return (
self.images.exclude(accession__patient_id=None)
.values("accession__patient_id")
.distinct()
.count()
)

def full_clean(self, exclude=None, validate_unique=True):
if self.pk and self.public and self.images.private().exists():
raise ValidationError("Can't make collection public, it contains private images.")
Expand Down
73 changes: 42 additions & 31 deletions isic/core/models/image.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from copy import deepcopy

from django.contrib.auth.models import User
from django.contrib.postgres.aggregates.general import ArrayAgg
from django.contrib.postgres.indexes import GinIndex, OpClass
Expand Down Expand Up @@ -77,25 +79,36 @@ def __str__(self):
def get_absolute_url(self):
return reverse("core/image-detail", args=[self.pk])

@staticmethod
def _image_metadata(accession_metadata: dict) -> dict:
"""
Return the metadata for an image given its accession metadata.
@property
def has_patient(self) -> bool:
return self.accession.patient_id is not None

Note that the metadata for the image includes a rounded age approximation, but not the
original age.
@property
def has_lesion(self) -> bool:
return self.accession.lesion_id is not None

The static version of this method is useful when dealing with dictionary representations.
@property
def metadata(self) -> dict:
"""
if "age" in accession_metadata:
accession_metadata["age_approx"] = Accession._age_approx(accession_metadata["age"])
del accession_metadata["age"]
Return the metadata for an image.
return accession_metadata
Note that the metadata for the image is sanitized unlike the metadata for the accession
which is behind the "firewall" of ingest. This includes rounded ages and obfuscated
longitudinal IDs.
"""
image_metadata = deepcopy(self.accession.metadata)

@property
def metadata(self) -> dict:
return Image._image_metadata(self.accession.metadata)
if "age" in image_metadata:
image_metadata["age_approx"] = Accession._age_approx(image_metadata["age"])
del image_metadata["age"]

if self.has_lesion:
image_metadata["lesion_id"] = self.accession.lesion_id

if self.has_patient:
image_metadata["patient_id"] = self.accession.patient_id

return image_metadata

def to_elasticsearch_document(self, body_only=False) -> dict:
# Can only be called on images that were fetched with with_elasticsearch_properties.
Expand All @@ -119,27 +132,25 @@ def to_elasticsearch_document(self, body_only=False) -> dict:
# index the document by image.pk so it can be updated later.
return {"_id": self.pk, "_source": document}

def _with_same_metadata(self, metadata_key: str) -> QuerySet["Image"]:
if self.accession.metadata.get(metadata_key):
return (
Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
.filter(
**{
f"accession__metadata__{metadata_key}": self.accession.metadata[
metadata_key
]
}
)
.exclude(pk=self.pk)
)
else:
def same_patient_images(self) -> QuerySet["Image"]:
if not self.has_patient:
return Image.objects.none()

def same_patient_images(self) -> QuerySet["Image"]:
return self._with_same_metadata("patient_id")
return (
Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
.filter(**{"accession__patient_id": self.accession.patient_id})
.exclude(pk=self.pk)
)

def same_lesion_images(self) -> QuerySet["Image"]:
return self._with_same_metadata("lesion_id")
if not self.has_lesion:
return Image.objects.none()

return (
Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
.filter(**{"accession__lesion_id": self.accession.lesion_id})
.exclude(pk=self.pk)
)


class ImageShare(TimeStampedModel):
Expand Down
4 changes: 3 additions & 1 deletion isic/core/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,11 @@ def valid_search_query(cls, value: str | None):
def collections_to_list(cls, value: str | list[int]):
if isinstance(value, str) and value:
return [int(x) for x in value.split(",")]
elif isinstance(value, list):
elif isinstance(value, list) and len(value) == 1 and isinstance(value[0], str):
# TODO: this is a hack to get around the fact that ninja uses a swagger array input
# field for list types regardless.
return cls.collections_to_list(value[0])
elif isinstance(value, list) and value:
return value
return None

Expand Down
31 changes: 27 additions & 4 deletions isic/core/services/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,26 @@ class JsonKeys(Func):
function = "jsonb_object_keys"


def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:
def image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:
headers = ["isic_id", "attribution", "copyright_license"]

accession_qs = Accession.objects.filter(image__in=qs)

# depending on which queryset is passed in, the set of headers is different.
# get the superset of headers for this particular queryset.
used_metadata_keys = list(
Accession.objects.filter(image__in=qs)
.annotate(metadata_keys=JsonKeys("metadata"))
accession_qs.annotate(metadata_keys=JsonKeys("metadata"))
.order_by()
.values_list("metadata_keys", flat=True)
.distinct()
)

if accession_qs.exclude(lesion=None).exists():
used_metadata_keys.append("lesion_id")

if accession_qs.exclude(patient=None).exists():
used_metadata_keys.append("patient_id")

# TODO: this is a very leaky part of sensitive metadata handling that
# should be refactored.
if "age" in used_metadata_keys:
Expand All @@ -34,17 +41,33 @@ def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:


def image_metadata_csv_rows(*, qs: QuerySet[Image]) -> Iterator[dict]:
# Note this uses .values because populating django ORM objects is very slow, and doing this on
# large querysets can add ~5s per 100k images to the request time.
for image in qs.order_by("isic_id").values(
"isic_id",
"accession__cohort__attribution",
"accession__copyright_license",
"accession__metadata",
"accession__lesion_id",
"accession__patient_id",
):
if "age" in image["accession__metadata"]:
image["accession__metadata"]["age_approx"] = Accession._age_approx(
image["accession__metadata"]["age"]
)
del image["accession__metadata"]["age"]

if image["accession__lesion_id"]:
image["accession__metadata"]["lesion_id"] = image["accession__lesion_id"]

if image["accession__patient_id"]:
image["accession__metadata"]["patient_id"] = image["accession__patient_id"]

yield {
**{
"isic_id": image["isic_id"],
"attribution": image["accession__cohort__attribution"],
"copyright_license": image["accession__copyright_license"],
**Image._image_metadata(image["accession__metadata"]),
**image["accession__metadata"],
}
}
8 changes: 8 additions & 0 deletions isic/core/templates/core/collection_detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,14 @@
<span class="font-bold">Number of images:</span>
{{ num_images|intcomma }}
</li>
<li>
<span class="font-bold">Number of patients:</span>
{{ collection.num_patients|intcomma }}
</li>
<li>
<span class="font-bold">Number of lesions:</span>
{{ collection.num_lesions|intcomma }}
</li>
<li>
<span class="font-bold">Licenses:</span>
<ul class="list-disc">
Expand Down
4 changes: 4 additions & 0 deletions isic/core/tests/test_dsl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
# test isic_id especially due to the weirdness of the foreign key
["isic_id:ISIC_123*", Q(isic__id__startswith="ISIC_123")],
["isic_id:*123", Q(isic__id__endswith="123")],
["lesion_id:IL_123*", Q(accession__lesion__id__startswith="IL_123")],
["lesion_id:*123", Q(accession__lesion__id__endswith="123")],
["patient_id:IP_123*", Q(accession__patient__id__startswith="IP_123")],
["patient_id:*123", Q(accession__patient__id__endswith="123")],
['copyright_license:"CC-0"', Q(accession__copyright_license="CC-0")],
["diagnosis:foobar", Q(accession__metadata__diagnosis="foobar")],
['diagnosis:"foo bar"', Q(accession__metadata__diagnosis="foo bar")],
Expand Down
55 changes: 55 additions & 0 deletions isic/core/tests/test_metadata_masking_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import pytest

from isic.core.models.image import Image
from isic.core.services import image_metadata_csv_headers, image_metadata_csv_rows


@pytest.fixture
def image_with_maskable_metadata(image):
image.accession.update_metadata(
image.creator,
{
"age": 32,
"lesion_id": "supersecretlesionid",
"patient_id": "supersecretpatientid",
},
ignore_image_check=True,
)
return image


@pytest.mark.django_db
def test_accession_exposes_unsafe_metadata(image_with_maskable_metadata):
assert image_with_maskable_metadata.accession.metadata["age"] == 32
assert "age_approx" not in image_with_maskable_metadata.accession.metadata
assert "lesion_id" not in image_with_maskable_metadata.accession.metadata
assert "patient_id" not in image_with_maskable_metadata.accession.metadata


@pytest.mark.django_db
def test_image_exposes_safe_metadata(image_with_maskable_metadata):
assert image_with_maskable_metadata.metadata["age_approx"] == 30
assert "age" not in image_with_maskable_metadata.metadata
assert image_with_maskable_metadata.metadata["lesion_id"] != "supersecretlesionid"
assert image_with_maskable_metadata.metadata["patient_id"] != "supersecretpatientid"


@pytest.mark.django_db
def test_image_csv_headers_exposes_safe_metadata(image_with_maskable_metadata):
headers = image_metadata_csv_headers(qs=Image.objects.all())
assert "age" not in headers
assert "age_approx" in headers
assert "lesion_id" in headers
assert "patient_id" in headers


@pytest.mark.django_db
def test_image_csv_rows_exposes_safe_metadata(image_with_maskable_metadata):
rows = image_metadata_csv_rows(qs=Image.objects.all())
for row in rows:
assert "age" not in row
assert "age_approx" in row
assert "lesion_id" in row
assert "patient_id" in row
assert row["lesion_id"] != "supersecretlesionid"
assert row["patient_id"] != "supersecretpatientid"
21 changes: 14 additions & 7 deletions isic/core/tests/test_view_image_detail.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import pytest
from pytest_lazyfixture import lazy_fixture

from isic.core.models.image import Image


@pytest.mark.django_db
@pytest.mark.parametrize(
Expand All @@ -25,24 +27,29 @@ def detailed_image(
image_factory, user_factory, study_factory, study_task_factory, collection_factory
):
user = user_factory()

metadata = {
"age": 32,
"lesion_id": "IL_123456",
"patient_id": "IP_123456",
"lesion_id": "IL_1234567",
"patient_id": "IP_1234567",
}

private_collection = collection_factory(public=False, pinned=True)
public_collection = collection_factory(public=True, pinned=True)
private_study = study_factory(public=False)
public_study = study_factory(public=True)
main_image = image_factory(
public=True, accession__metadata=metadata, accession__cohort__contributor__owners=[user]
main_image: Image = image_factory(
public=True,
accession__cohort__contributor__owners=[user],
)

# create an image w/ the same lesion/patient ID
image_factory(
public=True, accession__metadata=metadata, accession__cohort__contributor__owners=[user]
# create an image w/ the same longitudinal information
secondary_image: Image = image_factory(
public=True,
accession__cohort__contributor__owners=[user],
)
main_image.accession.update_metadata(user, metadata, ignore_image_check=True)
secondary_image.accession.update_metadata(user, metadata, ignore_image_check=True)

private_collection.images.add(main_image)
public_collection.images.add(main_image)
Expand Down
4 changes: 2 additions & 2 deletions isic/core/views/collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from isic.core.models import Collection
from isic.core.models.base import CopyrightLicense
from isic.core.permissions import get_visible_objects, needs_object_permission
from isic.core.services import _image_metadata_csv_headers, image_metadata_csv_rows
from isic.core.services import image_metadata_csv_headers, image_metadata_csv_rows
from isic.core.services.collection import collection_create, collection_update
from isic.core.services.collection.doi import (
collection_build_doi_preview,
Expand Down Expand Up @@ -117,7 +117,7 @@ def collection_download_metadata(request, pk):
"Content-Disposition"
] = f'attachment; filename="{slugify(collection.name)}_metadata_{current_time}.csv"'

writer = csv.DictWriter(response, _image_metadata_csv_headers(qs=qs))
writer = csv.DictWriter(response, image_metadata_csv_headers(qs=qs))
writer.writeheader()

for metadata_row in image_metadata_csv_rows(qs=qs):
Expand Down
Loading

0 comments on commit 35b880a

Please sign in to comment.