Merge pull request #771 from ImageMarkup/structured-patient-ids

ImageMarkup · Oct 11, 2023 · 35b880a · 35b880a
2 parents 68b5d3b + 93c8342
commit 35b880a
Show file tree

Hide file tree

Showing 35 changed files with 716 additions and 64 deletions.
diff --git a/isic/core/constants.py b/isic/core/constants.py
@@ -1,2 +1,4 @@
 MONGO_ID_REGEX = "[0-9a-f]{24}"
 ISIC_ID_REGEX = "ISIC_[0-9]{7}"
+LESION_ID_REGEX = "IL_[0-9]{7}"
+PATIENT_ID_REGEX = "IP_[0-9]{7}"
diff --git a/isic/core/dsl.py b/isic/core/dsl.py
@@ -113,6 +113,10 @@ def convert_term(s, loc, toks):
         # isic_id can't be used with wildcards since it's a foreign key, so join the table and
         # refer to the __id.
         return "isic__id"
+    elif toks[0] == "lesion_id":
+        return "accession__lesion__id"
+    elif toks[0] == "patient_id":
+        return "accession__patient__id"
     elif toks[0] == "age_approx":
         return "accession__metadata__age__approx"
     elif toks[0] == "copyright_license":

diff --git a/isic/core/models/collection.py b/isic/core/models/collection.py
@@ -99,6 +99,24 @@ def doi_url(self):
         if self.doi:
             return f"https://doi.org/{self.doi}"
 
+    @property
+    def num_lesions(self):
+        return (
+            self.images.exclude(accession__lesion_id=None)
+            .values("accession__lesion_id")
+            .distinct()
+            .count()
+        )
+
+    @property
+    def num_patients(self):
+        return (
+            self.images.exclude(accession__patient_id=None)
+            .values("accession__patient_id")
+            .distinct()
+            .count()
+        )
+
     def full_clean(self, exclude=None, validate_unique=True):
         if self.pk and self.public and self.images.private().exists():
             raise ValidationError("Can't make collection public, it contains private images.")

diff --git a/isic/core/models/image.py b/isic/core/models/image.py
@@ -1,3 +1,5 @@
+from copy import deepcopy
+
 from django.contrib.auth.models import User
 from django.contrib.postgres.aggregates.general import ArrayAgg
 from django.contrib.postgres.indexes import GinIndex, OpClass
@@ -77,25 +79,36 @@ def __str__(self):
     def get_absolute_url(self):
         return reverse("core/image-detail", args=[self.pk])
 
-    @staticmethod
-    def _image_metadata(accession_metadata: dict) -> dict:
-        """
-        Return the metadata for an image given its accession metadata.
+    @property
+    def has_patient(self) -> bool:
+        return self.accession.patient_id is not None
 
-        Note that the metadata for the image includes a rounded age approximation, but not the
-        original age.
+    @property
+    def has_lesion(self) -> bool:
+        return self.accession.lesion_id is not None
 
-        The static version of this method is useful when dealing with dictionary representations.
+    @property
+    def metadata(self) -> dict:
         """
-        if "age" in accession_metadata:
-            accession_metadata["age_approx"] = Accession._age_approx(accession_metadata["age"])
-            del accession_metadata["age"]
+        Return the metadata for an image.
 
-        return accession_metadata
+        Note that the metadata for the image is sanitized unlike the metadata for the accession
+        which is behind the "firewall" of ingest. This includes rounded ages and obfuscated
+        longitudinal IDs.
+        """
+        image_metadata = deepcopy(self.accession.metadata)
 
-    @property
-    def metadata(self) -> dict:
-        return Image._image_metadata(self.accession.metadata)
+        if "age" in image_metadata:
+            image_metadata["age_approx"] = Accession._age_approx(image_metadata["age"])
+            del image_metadata["age"]
+
+        if self.has_lesion:
+            image_metadata["lesion_id"] = self.accession.lesion_id
+
+        if self.has_patient:
+            image_metadata["patient_id"] = self.accession.patient_id
+
+        return image_metadata
 
     def to_elasticsearch_document(self, body_only=False) -> dict:
         # Can only be called on images that were fetched with with_elasticsearch_properties.
@@ -119,27 +132,25 @@ def to_elasticsearch_document(self, body_only=False) -> dict:
             # index the document by image.pk so it can be updated later.
             return {"_id": self.pk, "_source": document}
 
-    def _with_same_metadata(self, metadata_key: str) -> QuerySet["Image"]:
-        if self.accession.metadata.get(metadata_key):
-            return (
-                Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
-                .filter(
-                    **{
-                        f"accession__metadata__{metadata_key}": self.accession.metadata[
-                            metadata_key
-                        ]
-                    }
-                )
-                .exclude(pk=self.pk)
-            )
-        else:
+    def same_patient_images(self) -> QuerySet["Image"]:
+        if not self.has_patient:
             return Image.objects.none()
 
-    def same_patient_images(self) -> QuerySet["Image"]:
-        return self._with_same_metadata("patient_id")
+        return (
+            Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
+            .filter(**{"accession__patient_id": self.accession.patient_id})
+            .exclude(pk=self.pk)
+        )
 
     def same_lesion_images(self) -> QuerySet["Image"]:
-        return self._with_same_metadata("lesion_id")
+        if not self.has_lesion:
+            return Image.objects.none()
+
+        return (
+            Image.objects.filter(accession__cohort_id=self.accession.cohort_id)
+            .filter(**{"accession__lesion_id": self.accession.lesion_id})
+            .exclude(pk=self.pk)
+        )
 
 
 class ImageShare(TimeStampedModel):

diff --git a/isic/core/serializers.py b/isic/core/serializers.py
@@ -25,9 +25,11 @@ def valid_search_query(cls, value: str | None):
     def collections_to_list(cls, value: str | list[int]):
         if isinstance(value, str) and value:
             return [int(x) for x in value.split(",")]
-        elif isinstance(value, list):
+        elif isinstance(value, list) and len(value) == 1 and isinstance(value[0], str):
             # TODO: this is a hack to get around the fact that ninja uses a swagger array input
             # field for list types regardless.
+            return cls.collections_to_list(value[0])
+        elif isinstance(value, list) and value:
             return value
         return None
 

diff --git a/isic/core/services/__init__.py b/isic/core/services/__init__.py
@@ -11,19 +11,26 @@ class JsonKeys(Func):
     function = "jsonb_object_keys"
 
 
-def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:
+def image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:
     headers = ["isic_id", "attribution", "copyright_license"]
 
+    accession_qs = Accession.objects.filter(image__in=qs)
+
     # depending on which queryset is passed in, the set of headers is different.
     # get the superset of headers for this particular queryset.
     used_metadata_keys = list(
-        Accession.objects.filter(image__in=qs)
-        .annotate(metadata_keys=JsonKeys("metadata"))
+        accession_qs.annotate(metadata_keys=JsonKeys("metadata"))
         .order_by()
         .values_list("metadata_keys", flat=True)
         .distinct()
     )
 
+    if accession_qs.exclude(lesion=None).exists():
+        used_metadata_keys.append("lesion_id")
+
+    if accession_qs.exclude(patient=None).exists():
+        used_metadata_keys.append("patient_id")
+
     # TODO: this is a very leaky part of sensitive metadata handling that
     # should be refactored.
     if "age" in used_metadata_keys:
@@ -34,17 +41,33 @@ def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]:
 
 
 def image_metadata_csv_rows(*, qs: QuerySet[Image]) -> Iterator[dict]:
+    # Note this uses .values because populating django ORM objects is very slow, and doing this on
+    # large querysets can add ~5s per 100k images to the request time.
     for image in qs.order_by("isic_id").values(
         "isic_id",
         "accession__cohort__attribution",
         "accession__copyright_license",
         "accession__metadata",
+        "accession__lesion_id",
+        "accession__patient_id",
     ):
+        if "age" in image["accession__metadata"]:
+            image["accession__metadata"]["age_approx"] = Accession._age_approx(
+                image["accession__metadata"]["age"]
+            )
+            del image["accession__metadata"]["age"]
+
+        if image["accession__lesion_id"]:
+            image["accession__metadata"]["lesion_id"] = image["accession__lesion_id"]
+
+        if image["accession__patient_id"]:
+            image["accession__metadata"]["patient_id"] = image["accession__patient_id"]
+
         yield {
             **{
                 "isic_id": image["isic_id"],
                 "attribution": image["accession__cohort__attribution"],
                 "copyright_license": image["accession__copyright_license"],
-                **Image._image_metadata(image["accession__metadata"]),
+                **image["accession__metadata"],
             }
         }
diff --git a/isic/core/templates/core/collection_detail.html b/isic/core/templates/core/collection_detail.html
@@ -56,6 +56,14 @@
         <span class="font-bold">Number of images:</span>
         {{ num_images|intcomma }}
       </li>
+      <li>
+        <span class="font-bold">Number of patients:</span>
+        {{ collection.num_patients|intcomma }}
+      </li>
+      <li>
+        <span class="font-bold">Number of lesions:</span>
+        {{ collection.num_lesions|intcomma }}
+      </li>
       <li>
         <span class="font-bold">Licenses:</span>
           <ul class="list-disc">

diff --git a/isic/core/tests/test_dsl.py b/isic/core/tests/test_dsl.py
@@ -11,6 +11,10 @@
         # test isic_id especially due to the weirdness of the foreign key
         ["isic_id:ISIC_123*", Q(isic__id__startswith="ISIC_123")],
         ["isic_id:*123", Q(isic__id__endswith="123")],
+        ["lesion_id:IL_123*", Q(accession__lesion__id__startswith="IL_123")],
+        ["lesion_id:*123", Q(accession__lesion__id__endswith="123")],
+        ["patient_id:IP_123*", Q(accession__patient__id__startswith="IP_123")],
+        ["patient_id:*123", Q(accession__patient__id__endswith="123")],
         ['copyright_license:"CC-0"', Q(accession__copyright_license="CC-0")],
         ["diagnosis:foobar", Q(accession__metadata__diagnosis="foobar")],
         ['diagnosis:"foo bar"', Q(accession__metadata__diagnosis="foo bar")],

diff --git a/isic/core/tests/test_metadata_masking_api.py b/isic/core/tests/test_metadata_masking_api.py
@@ -0,0 +1,55 @@
+import pytest
+
+from isic.core.models.image import Image
+from isic.core.services import image_metadata_csv_headers, image_metadata_csv_rows
+
+
+@pytest.fixture
+def image_with_maskable_metadata(image):
+    image.accession.update_metadata(
+        image.creator,
+        {
+            "age": 32,
+            "lesion_id": "supersecretlesionid",
+            "patient_id": "supersecretpatientid",
+        },
+        ignore_image_check=True,
+    )
+    return image
+
+
+@pytest.mark.django_db
+def test_accession_exposes_unsafe_metadata(image_with_maskable_metadata):
+    assert image_with_maskable_metadata.accession.metadata["age"] == 32
+    assert "age_approx" not in image_with_maskable_metadata.accession.metadata
+    assert "lesion_id" not in image_with_maskable_metadata.accession.metadata
+    assert "patient_id" not in image_with_maskable_metadata.accession.metadata
+
+
+@pytest.mark.django_db
+def test_image_exposes_safe_metadata(image_with_maskable_metadata):
+    assert image_with_maskable_metadata.metadata["age_approx"] == 30
+    assert "age" not in image_with_maskable_metadata.metadata
+    assert image_with_maskable_metadata.metadata["lesion_id"] != "supersecretlesionid"
+    assert image_with_maskable_metadata.metadata["patient_id"] != "supersecretpatientid"
+
+
+@pytest.mark.django_db
+def test_image_csv_headers_exposes_safe_metadata(image_with_maskable_metadata):
+    headers = image_metadata_csv_headers(qs=Image.objects.all())
+    assert "age" not in headers
+    assert "age_approx" in headers
+    assert "lesion_id" in headers
+    assert "patient_id" in headers
+
+
+@pytest.mark.django_db
+def test_image_csv_rows_exposes_safe_metadata(image_with_maskable_metadata):
+    rows = image_metadata_csv_rows(qs=Image.objects.all())
+    for row in rows:
+        assert "age" not in row
+        assert "age_approx" in row
+        assert "lesion_id" in row
+        assert "patient_id" in row
+        assert row["lesion_id"] != "supersecretlesionid"
+        assert row["patient_id"] != "supersecretpatientid"
diff --git a/isic/core/tests/test_view_image_detail.py b/isic/core/tests/test_view_image_detail.py
@@ -2,6 +2,8 @@
 import pytest
 from pytest_lazyfixture import lazy_fixture
 
+from isic.core.models.image import Image
+
 
 @pytest.mark.django_db
 @pytest.mark.parametrize(
@@ -25,24 +27,29 @@ def detailed_image(
     image_factory, user_factory, study_factory, study_task_factory, collection_factory
 ):
     user = user_factory()
+
     metadata = {
         "age": 32,
-        "lesion_id": "IL_123456",
-        "patient_id": "IP_123456",
+        "lesion_id": "IL_1234567",
+        "patient_id": "IP_1234567",
     }
 
     private_collection = collection_factory(public=False, pinned=True)
     public_collection = collection_factory(public=True, pinned=True)
     private_study = study_factory(public=False)
     public_study = study_factory(public=True)
-    main_image = image_factory(
-        public=True, accession__metadata=metadata, accession__cohort__contributor__owners=[user]
+    main_image: Image = image_factory(
+        public=True,
+        accession__cohort__contributor__owners=[user],
     )
 
-    # create an image w/ the same lesion/patient ID
-    image_factory(
-        public=True, accession__metadata=metadata, accession__cohort__contributor__owners=[user]
+    # create an image w/ the same longitudinal information
+    secondary_image: Image = image_factory(
+        public=True,
+        accession__cohort__contributor__owners=[user],
     )
+    main_image.accession.update_metadata(user, metadata, ignore_image_check=True)
+    secondary_image.accession.update_metadata(user, metadata, ignore_image_check=True)
 
     private_collection.images.add(main_image)
     public_collection.images.add(main_image)

diff --git a/isic/core/views/collections.py b/isic/core/views/collections.py
@@ -18,7 +18,7 @@
 from isic.core.models import Collection
 from isic.core.models.base import CopyrightLicense
 from isic.core.permissions import get_visible_objects, needs_object_permission
-from isic.core.services import _image_metadata_csv_headers, image_metadata_csv_rows
+from isic.core.services import image_metadata_csv_headers, image_metadata_csv_rows
 from isic.core.services.collection import collection_create, collection_update
 from isic.core.services.collection.doi import (
     collection_build_doi_preview,
@@ -117,7 +117,7 @@ def collection_download_metadata(request, pk):
         "Content-Disposition"
     ] = f'attachment; filename="{slugify(collection.name)}_metadata_{current_time}.csv"'
 
-    writer = csv.DictWriter(response, _image_metadata_csv_headers(qs=qs))
+    writer = csv.DictWriter(response, image_metadata_csv_headers(qs=qs))
     writer.writeheader()
 
     for metadata_row in image_metadata_csv_rows(qs=qs):