diff --git a/isic/core/constants.py b/isic/core/constants.py index 18f4cc62..4eae8de0 100644 --- a/isic/core/constants.py +++ b/isic/core/constants.py @@ -1,2 +1,4 @@ MONGO_ID_REGEX = "[0-9a-f]{24}" ISIC_ID_REGEX = "ISIC_[0-9]{7}" +LESION_ID_REGEX = "IL_[0-9]{7}" +PATIENT_ID_REGEX = "IP_[0-9]{7}" diff --git a/isic/core/dsl.py b/isic/core/dsl.py index 77624788..31eee327 100644 --- a/isic/core/dsl.py +++ b/isic/core/dsl.py @@ -113,6 +113,10 @@ def convert_term(s, loc, toks): # isic_id can't be used with wildcards since it's a foreign key, so join the table and # refer to the __id. return "isic__id" + elif toks[0] == "lesion_id": + return "accession__lesion__id" + elif toks[0] == "patient_id": + return "accession__patient__id" elif toks[0] == "age_approx": return "accession__metadata__age__approx" elif toks[0] == "copyright_license": diff --git a/isic/core/models/collection.py b/isic/core/models/collection.py index c3bd818d..8a885b65 100644 --- a/isic/core/models/collection.py +++ b/isic/core/models/collection.py @@ -99,6 +99,24 @@ def doi_url(self): if self.doi: return f"https://doi.org/{self.doi}" + @property + def num_lesions(self): + return ( + self.images.exclude(accession__lesion_id=None) + .values("accession__lesion_id") + .distinct() + .count() + ) + + @property + def num_patients(self): + return ( + self.images.exclude(accession__patient_id=None) + .values("accession__patient_id") + .distinct() + .count() + ) + def full_clean(self, exclude=None, validate_unique=True): if self.pk and self.public and self.images.private().exists(): raise ValidationError("Can't make collection public, it contains private images.") diff --git a/isic/core/models/image.py b/isic/core/models/image.py index 4bce0317..cdf88453 100644 --- a/isic/core/models/image.py +++ b/isic/core/models/image.py @@ -1,3 +1,5 @@ +from copy import deepcopy + from django.contrib.auth.models import User from django.contrib.postgres.aggregates.general import ArrayAgg from django.contrib.postgres.indexes import GinIndex, OpClass @@ -77,25 +79,36 @@ def __str__(self): def get_absolute_url(self): return reverse("core/image-detail", args=[self.pk]) - @staticmethod - def _image_metadata(accession_metadata: dict) -> dict: - """ - Return the metadata for an image given its accession metadata. + @property + def has_patient(self) -> bool: + return self.accession.patient_id is not None - Note that the metadata for the image includes a rounded age approximation, but not the - original age. + @property + def has_lesion(self) -> bool: + return self.accession.lesion_id is not None - The static version of this method is useful when dealing with dictionary representations. + @property + def metadata(self) -> dict: """ - if "age" in accession_metadata: - accession_metadata["age_approx"] = Accession._age_approx(accession_metadata["age"]) - del accession_metadata["age"] + Return the metadata for an image. - return accession_metadata + Note that the metadata for the image is sanitized unlike the metadata for the accession + which is behind the "firewall" of ingest. This includes rounded ages and obfuscated + longitudinal IDs. + """ + image_metadata = deepcopy(self.accession.metadata) - @property - def metadata(self) -> dict: - return Image._image_metadata(self.accession.metadata) + if "age" in image_metadata: + image_metadata["age_approx"] = Accession._age_approx(image_metadata["age"]) + del image_metadata["age"] + + if self.has_lesion: + image_metadata["lesion_id"] = self.accession.lesion_id + + if self.has_patient: + image_metadata["patient_id"] = self.accession.patient_id + + return image_metadata def to_elasticsearch_document(self, body_only=False) -> dict: # Can only be called on images that were fetched with with_elasticsearch_properties. @@ -119,27 +132,25 @@ def to_elasticsearch_document(self, body_only=False) -> dict: # index the document by image.pk so it can be updated later. return {"_id": self.pk, "_source": document} - def _with_same_metadata(self, metadata_key: str) -> QuerySet["Image"]: - if self.accession.metadata.get(metadata_key): - return ( - Image.objects.filter(accession__cohort_id=self.accession.cohort_id) - .filter( - **{ - f"accession__metadata__{metadata_key}": self.accession.metadata[ - metadata_key - ] - } - ) - .exclude(pk=self.pk) - ) - else: + def same_patient_images(self) -> QuerySet["Image"]: + if not self.has_patient: return Image.objects.none() - def same_patient_images(self) -> QuerySet["Image"]: - return self._with_same_metadata("patient_id") + return ( + Image.objects.filter(accession__cohort_id=self.accession.cohort_id) + .filter(**{"accession__patient_id": self.accession.patient_id}) + .exclude(pk=self.pk) + ) def same_lesion_images(self) -> QuerySet["Image"]: - return self._with_same_metadata("lesion_id") + if not self.has_lesion: + return Image.objects.none() + + return ( + Image.objects.filter(accession__cohort_id=self.accession.cohort_id) + .filter(**{"accession__lesion_id": self.accession.lesion_id}) + .exclude(pk=self.pk) + ) class ImageShare(TimeStampedModel): diff --git a/isic/core/serializers.py b/isic/core/serializers.py index 1ab02263..a470ea3c 100644 --- a/isic/core/serializers.py +++ b/isic/core/serializers.py @@ -25,9 +25,11 @@ def valid_search_query(cls, value: str | None): def collections_to_list(cls, value: str | list[int]): if isinstance(value, str) and value: return [int(x) for x in value.split(",")] - elif isinstance(value, list): + elif isinstance(value, list) and len(value) == 1 and isinstance(value[0], str): # TODO: this is a hack to get around the fact that ninja uses a swagger array input # field for list types regardless. + return cls.collections_to_list(value[0]) + elif isinstance(value, list) and value: return value return None diff --git a/isic/core/services/__init__.py b/isic/core/services/__init__.py index 0d4692e0..7d7ef181 100644 --- a/isic/core/services/__init__.py +++ b/isic/core/services/__init__.py @@ -11,19 +11,26 @@ class JsonKeys(Func): function = "jsonb_object_keys" -def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]: +def image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]: headers = ["isic_id", "attribution", "copyright_license"] + accession_qs = Accession.objects.filter(image__in=qs) + # depending on which queryset is passed in, the set of headers is different. # get the superset of headers for this particular queryset. used_metadata_keys = list( - Accession.objects.filter(image__in=qs) - .annotate(metadata_keys=JsonKeys("metadata")) + accession_qs.annotate(metadata_keys=JsonKeys("metadata")) .order_by() .values_list("metadata_keys", flat=True) .distinct() ) + if accession_qs.exclude(lesion=None).exists(): + used_metadata_keys.append("lesion_id") + + if accession_qs.exclude(patient=None).exists(): + used_metadata_keys.append("patient_id") + # TODO: this is a very leaky part of sensitive metadata handling that # should be refactored. if "age" in used_metadata_keys: @@ -34,17 +41,33 @@ def _image_metadata_csv_headers(*, qs: QuerySet[Image]) -> list[str]: def image_metadata_csv_rows(*, qs: QuerySet[Image]) -> Iterator[dict]: + # Note this uses .values because populating django ORM objects is very slow, and doing this on + # large querysets can add ~5s per 100k images to the request time. for image in qs.order_by("isic_id").values( "isic_id", "accession__cohort__attribution", "accession__copyright_license", "accession__metadata", + "accession__lesion_id", + "accession__patient_id", ): + if "age" in image["accession__metadata"]: + image["accession__metadata"]["age_approx"] = Accession._age_approx( + image["accession__metadata"]["age"] + ) + del image["accession__metadata"]["age"] + + if image["accession__lesion_id"]: + image["accession__metadata"]["lesion_id"] = image["accession__lesion_id"] + + if image["accession__patient_id"]: + image["accession__metadata"]["patient_id"] = image["accession__patient_id"] + yield { **{ "isic_id": image["isic_id"], "attribution": image["accession__cohort__attribution"], "copyright_license": image["accession__copyright_license"], - **Image._image_metadata(image["accession__metadata"]), + **image["accession__metadata"], } } diff --git a/isic/core/templates/core/collection_detail.html b/isic/core/templates/core/collection_detail.html index 972a2ce6..69ade630 100644 --- a/isic/core/templates/core/collection_detail.html +++ b/isic/core/templates/core/collection_detail.html @@ -56,6 +56,14 @@ Number of images: {{ num_images|intcomma }} +
  • + Number of patients: + {{ collection.num_patients|intcomma }} +
  • +
  • + Number of lesions: + {{ collection.num_lesions|intcomma }} +
  • Licenses: