Commit
Merge pull request #907 from ImageMarkup/handle-bom-csvs
danlamanna committed Jun 20, 2024
2 parents abe29d6 + 0eec563 commit 4ace25a
Showing 7 changed files with 60 additions and 41 deletions.
8 changes: 4 additions & 4 deletions isic/ingest/models/metadata_file.py
@@ -1,7 +1,7 @@
 import csv
+import io

 from django.contrib.auth.models import User
-from django.core.files.base import File
 from django.core.validators import FileExtensionValidator
 from django.db import models
 from django.db.models.query import QuerySet
@@ -26,9 +26,9 @@ class MetadataFile(CreationSortedTimeStampedModel):
     def __str__(self) -> str:
         return self.blob_name

-    def to_iterable(self) -> tuple[File, csv.DictReader]:
-        with self.blob.open("r") as blob:
-            return blob, csv.DictReader(blob)
+    @staticmethod
+    def to_dict_reader(fh):
+        return csv.DictReader(io.TextIOWrapper(fh, encoding="utf-8-sig"))


 class MetadataFilePermissions:
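
The encoding="utf-8-sig" argument is the substance of this change: it lets the reader silently consume a leading UTF-8 byte-order mark, so a CSV saved from Excel no longer surfaces a "\ufefffilename" column. A minimal sketch of the difference (header and row values are made up for illustration):

import csv
import io

# Bytes as Excel typically writes them: a UTF-8 BOM, then the header row.
raw = b"\xef\xbb\xbffilename,diagnosis\r\nISIC_0000000.jpg,nevus\r\n"

# Plain utf-8 leaves the BOM glued to the first column name...
plain = csv.DictReader(io.TextIOWrapper(io.BytesIO(raw), encoding="utf-8"))
print(plain.fieldnames)  # ['\ufefffilename', 'diagnosis']

# ...while utf-8-sig, as used by to_dict_reader above, strips it.
fixed = csv.DictReader(io.TextIOWrapper(io.BytesIO(raw), encoding="utf-8-sig"))
print(fixed.fieldnames)  # ['filename', 'diagnosis']
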
54 changes: 27 additions & 27 deletions isic/ingest/tasks.py
@@ -1,5 +1,4 @@
 from collections.abc import Iterable
-from csv import DictReader
 import itertools
 import time

@@ -104,24 +103,24 @@ def validate_metadata_task(metadata_file_pk: int):
     metadata_file = MetadataFile.objects.select_related("cohort").get(pk=metadata_file_pk)

     try:
-        fh, reader = metadata_file.to_iterable()
-        successful: bool = False
-        unstructured_columns: list[str] = get_unstructured_columns(reader.fieldnames)
-        csv_check: list[Problem] | None = None
-        internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None
-        archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None
+        with metadata_file.blob.open("rb") as fh:
+            reader = MetadataFile.to_dict_reader(fh)
+            successful: bool = False
+            unstructured_columns: list[str] = get_unstructured_columns(reader.fieldnames)
+            csv_check: list[Problem] | None = None
+            internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None
+            archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None

-        with metadata_file.blob.open("r") as fh:
-            csv_check = validate_csv_format_and_filenames(DictReader(fh), metadata_file.cohort)
+            csv_check = validate_csv_format_and_filenames(reader, metadata_file.cohort)

             if not any(csv_check):
                 fh.seek(0)
-                internal_check = validate_internal_consistency(DictReader(fh))
+                internal_check = validate_internal_consistency(MetadataFile.to_dict_reader(fh))

             if not any(internal_check):
                 fh.seek(0)
                 archive_check = validate_archive_consistency(
-                    DictReader(fh), metadata_file.cohort
+                    MetadataFile.to_dict_reader(fh), metadata_file.cohort
                 )

             if not any(archive_check):
@@ -159,22 +158,23 @@ def update_metadata_task(user_pk: int, metadata_file_pk: int):
         (_ for _ in Patient.objects.select_for_update().all())
         (_ for _ in RcmCase.objects.select_for_update().all())

-        _, rows = metadata_file.to_iterable()
-
-        for chunk in itertools.batched(rows, 1_000):
-            accessions: QuerySet[Accession] = metadata_file.cohort.accessions.select_related(
-                "image", "review", "lesion", "patient", "rcm_case", "cohort"
-            ).filter(original_blob_name__in=[row["filename"] for row in chunk])
-            accessions_by_filename: dict[str, Accession] = {
-                accession.original_blob_name: accession for accession in accessions
-            }
-
-            for row in chunk:
-                # filename doesn't need to be stored in the metadata since it's equal to
-                # original_blob_name
-                accession = accessions_by_filename[row["filename"]]
-                del row["filename"]
-                accession.update_metadata(user, row)
+        with metadata_file.blob.open("rb") as blob:
+            rows = MetadataFile.to_dict_reader(blob)
+
+            for chunk in itertools.batched(rows, 1_000):
+                accessions: QuerySet[Accession] = metadata_file.cohort.accessions.select_related(
+                    "image", "review", "lesion", "patient", "rcm_case", "cohort"
+                ).filter(original_blob_name__in=[row["filename"] for row in chunk])
+                accessions_by_filename: dict[str, Accession] = {
+                    accession.original_blob_name: accession for accession in accessions
+                }
+
+                for row in chunk:
+                    # filename doesn't need to be stored in the metadata since it's equal to
+                    # original_blob_name
+                    accession = accessions_by_filename[row["filename"]]
+                    del row["filename"]
+                    accession.update_metadata(user, row)


 @shared_task(soft_time_limit=3600, time_limit=3660)
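
A detail worth noting in validate_metadata_task above: the blob is opened once in binary mode, and each later validation pass rewinds it with fh.seek(0) and wraps it in a fresh reader via MetadataFile.to_dict_reader, because a csv.DictReader can only be iterated once. A rough sketch of that rewind-and-rewrap pattern, with an in-memory stream standing in for blob.open("rb") and a local helper mirroring to_dict_reader (neither is the project's code):

import csv
import io


def to_dict_reader(fh):
    # Same shape as MetadataFile.to_dict_reader: decode as utf-8-sig so a BOM is ignored.
    return csv.DictReader(io.TextIOWrapper(fh, encoding="utf-8-sig"))


fh = io.BytesIO(b"\xef\xbb\xbffilename\r\na.jpg\r\nb.jpg\r\n")

first = to_dict_reader(fh)
assert [row["filename"] for row in first] == ["a.jpg", "b.jpg"]

fh.seek(0)                   # rewind the underlying binary stream...
second = to_dict_reader(fh)  # ...and build a fresh reader; the DictReader itself can't be rewound
assert [row["filename"] for row in second] == ["a.jpg", "b.jpg"]
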
2 changes: 2 additions & 0 deletions isic/ingest/tests/conftest.py
@@ -1,6 +1,7 @@
 import pytest

 from .csv_streams import (
+    csv_stream_bom_filename_column,
     csv_stream_duplicate_filenames,
     csv_stream_valid,
     csv_stream_without_filename_column,
@@ -13,4 +14,5 @@

 csv_stream_valid = pytest.fixture(csv_stream_valid)
 csv_stream_without_filename_column = pytest.fixture(csv_stream_without_filename_column)
+csv_stream_bom_filename_column = pytest.fixture(csv_stream_bom_filename_column)
 csv_stream_duplicate_filenames = pytest.fixture(csv_stream_duplicate_filenames)
7 changes: 7 additions & 0 deletions isic/ingest/tests/csv_streams.py
@@ -27,6 +27,13 @@ def csv_stream_without_filename_column() -> BinaryIO:
     return file_stream


+def csv_stream_bom_filename_column() -> BinaryIO:
+    file_stream = StreamWriter(io.BytesIO())
+    writer = csv.DictWriter(file_stream, fieldnames=["\ufefffilename"])
+    writer.writeheader()
+    return file_stream
+
+
 def csv_stream_duplicate_filenames() -> BinaryIO:
     file_stream = StreamWriter(io.BytesIO())
     writer = csv.DictWriter(file_stream, fieldnames=["filename"])
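
The new fixture imitates an Excel-style export: csv.DictWriter is simply handed a fieldname that already starts with "\ufeff", so the serialized header begins with the UTF-8 byte-order mark. A small sketch of the same idea, with a bare TextIOWrapper standing in for the repository's StreamWriter helper (assumed here to be a thin text-over-bytes adapter):

import csv
import io

buffer = io.BytesIO()
text = io.TextIOWrapper(buffer, encoding="utf-8", newline="")

writer = csv.DictWriter(text, fieldnames=["\ufefffilename"])
writer.writeheader()
text.flush()

# The raw bytes start with the UTF-8 BOM, exactly what a BOM-aware reader must tolerate.
assert buffer.getvalue().startswith(b"\xef\xbb\xbf")
# Decoding with utf-8-sig (the reading side of this PR) gives back a clean header.
assert buffer.getvalue().decode("utf-8-sig").strip() == "filename"
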
21 changes: 19 additions & 2 deletions isic/ingest/tests/test_metadata.py
@@ -8,6 +8,7 @@
 import pytest

 from isic.ingest.models.accession import Accession
+from isic.ingest.models.metadata_file import MetadataFile
 from isic.ingest.services.accession.review import accession_review_update_or_create
 from isic.ingest.tasks import update_metadata_task
 from isic.ingest.tests.csv_streams import StreamWriter
@@ -122,19 +123,35 @@ def metadatafile_without_filename_column(
     )


+@pytest.fixture()
+def metadatafile_bom_filename_column(cohort, metadata_file_factory, csv_stream_bom_filename_column):
+    return metadata_file_factory(
+        blob__from_func=lambda: csv_stream_bom_filename_column, cohort=cohort
+    )
+
+
 @pytest.fixture()
 def metadatafile_duplicate_filenames(cohort, metadata_file_factory, csv_stream_duplicate_filenames):
     return metadata_file_factory(
         blob__from_func=lambda: csv_stream_duplicate_filenames, cohort=cohort
     )


+@pytest.mark.django_db()
+def test_validate_metadata_step1_ignores_bom(metadatafile_bom_filename_column):
+    problems = validate_csv_format_and_filenames(
+        MetadataFile.to_dict_reader(metadatafile_bom_filename_column.blob.open("rb")),
+        metadatafile_bom_filename_column.cohort,
+    )
+    assert not problems
+
+
 @pytest.mark.django_db()
 def test_validate_metadata_step1_requires_filename_column(
     metadatafile_without_filename_column,
 ):
     problems = validate_csv_format_and_filenames(
-        metadatafile_without_filename_column.to_iterable()[1],
+        MetadataFile.to_dict_reader(metadatafile_without_filename_column.blob.open("rb")),
         metadatafile_without_filename_column.cohort,
     )
     assert len(problems) == 1
@@ -146,7 +163,7 @@ def test_validate_metadata_step1_has_duplicate_filenames(
     metadatafile_duplicate_filenames,
 ):
     problems = validate_csv_format_and_filenames(
-        metadatafile_duplicate_filenames.to_iterable()[1],
+        MetadataFile.to_dict_reader(metadatafile_duplicate_filenames.blob.open("rb")),
         metadatafile_duplicate_filenames.cohort,
     )
     assert len(problems) == 2
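
The new test_validate_metadata_step1_ignores_bom feeds in a header-only CSV, so beyond the BOM handling it also exercises the empty-rows path: the reader reports a clean "filename" column but yields no rows, which is what made the Counter guard in isic/ingest/utils/metadata.py (below) necessary. A standalone sketch of that shape, with invented bytes standing in for the fixture:

import csv
import io

# A BOM followed by a header and no data rows, like csv_stream_bom_filename_column produces.
raw = io.BytesIO("\ufefffilename\r\n".encode("utf-8"))
reader = csv.DictReader(io.TextIOWrapper(raw, encoding="utf-8-sig"))

assert reader.fieldnames == ["filename"]  # BOM stripped, so the column check passes
assert list(reader) == []                 # ...but there are no rows to collect filenames from
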
7 changes: 0 additions & 7 deletions isic/ingest/tests/test_metadata_file.py

This file was deleted.

2 changes: 1 addition & 1 deletion isic/ingest/utils/metadata.py
@@ -42,7 +42,7 @@ def validate_csv_format_and_filenames(

     filenames.update(row["filename"] for row in rows)

-    if filenames.most_common(1)[0][1] > 1:
+    if filenames and filenames.most_common(1)[0][1] > 1:
         problems.append(
             Problem(
                 message="Duplicate filenames found.",
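
The added "filenames and" guard matters because Counter.most_common(1) returns an empty list when no filenames were collected, so indexing [0] raised IndexError for a header-only CSV like the BOM fixture above. A quick illustration:

from collections import Counter

filenames = Counter()                  # no data rows were parsed from the CSV
assert filenames.most_common(1) == []  # nothing to index into

try:
    filenames.most_common(1)[0][1]     # the old, unguarded expression
except IndexError:
    pass                               # this is the crash the guard prevents

filenames.update(["a.jpg", "a.jpg", "b.jpg"])
if filenames and filenames.most_common(1)[0][1] > 1:
    print("Duplicate filenames found.")  # a.jpg appears twice, so the problem is reported
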
