diff --git a/isic/ingest/models/metadata_file.py b/isic/ingest/models/metadata_file.py
index 5b2c5fad..a4fb51db 100644
--- a/isic/ingest/models/metadata_file.py
+++ b/isic/ingest/models/metadata_file.py
@@ -1,7 +1,7 @@
 import csv
+import io
 
 from django.contrib.auth.models import User
-from django.core.files.base import File
 from django.core.validators import FileExtensionValidator
 from django.db import models
 from django.db.models.query import QuerySet
@@ -26,9 +26,9 @@ class MetadataFile(CreationSortedTimeStampedModel):
     def __str__(self) -> str:
         return self.blob_name
 
-    def to_iterable(self) -> tuple[File, csv.DictReader]:
-        with self.blob.open("r") as blob:
-            return blob, csv.DictReader(blob)
+    @staticmethod
+    def to_dict_reader(fh):
+        return csv.DictReader(io.TextIOWrapper(fh, encoding="utf-8-sig"))
 
 
 class MetadataFilePermissions:
diff --git a/isic/ingest/tasks.py b/isic/ingest/tasks.py
index 2c8da934..8c7cd44f 100644
--- a/isic/ingest/tasks.py
+++ b/isic/ingest/tasks.py
@@ -1,5 +1,4 @@
 from collections.abc import Iterable
-from csv import DictReader
 import itertools
 import time
 
@@ -104,24 +103,24 @@ def validate_metadata_task(metadata_file_pk: int):
     metadata_file = MetadataFile.objects.select_related("cohort").get(pk=metadata_file_pk)
 
     try:
-        fh, reader = metadata_file.to_iterable()
-        successful: bool = False
-        unstructured_columns: list[str] = get_unstructured_columns(reader.fieldnames)
-        csv_check: list[Problem] | None = None
-        internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None
-        archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None
+        with metadata_file.blob.open("rb") as fh:
+            reader = MetadataFile.to_dict_reader(fh)
+            successful: bool = False
+            unstructured_columns: list[str] = get_unstructured_columns(reader.fieldnames)
+            csv_check: list[Problem] | None = None
+            internal_check: tuple[ColumnRowErrors, list[Problem]] | None = None
+            archive_check: tuple[ColumnRowErrors, list[Problem]] | None = None
 
-        with metadata_file.blob.open("r") as fh:
-            csv_check = validate_csv_format_and_filenames(DictReader(fh), metadata_file.cohort)
+            csv_check = validate_csv_format_and_filenames(reader, metadata_file.cohort)
             if not any(csv_check):
                 fh.seek(0)
-                internal_check = validate_internal_consistency(DictReader(fh))
+                internal_check = validate_internal_consistency(MetadataFile.to_dict_reader(fh))
 
                 if not any(internal_check):
                     fh.seek(0)
                     archive_check = validate_archive_consistency(
-                        DictReader(fh), metadata_file.cohort
+                        MetadataFile.to_dict_reader(fh), metadata_file.cohort
                     )
 
                     if not any(archive_check):
@@ -159,22 +158,23 @@ def update_metadata_task(user_pk: int, metadata_file_pk: int):
         (_ for _ in Patient.objects.select_for_update().all())
         (_ for _ in RcmCase.objects.select_for_update().all())
 
-        _, rows = metadata_file.to_iterable()
-
-        for chunk in itertools.batched(rows, 1_000):
-            accessions: QuerySet[Accession] = metadata_file.cohort.accessions.select_related(
-                "image", "review", "lesion", "patient", "rcm_case", "cohort"
-            ).filter(original_blob_name__in=[row["filename"] for row in chunk])
-            accessions_by_filename: dict[str, Accession] = {
-                accession.original_blob_name: accession for accession in accessions
-            }
-
-            for row in chunk:
-                # filename doesn't need to be stored in the metadata since it's equal to
-                # original_blob_name
-                accession = accessions_by_filename[row["filename"]]
-                del row["filename"]
-                accession.update_metadata(user, row)
+        with metadata_file.blob.open("rb") as blob:
+            rows = MetadataFile.to_dict_reader(blob)
+
+            for chunk in itertools.batched(rows, 1_000):
+                accessions: QuerySet[Accession] = metadata_file.cohort.accessions.select_related(
+                    "image", "review", "lesion", "patient", "rcm_case", "cohort"
+                ).filter(original_blob_name__in=[row["filename"] for row in chunk])
+                accessions_by_filename: dict[str, Accession] = {
+                    accession.original_blob_name: accession for accession in accessions
+                }
+
+                for row in chunk:
+                    # filename doesn't need to be stored in the metadata since it's equal to
+                    # original_blob_name
+                    accession = accessions_by_filename[row["filename"]]
+                    del row["filename"]
+                    accession.update_metadata(user, row)
 
 
 @shared_task(soft_time_limit=3600, time_limit=3660)
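# ---------------------------------------------------------------------------
# Sketch (illustration only, not part of the patch): validate_metadata_task
# above rewinds the binary handle and re-wraps it for each validation pass.
# The same seek(0)-and-rewrap pattern with plain io/csv objects (names are
# illustrative):

import csv
import io

fh = io.BytesIO(b"filename\na.jpg\nb.jpg\n")

first_pass = csv.DictReader(io.TextIOWrapper(fh, encoding="utf-8-sig"))
assert [row["filename"] for row in first_pass] == ["a.jpg", "b.jpg"]

# Rewind the underlying binary stream and wrap it again for a second full
# read; first_pass stays referenced here, since a TextIOWrapper closes the
# underlying stream when it is garbage collected.
fh.seek(0)
second_pass = csv.DictReader(io.TextIOWrapper(fh, encoding="utf-8-sig"))
assert [row["filename"] for row in second_pass] == ["a.jpg", "b.jpg"]
# ---------------------------------------------------------------------------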
diff --git a/isic/ingest/tests/conftest.py b/isic/ingest/tests/conftest.py
index bf2ec3b3..18e869bc 100644
--- a/isic/ingest/tests/conftest.py
+++ b/isic/ingest/tests/conftest.py
@@ -1,6 +1,7 @@
 import pytest
 
 from .csv_streams import (
+    csv_stream_bom_filename_column,
    csv_stream_duplicate_filenames,
     csv_stream_valid,
     csv_stream_without_filename_column,
@@ -13,4 +14,5 @@
 csv_stream_valid = pytest.fixture(csv_stream_valid)
 csv_stream_without_filename_column = pytest.fixture(csv_stream_without_filename_column)
+csv_stream_bom_filename_column = pytest.fixture(csv_stream_bom_filename_column)
 csv_stream_duplicate_filenames = pytest.fixture(csv_stream_duplicate_filenames)
diff --git a/isic/ingest/tests/csv_streams.py b/isic/ingest/tests/csv_streams.py
index 06bba309..dd8a535b 100644
--- a/isic/ingest/tests/csv_streams.py
+++ b/isic/ingest/tests/csv_streams.py
@@ -27,6 +27,13 @@ def csv_stream_without_filename_column() -> BinaryIO:
     return file_stream
 
 
+def csv_stream_bom_filename_column() -> BinaryIO:
+    file_stream = StreamWriter(io.BytesIO())
+    writer = csv.DictWriter(file_stream, fieldnames=["\ufefffilename"])
+    writer.writeheader()
+    return file_stream
+
+
 def csv_stream_duplicate_filenames() -> BinaryIO:
     file_stream = StreamWriter(io.BytesIO())
     writer = csv.DictWriter(file_stream, fieldnames=["filename"])
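# ---------------------------------------------------------------------------
# Sketch (illustration only, not part of the patch): why to_dict_reader uses
# "utf-8-sig". A UTF-8 BOM, as produced by the fixture above, stays attached
# to the first fieldname under plain "utf-8" but is stripped by "utf-8-sig"
# (names are illustrative):

import csv
import io

bom_csv = b"\xef\xbb\xbffilename\nfoo.jpg\n"  # BOM bytes precede the header

# Plain "utf-8" decodes the BOM into the first header name, so a lookup of
# the "filename" column fails.
plain = csv.DictReader(io.TextIOWrapper(io.BytesIO(bom_csv), encoding="utf-8"))
assert plain.fieldnames == ["\ufefffilename"]

# "utf-8-sig" strips a leading BOM and decodes BOM-less input identically,
# so the header parses as expected.
sig = csv.DictReader(io.TextIOWrapper(io.BytesIO(bom_csv), encoding="utf-8-sig"))
assert sig.fieldnames == ["filename"]
assert next(sig) == {"filename": "foo.jpg"}
# ---------------------------------------------------------------------------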
diff --git a/isic/ingest/tests/test_metadata.py b/isic/ingest/tests/test_metadata.py
index 7cf95e86..e2f2c9de 100644
--- a/isic/ingest/tests/test_metadata.py
+++ b/isic/ingest/tests/test_metadata.py
@@ -8,6 +8,7 @@
 import pytest
 
 from isic.ingest.models.accession import Accession
+from isic.ingest.models.metadata_file import MetadataFile
 from isic.ingest.services.accession.review import accession_review_update_or_create
 from isic.ingest.tasks import update_metadata_task
 from isic.ingest.tests.csv_streams import StreamWriter
@@ -122,6 +123,13 @@ def metadatafile_without_filename_column(
     )
 
 
+@pytest.fixture()
+def metadatafile_bom_filename_column(cohort, metadata_file_factory, csv_stream_bom_filename_column):
+    return metadata_file_factory(
+        blob__from_func=lambda: csv_stream_bom_filename_column, cohort=cohort
+    )
+
+
 @pytest.fixture()
 def metadatafile_duplicate_filenames(cohort, metadata_file_factory, csv_stream_duplicate_filenames):
     return metadata_file_factory(
@@ -129,12 +137,21 @@ def metadatafile_duplicate_filenames(cohort, metadata_file_factory, csv_stream_d
     )
 
 
+@pytest.mark.django_db()
+def test_validate_metadata_step1_ignores_bom(metadatafile_bom_filename_column):
+    problems = validate_csv_format_and_filenames(
+        MetadataFile.to_dict_reader(metadatafile_bom_filename_column.blob.open("rb")),
+        metadatafile_bom_filename_column.cohort,
+    )
+    assert not problems
+
+
 @pytest.mark.django_db()
 def test_validate_metadata_step1_requires_filename_column(
     metadatafile_without_filename_column,
 ):
     problems = validate_csv_format_and_filenames(
-        metadatafile_without_filename_column.to_iterable()[1],
+        MetadataFile.to_dict_reader(metadatafile_without_filename_column.blob.open("rb")),
         metadatafile_without_filename_column.cohort,
     )
     assert len(problems) == 1
@@ -146,7 +163,7 @@ def test_validate_metadata_step1_has_duplicate_filenames(
     metadatafile_duplicate_filenames,
 ):
     problems = validate_csv_format_and_filenames(
-        metadatafile_duplicate_filenames.to_iterable()[1],
+        MetadataFile.to_dict_reader(metadatafile_duplicate_filenames.blob.open("rb")),
         metadatafile_duplicate_filenames.cohort,
     )
     assert len(problems) == 2
diff --git a/isic/ingest/tests/test_metadata_file.py b/isic/ingest/tests/test_metadata_file.py
deleted file mode 100644
index 17b0ceff..00000000
--- a/isic/ingest/tests/test_metadata_file.py
+++ /dev/null
@@ -1,7 +0,0 @@
-import pytest
-
-
-@pytest.mark.django_db()
-def test_metadata_file_to_iterable(metadata_file):
-    _, reader = metadata_file.to_iterable()
-    next(reader)
diff --git a/isic/ingest/utils/metadata.py b/isic/ingest/utils/metadata.py
index fc146a02..af609ae1 100644
--- a/isic/ingest/utils/metadata.py
+++ b/isic/ingest/utils/metadata.py
@@ -42,7 +42,7 @@ def validate_csv_format_and_filenames(
 
         filenames.update(row["filename"] for row in rows)
 
-    if filenames.most_common(1)[0][1] > 1:
+    if filenames and filenames.most_common(1)[0][1] > 1:
         problems.append(
             Problem(
                 message="Duplicate filenames found.",
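# ---------------------------------------------------------------------------
# Sketch (illustration only, not part of the patch): the guard added in
# utils/metadata.py. Counter().most_common(1) is an empty list, so indexing
# [0] raised IndexError whenever the CSV had no data rows; "filenames and"
# short-circuits that case while still catching duplicates:

from collections import Counter

filenames = Counter()
assert filenames.most_common(1) == []  # nothing to index into
assert not (filenames and filenames.most_common(1)[0][1] > 1)  # short-circuits safely

filenames.update(["a.jpg", "a.jpg", "b.jpg"])
assert filenames and filenames.most_common(1)[0][1] > 1  # duplicates still caught
# ---------------------------------------------------------------------------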