Skip to content

Commit

Permalink
BF: use total zarr_size to compute done% for zarr
Browse files Browse the repository at this point in the history
Since maxsize is dynamically computed as we go through the files.
The idea, I guess, was that it would grow rapidly before actual
downloads commense but it is not the case, so we endup with done%
being always close to 100% since we get those reports on final
downloads completed close to when individual files are downloaded.

So this should close #1407 .

But for total zarr file to be used, we needed to account also for
skipped files.  I added reporting of sizes for skipped files as well.
It seems there is no negative side effect on regular files download.
So now for the %done of zarr we might be getting to 100% of original
size having downloaded nothing.  But IMHO it is ok since user does
not care as much of how many "subparts" are downloaded, but rather
to have adequate progress report back.

There also could be side effects if  -e skip  and we skip
download of some updated files which would be smaller than the local
ones so altogether we would get over 100% total at the end.
  • Loading branch information
yarikoptic committed May 14, 2024
1 parent c0df9af commit e6d71bd
Showing 1 changed file with 24 additions and 10 deletions.
34 changes: 24 additions & 10 deletions dandi/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,8 +481,8 @@ def agg_done(self, done_sizes: Iterator[int]) -> list[str]:
return v


def _skip_file(msg: Any) -> dict:
return {"status": "skipped", "message": str(msg)}
def _skip_file(msg: Any, **kwargs) -> dict:
return dict(**kwargs, status="skipped", message=str(msg))

Check warning on line 485 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L485

Added line #L485 was not covered by tests


def _populate_dandiset_yaml(
Expand Down Expand Up @@ -514,7 +514,7 @@ def _populate_dandiset_yaml(
existing is DownloadExisting.REFRESH
and os.lstat(dandiset_yaml).st_mtime >= mtime.timestamp()
):
yield _skip_file("already exists")
yield _skip_file("already exists", size=os.lstat(dandiset_yaml).st_mtime)

Check warning on line 517 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L517

Added line #L517 was not covered by tests
return
ds = Dandiset(dandiset_path, allow_empty=True)
ds.path_obj.mkdir(exist_ok=True) # exist_ok in case of parallel race
Expand Down Expand Up @@ -637,7 +637,7 @@ def _download_file(
# but mtime is different
if same == ["mtime", "size"]:
# TODO: add recording and handling of .nwb object_id
yield _skip_file("same time and size")
yield _skip_file("same time and size", size=size)

Check warning on line 640 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L640

Added line #L640 was not covered by tests
return
lgr.debug(f"{path!r} - same attributes: {same}. Redownloading")

Expand Down Expand Up @@ -1028,11 +1028,17 @@ def get_done(self) -> dict:
total_downloaded = sum(
s.downloaded
for s in self.files.values()
if s.state in (DLState.DOWNLOADING, DLState.CHECKSUM_ERROR, DLState.DONE)
if s.state
in (
DLState.DOWNLOADING,
DLState.CHECKSUM_ERROR,
DLState.SKIPPED,
DLState.DONE,
)
)
return {
"done": total_downloaded,
"done%": total_downloaded / self.maxsize * 100,
"done%": total_downloaded / self.zarr_size * 100,
}

def set_status(self, statusdict: dict) -> None:
Expand Down Expand Up @@ -1061,16 +1067,24 @@ def set_status(self, statusdict: dict) -> None:
def feed(self, path: str, status: dict) -> Iterator[dict]:
keys = list(status.keys())
self.files.setdefault(path, DownloadProgress())
size = status.get("size")
if size is not None:
if not self.yielded_size:
# this thread will yield
self.yielded_size = True
yield {"size": self.zarr_size}
if status.get("status") == "skipped":
self.files[path].state = DLState.SKIPPED
out = {"message": self.message}
# Treat skipped as "downloaded" for the matter of accounting
if size is not None:
self.files[path].downloaded = size
self.maxsize += size

Check warning on line 1082 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L1080-L1082

Added lines #L1080 - L1082 were not covered by tests
self.set_status(out)
yield out
yield self.get_done()

Check warning on line 1085 in dandi/download.py

View check run for this annotation

Codecov / codecov/patch

dandi/download.py#L1085

Added line #L1085 was not covered by tests
elif keys == ["size"]:
if not self.yielded_size:
yield {"size": self.zarr_size}
self.yielded_size = True
self.files[path].size = status["size"]
self.files[path].size = size
self.maxsize += status["size"]
if any(s.state is DLState.DOWNLOADING for s in self.files.values()):
yield self.get_done()
Expand Down

0 comments on commit e6d71bd

Please sign in to comment.