From 5a18399486aba4398538b243438d99b77957f928 Mon Sep 17 00:00:00 2001 From: Sergey Serebryakov Date: Sun, 14 Jun 2020 04:00:20 -0700 Subject: [PATCH 1/2] Fix GCSFS walk() method --- petastorm/gcsfs_helpers/gcsfs_wrapper.py | 38 ++++++++++-------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/petastorm/gcsfs_helpers/gcsfs_wrapper.py b/petastorm/gcsfs_helpers/gcsfs_wrapper.py index 119cb39f0..ea5845fb4 100644 --- a/petastorm/gcsfs_helpers/gcsfs_wrapper.py +++ b/petastorm/gcsfs_helpers/gcsfs_wrapper.py @@ -12,10 +12,7 @@ def isdir(self, path): path = norm_path(_stringify_path(path)) try: contents = self.fs.ls(path) - if len(contents) == 1 and contents[0] == path: - return False - else: - return True + return not(len(contents) == 1 and contents[0] == path) except OSError: return False @@ -41,26 +38,23 @@ def walk(self, path): directories = set() files = set() - for key in self.fs.ls(path, detail=True): + for obj in self.fs.ls(path, detail=True): # each info name must be at least [path]/part , but here # we check also for names like [path]/part/ - path = key['name'] - if key['storageClass'] == 'DIRECTORY': - if path.endswith('/'): - directories.add(path[:-1]) - else: - directories.add(path) - elif key['storageClass'] == 'BUCKET': - pass - else: - files.add(path) - - files = sorted([posixpath.split(f)[1] for f in files - if f not in directories]) - directories = sorted([posixpath.split(x)[1] - for x in directories]) - - yield path, directories, files + obj_path = obj['name'] + if obj_path == path: + continue + if obj['type'] == 'directory': + directories.add(obj_path) + elif obj['type'] == 'file': + files.add(obj_path) + + rel_files = sorted([posixpath.split(f)[1] for f in files + if f not in directories]) + rel_directories = sorted([posixpath.split(x[:-1])[1] + for x in directories]) + + yield path, rel_directories, rel_files for directory in directories: for tup in self.walk(directory): From b87919b743f52aaa6940a6cf19583935a74744f7 Mon Sep 17 00:00:00 2001 From: Sergey Serebryakov Date: Tue, 23 Jun 2020 02:30:17 -0700 Subject: [PATCH 2/2] [gcsfs] Consider trailing slash when skipping directories --- petastorm/gcsfs_helpers/gcsfs_wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/petastorm/gcsfs_helpers/gcsfs_wrapper.py b/petastorm/gcsfs_helpers/gcsfs_wrapper.py index ea5845fb4..5f1c220c2 100644 --- a/petastorm/gcsfs_helpers/gcsfs_wrapper.py +++ b/petastorm/gcsfs_helpers/gcsfs_wrapper.py @@ -42,7 +42,7 @@ def walk(self, path): # each info name must be at least [path]/part , but here # we check also for names like [path]/part/ obj_path = obj['name'] - if obj_path == path: + if obj_path.strip('/') == path.strip('/'): continue if obj['type'] == 'directory': directories.add(obj_path)