From 3412d909a87b3290cbe2f1fcdbdeebbc9dfcdc38 Mon Sep 17 00:00:00 2001 From: JoranDox Date: Thu, 17 Feb 2022 14:39:31 +0100 Subject: [PATCH] added fsagnosticglob function The current glob function is unusably slow when used with a path like `datasetname/parquet/brandcode=*/application=*/year=2022/month=02/day=14/hour=05/*`. In our case such a pattern should only need to list the files for each brandcode and application (both manageably short lists), but glob instead walks every year, month, day, and hour directory before filtering. The new function filters at each path component instead. --- azure/datalake/store/core.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/azure/datalake/store/core.py b/azure/datalake/store/core.py index b4a86d8..e47bd2c 100644 --- a/azure/datalake/store/core.py +++ b/azure/datalake/store/core.py @@ -38,6 +38,23 @@ logger = logging.getLogger(__name__) valid_expire_types = [x.value for x in ExpiryOptionType] +def fsagnosticglob(fs, path, pathtype, prefix=""): + if "//" in path: + path = path.split("//")[-1] + paths = [prefix] + for part in path.strip("/").split("/"): + newpaths = [] + for _prefix in paths: + checkpath = os.path.join(_prefix,part) + if "*" in part: + potentialpaths = fs.ls(_prefix) + for p in potentialpaths: + if pathtype(p).match(checkpath): + newpaths.append(p) + else: + newpaths.append(checkpath) + paths = newpaths + return paths class AzureDLFileSystem(object): """ @@ -309,14 +326,11 @@ def glob(self, path, details=False, invalidate_cache=True): ------- List of files """ - - path = AzureDLPath(path).trim() - path_as_posix = path.as_posix() - prefix = path.globless_prefix - allfiles = self.walk(prefix, details, invalidate_cache) - if prefix == path: - return allfiles - return [f for f in allfiles if AzureDLPath(f['name'] if details else f).match(path_as_posix)] + return fsagnosticglob( + self, + path, + AzureDLPath, + ) def du(self, path, total=False, deep=False, invalidate_cache=True): """