From 7ff95207191413e27cddeb902a4b28c4add327ba Mon Sep 17 00:00:00 2001 From: Lubomir Bucek <lubomir.bucek@eox.at> Date: Mon, 1 Feb 2021 17:06:22 +0100 Subject: [PATCH] removing duplicates from filelists preprocessor --- preprocessor/preprocessor/steps/georeference.py | 7 ++----- preprocessor/preprocessor/steps/output.py | 8 ++------ preprocessor/preprocessor/steps/stack.py | 10 +++------- preprocessor/preprocessor/steps/subdataset.py | 12 ++++-------- preprocessor/preprocessor/util.py | 14 +++++++++++++- 5 files changed, 24 insertions(+), 27 deletions(-) diff --git a/preprocessor/preprocessor/steps/georeference.py b/preprocessor/preprocessor/steps/georeference.py index afadc7bf..ebe8b1ae 100644 --- a/preprocessor/preprocessor/steps/georeference.py +++ b/preprocessor/preprocessor/steps/georeference.py @@ -5,7 +5,7 @@ from glob import glob import shutil from typing import List, Tuple -from ..util import gdal, osr, replace_ext +from ..util import gdal, osr, replace_ext, get_all_data_files logger = logging.getLogger(__name__) @@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce else: raise Exception('Invalid georeference type %s' % type_name) try: - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) for filename in filenames: target_filename = join(target_dir, basename(filename)) georef_func(filename, target_filename, **opts_dict) diff --git a/preprocessor/preprocessor/steps/output.py b/preprocessor/preprocessor/steps/output.py index efc34ba9..f7e035f6 100644 --- a/preprocessor/preprocessor/steps/output.py +++ b/preprocessor/preprocessor/steps/output.py @@ -1,9 +1,8 @@ import os from os.path import join, basename from uuid import uuid4 -from glob import glob -from ..util import replace_ext, gdal 
+from ..util import replace_ext, gdal, get_all_data_files import logging logger = logging.getLogger(__name__) @@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0] # warp each individual file warped_files = [] - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) for filename in filenames: target_filename = join(target_dir, replace_ext(basename(filename), extension)) logger.debug('Warping file %s' % filename) diff --git a/preprocessor/preprocessor/steps/stack.py b/preprocessor/preprocessor/steps/stack.py index e7086540..55d3eff5 100644 --- a/preprocessor/preprocessor/steps/stack.py +++ b/preprocessor/preprocessor/steps/stack.py @@ -1,20 +1,16 @@ import os -from os.path import basename, join, splitext +from os.path import basename, join from itertools import groupby import re -from glob import glob from typing import List -from ..util import replace_ext, gdal +from ..util import replace_ext, gdal, get_all_data_files def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None): """ Stack bands of the individual images """ - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) # check if we have a group_by regex. If yes, use the first # re-group to group by. 
# Fallback is basename of file as groupname diff --git a/preprocessor/preprocessor/steps/subdataset.py b/preprocessor/preprocessor/steps/subdataset.py index 6fe05bcb..552bd29f 100644 --- a/preprocessor/preprocessor/steps/subdataset.py +++ b/preprocessor/preprocessor/steps/subdataset.py @@ -1,20 +1,16 @@ import os -from os.path import join, splitext, basename, dirname, isdir -from glob import glob +from os.path import join, basename from typing import Dict -from ..util import replace_ext, gdal +from ..util import replace_ext, gdal, get_all_data_files def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None): - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) if len(filenames) == 0: raise Exception('No datafiles were matched by the provided glob') - for filename in datafiles: + for filename in filenames: extract_subdatasets( filename, target_dir, diff --git a/preprocessor/preprocessor/util.py b/preprocessor/preprocessor/util.py index d668037a..bcccb149 100644 --- a/preprocessor/preprocessor/util.py +++ b/preprocessor/preprocessor/util.py @@ -1,8 +1,9 @@ import os -from os.path import splitext +from os.path import splitext, join from contextlib import contextmanager from tempfile import TemporaryDirectory, mkdtemp from time import time +from glob import glob try: from osgeo import gdal @@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit): """ Get size of file at given path in bytes""" size = os.path.getsize(file_path) return convert_unit(size, unit) + + +def get_all_data_files(source_dir, preprocessor_config): + """ Based on 'data_file_globs' configuration, gets all unique data file paths from folder""" + file_paths = [] + for dataglob in 
preprocessor_config.get('data_file_globs', '*'): + for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: + file_paths.append(p) + # keep only unique file paths, preserving order, in case overlapping globs match the same file more than once + file_paths = list(dict.fromkeys(file_paths)) + return file_paths -- GitLab