diff --git a/preprocessor/preprocessor/steps/georeference.py b/preprocessor/preprocessor/steps/georeference.py index afadc7bf1d863676eb39c5729737affd38aea85b..ebe8b1aea9a18f72caa51df7e9cb8d75ca9047aa 100644 --- a/preprocessor/preprocessor/steps/georeference.py +++ b/preprocessor/preprocessor/steps/georeference.py @@ -5,7 +5,7 @@ from glob import glob import shutil from typing import List, Tuple -from ..util import gdal, osr, replace_ext +from ..util import gdal, osr, replace_ext, get_all_data_files logger = logging.getLogger(__name__) @@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce else: raise Exception('Invalid georeference type %s' % type_name) try: - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) for filename in filenames: target_filename = join(target_dir, basename(filename)) georef_func(filename, target_filename, **opts_dict) diff --git a/preprocessor/preprocessor/steps/output.py b/preprocessor/preprocessor/steps/output.py index efc34ba92c123ce446d508d7a3a4a4d9ff38c41d..f7e035f6182422f63ec0c1388e8747cd32bfc3c0 100644 --- a/preprocessor/preprocessor/steps/output.py +++ b/preprocessor/preprocessor/steps/output.py @@ -1,9 +1,8 @@ import os from os.path import join, basename from uuid import uuid4 -from glob import glob -from ..util import replace_ext, gdal +from ..util import replace_ext, gdal, get_all_data_files import logging logger = logging.getLogger(__name__) @@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0] # warp each individual file warped_files = [] - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path 
for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) for filename in filenames: target_filename = join(target_dir, replace_ext(basename(filename), extension)) logger.debug('Warping file %s' % filename) diff --git a/preprocessor/preprocessor/steps/stack.py b/preprocessor/preprocessor/steps/stack.py index e70865405db9b8651ab95cd5d745dd91ba95cdb4..55d3eff53be2339e4e202da7221227acabee9e41 100644 --- a/preprocessor/preprocessor/steps/stack.py +++ b/preprocessor/preprocessor/steps/stack.py @@ -1,20 +1,16 @@ import os -from os.path import basename, join, splitext +from os.path import basename, join from itertools import groupby import re -from glob import glob from typing import List -from ..util import replace_ext, gdal +from ..util import replace_ext, gdal, get_all_data_files def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None): """ Stack bands of the individual images """ - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) # check if we have a group_by regex. If yes, use the first # re-group to group by. 
# Fallback is basename of file as groupname diff --git a/preprocessor/preprocessor/steps/subdataset.py b/preprocessor/preprocessor/steps/subdataset.py index 6fe05bcb9c52a14fd1076ceede30fc829edfb8a8..552bd29fd0eec0aeeb056afc9bbd8785e497a8ba 100644 --- a/preprocessor/preprocessor/steps/subdataset.py +++ b/preprocessor/preprocessor/steps/subdataset.py @@ -1,20 +1,16 @@ import os -from os.path import join, splitext, basename, dirname, isdir -from glob import glob +from os.path import join, basename from typing import Dict -from ..util import replace_ext, gdal +from ..util import replace_ext, gdal, get_all_data_files def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None): - filenames = [] - for dataglob in preprocessor_config.get('data_file_globs', '*'): - for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not isdir(path)]: - filenames.append(p) + filenames = get_all_data_files(source_dir, preprocessor_config) if len(filenames) == 0: raise Exception('No datafiles were matched by the provided glob') - for filename in datafiles: + for filename in filenames: extract_subdatasets( filename, target_dir, diff --git a/preprocessor/preprocessor/util.py b/preprocessor/preprocessor/util.py index d668037a2d32044808ca3092985e759a9f9a9b6c..bcccb149ff62034e5b8d9a172fe82192321ab98b 100644 --- a/preprocessor/preprocessor/util.py +++ b/preprocessor/preprocessor/util.py @@ -1,8 +1,9 @@ import os -from os.path import splitext +from os.path import splitext, join from contextlib import contextmanager from tempfile import TemporaryDirectory, mkdtemp from time import time +from glob import glob try: from osgeo import gdal @@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit): """ Get size of file at given path in bytes""" size = os.path.getsize(file_path) return convert_unit(size, unit) + + +def get_all_data_files(source_dir, preprocessor_config): + """ Based on 
'data_file_globs' configuration, collects all unique data file paths from the given folder"""
+    file_paths = []
+    for dataglob in preprocessor_config.get('data_file_globs', '*'):
+        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
+            file_paths.append(p)
+    # deduplicate while preserving order, in case overlapping globs matched the same file more than once
+    file_paths = list(dict.fromkeys(file_paths))
+    return file_paths