From 7ff95207191413e27cddeb902a4b28c4add327ba Mon Sep 17 00:00:00 2001
From: Lubomir Bucek <lubomir.bucek@eox.at>
Date: Mon, 1 Feb 2021 17:06:22 +0100
Subject: [PATCH] removing duplicates from filelists preprocessor

---
 preprocessor/preprocessor/steps/georeference.py |  7 ++-----
 preprocessor/preprocessor/steps/output.py       |  8 ++------
 preprocessor/preprocessor/steps/stack.py        | 10 +++-------
 preprocessor/preprocessor/steps/subdataset.py   | 12 ++++--------
 preprocessor/preprocessor/util.py               | 14 +++++++++++++-
 5 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/preprocessor/preprocessor/steps/georeference.py b/preprocessor/preprocessor/steps/georeference.py
index afadc7bf..ebe8b1ae 100644
--- a/preprocessor/preprocessor/steps/georeference.py
+++ b/preprocessor/preprocessor/steps/georeference.py
@@ -5,7 +5,7 @@ from glob import glob
 import shutil
 from typing import List, Tuple
 
-from ..util import gdal, osr, replace_ext
+from ..util import gdal, osr, replace_ext, get_all_data_files
 
 
 logger = logging.getLogger(__name__)
@@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce
         else:
             raise Exception('Invalid georeference type %s' % type_name)
         try:
-            filenames = []
-            for dataglob in preprocessor_config.get('data_file_globs', '*'):
-                for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-                    filenames.append(p)
+            filenames = get_all_data_files(source_dir, preprocessor_config)
             for filename in filenames:
                 target_filename = join(target_dir, basename(filename))
                 georef_func(filename, target_filename, **opts_dict)
diff --git a/preprocessor/preprocessor/steps/output.py b/preprocessor/preprocessor/steps/output.py
index efc34ba9..f7e035f6 100644
--- a/preprocessor/preprocessor/steps/output.py
+++ b/preprocessor/preprocessor/steps/output.py
@@ -1,9 +1,8 @@
 import os
 from os.path import join, basename
 from uuid import uuid4
-from glob import glob
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 import logging
 
 logger = logging.getLogger(__name__)
@@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c
     extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0]
     # warp each individual file
     warped_files = []
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     for filename in filenames:
         target_filename = join(target_dir, replace_ext(basename(filename), extension))
         logger.debug('Warping file %s' % filename)
diff --git a/preprocessor/preprocessor/steps/stack.py b/preprocessor/preprocessor/steps/stack.py
index e7086540..55d3eff5 100644
--- a/preprocessor/preprocessor/steps/stack.py
+++ b/preprocessor/preprocessor/steps/stack.py
@@ -1,20 +1,16 @@
 import os
-from os.path import basename, join, splitext
+from os.path import basename, join
 from itertools import groupby
 import re
-from glob import glob
 from typing import List
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 
 
 def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None):
     """ Stack bands of the individual images
     """
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     # check if we have a group_by regex. If yes, use the first
     # re-group to group by.
     # Fallback is basename of file as groupname
diff --git a/preprocessor/preprocessor/steps/subdataset.py b/preprocessor/preprocessor/steps/subdataset.py
index 6fe05bcb..552bd29f 100644
--- a/preprocessor/preprocessor/steps/subdataset.py
+++ b/preprocessor/preprocessor/steps/subdataset.py
@@ -1,20 +1,16 @@
 import os
-from os.path import join, splitext, basename, dirname, isdir
-from glob import glob
+from os.path import join, basename
 from typing import Dict
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 
 
 def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None):
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     if len(filenames) == 0:
         raise Exception('No datafiles were matched by the provided glob')
 
-    for filename in datafiles:
+    for filename in filenames:
         extract_subdatasets(
             filename,
             target_dir,
diff --git a/preprocessor/preprocessor/util.py b/preprocessor/preprocessor/util.py
index d668037a..bcccb149 100644
--- a/preprocessor/preprocessor/util.py
+++ b/preprocessor/preprocessor/util.py
@@ -1,8 +1,9 @@
 import os
-from os.path import splitext
+from os.path import splitext, join
 from contextlib import contextmanager
 from tempfile import TemporaryDirectory, mkdtemp
 from time import time
+from glob import glob
 
 try:
     from osgeo import gdal
@@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit):
     """ Get size of file at given path in bytes"""
     size = os.path.getsize(file_path)
     return convert_unit(size, unit)
+
+
+def get_all_data_files(source_dir, preprocessor_config):
+    """ Return all unique data file paths in the folder matched by the 'data_file_globs' configuration """
+    file_paths = []
+    for dataglob in preprocessor_config.get('data_file_globs', '*'):
+        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
+            file_paths.append(p)
+    # de-duplicate while preserving order: overlapping globs can match the same file more than once
+    file_paths = list(dict.fromkeys(file_paths))
+    return file_paths
-- 
GitLab