diff --git a/preprocessor/preprocessor/steps/georeference.py b/preprocessor/preprocessor/steps/georeference.py
index afadc7bf1d863676eb39c5729737affd38aea85b..ebe8b1aea9a18f72caa51df7e9cb8d75ca9047aa 100644
--- a/preprocessor/preprocessor/steps/georeference.py
+++ b/preprocessor/preprocessor/steps/georeference.py
@@ -5,7 +5,7 @@ from glob import glob
 import shutil
 from typing import List, Tuple
 
-from ..util import gdal, osr, replace_ext
+from ..util import gdal, osr, replace_ext, get_all_data_files
 
 
 logger = logging.getLogger(__name__)
@@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce
         else:
             raise Exception('Invalid georeference type %s' % type_name)
         try:
-            filenames = []
-            for dataglob in preprocessor_config.get('data_file_globs', '*'):
-                for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-                    filenames.append(p)
+            filenames = get_all_data_files(source_dir, preprocessor_config)
             for filename in filenames:
                 target_filename = join(target_dir, basename(filename))
                 georef_func(filename, target_filename, **opts_dict)
diff --git a/preprocessor/preprocessor/steps/output.py b/preprocessor/preprocessor/steps/output.py
index efc34ba92c123ce446d508d7a3a4a4d9ff38c41d..f7e035f6182422f63ec0c1388e8747cd32bfc3c0 100644
--- a/preprocessor/preprocessor/steps/output.py
+++ b/preprocessor/preprocessor/steps/output.py
@@ -1,9 +1,8 @@
 import os
 from os.path import join, basename
 from uuid import uuid4
-from glob import glob
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 import logging
 
 logger = logging.getLogger(__name__)
@@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c
     extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0]
     # warp each individual file
     warped_files = []
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     for filename in filenames:
         target_filename = join(target_dir, replace_ext(basename(filename), extension))
         logger.debug('Warping file %s' % filename)
diff --git a/preprocessor/preprocessor/steps/stack.py b/preprocessor/preprocessor/steps/stack.py
index e70865405db9b8651ab95cd5d745dd91ba95cdb4..55d3eff53be2339e4e202da7221227acabee9e41 100644
--- a/preprocessor/preprocessor/steps/stack.py
+++ b/preprocessor/preprocessor/steps/stack.py
@@ -1,20 +1,16 @@
 import os
-from os.path import basename, join, splitext
+from os.path import basename, join
 from itertools import groupby
 import re
-from glob import glob
 from typing import List
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 
 
 def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None):
     """ Stack bands of the individual images
     """
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     # check if we have a group_by regex. If yes, use the first
     # re-group to group by.
     # Fallback is basename of file as groupname
diff --git a/preprocessor/preprocessor/steps/subdataset.py b/preprocessor/preprocessor/steps/subdataset.py
index 6fe05bcb9c52a14fd1076ceede30fc829edfb8a8..552bd29fd0eec0aeeb056afc9bbd8785e497a8ba 100644
--- a/preprocessor/preprocessor/steps/subdataset.py
+++ b/preprocessor/preprocessor/steps/subdataset.py
@@ -1,20 +1,16 @@
 import os
-from os.path import join, splitext, basename, dirname, isdir
-from glob import glob
+from os.path import join, basename
 from typing import Dict
 
-from ..util import replace_ext, gdal
+from ..util import replace_ext, gdal, get_all_data_files
 
 
 def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None):
-    filenames = []
-    for dataglob in preprocessor_config.get('data_file_globs', '*'):
-        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not isdir(path)]:
-            filenames.append(p)
+    filenames = get_all_data_files(source_dir, preprocessor_config)
     if len(filenames) == 0:
         raise Exception('No datafiles were matched by the provided glob')
 
-    for filename in datafiles:
+    for filename in filenames:
         extract_subdatasets(
             filename,
             target_dir,
diff --git a/preprocessor/preprocessor/util.py b/preprocessor/preprocessor/util.py
index d668037a2d32044808ca3092985e759a9f9a9b6c..bcccb149ff62034e5b8d9a172fe82192321ab98b 100644
--- a/preprocessor/preprocessor/util.py
+++ b/preprocessor/preprocessor/util.py
@@ -1,8 +1,9 @@
 import os
-from os.path import splitext
+from os.path import splitext, join
 from contextlib import contextmanager
 from tempfile import TemporaryDirectory, mkdtemp
 from time import time
+from glob import glob
 
 try:
     from osgeo import gdal
@@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit):
     """ Get size of file at given path in bytes"""
     size = os.path.getsize(file_path)
     return convert_unit(size, unit)
+
+
+def get_all_data_files(source_dir, preprocessor_config):
+    """ Based on 'data_file_globs' configuration, gets all unique data file paths from folder"""
+    file_paths = []
+    for dataglob in preprocessor_config.get('data_file_globs', '*'):
+        for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
+            file_paths.append(p)
+    # deduplicate while preserving order, since overlapping globs can match the same file more than once
+    file_paths = list(dict.fromkeys(file_paths))
+    return file_paths
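
A minimal usage sketch of the new helper, assuming the package is importable as preprocessor.util; the glob patterns and directory path below are placeholders, not values from the patch.

    from preprocessor.util import get_all_data_files

    # Hypothetical configuration: match GeoTIFF and JPEG 2000 files anywhere under the source tree.
    preprocessor_config = {
        'data_file_globs': ['*.tif', '*.jp2'],
    }

    # '/data/scene' is a placeholder path; the helper globs it recursively,
    # skips directories, and returns each matching file once, preserving glob order.
    filenames = get_all_data_files('/data/scene', preprocessor_config)
    for filename in filenames:
        print(filename)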