EOX GitLab Instance

Skip to content
Snippets Groups Projects
Commit 7ff95207 authored by Lubomir Dolezal's avatar Lubomir Dolezal
Browse files

removing duplicates from filelists preprocessor

parent b6b60ae4
No related branches found
No related tags found
2 merge requests!55Production release 1.2.0,!54Shib configs update
...@@ -5,7 +5,7 @@ from glob import glob ...@@ -5,7 +5,7 @@ from glob import glob
import shutil import shutil
from typing import List, Tuple from typing import List, Tuple
from ..util import gdal, osr, replace_ext from ..util import gdal, osr, replace_ext, get_all_data_files
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce ...@@ -31,10 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce
else: else:
raise Exception('Invalid georeference type %s' % type_name) raise Exception('Invalid georeference type %s' % type_name)
try: try:
filenames = [] filenames = get_all_data_files(source_dir, preprocessor_config)
for dataglob in preprocessor_config.get('data_file_globs', '*'):
for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
filenames.append(p)
for filename in filenames: for filename in filenames:
target_filename = join(target_dir, basename(filename)) target_filename = join(target_dir, basename(filename))
georef_func(filename, target_filename, **opts_dict) georef_func(filename, target_filename, **opts_dict)
......
import os import os
from os.path import join, basename from os.path import join, basename
from uuid import uuid4 from uuid import uuid4
from glob import glob
from ..util import replace_ext, gdal from ..util import replace_ext, gdal, get_all_data_files
import logging import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c ...@@ -19,10 +18,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c
extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0] extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0]
# warp each individual file # warp each individual file
warped_files = [] warped_files = []
filenames = [] filenames = get_all_data_files(source_dir, preprocessor_config)
for dataglob in preprocessor_config.get('data_file_globs', '*'):
for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
filenames.append(p)
for filename in filenames: for filename in filenames:
target_filename = join(target_dir, replace_ext(basename(filename), extension)) target_filename = join(target_dir, replace_ext(basename(filename), extension))
logger.debug('Warping file %s' % filename) logger.debug('Warping file %s' % filename)
......
import os import os
from os.path import basename, join, splitext from os.path import basename, join
from itertools import groupby from itertools import groupby
import re import re
from glob import glob
from typing import List from typing import List
from ..util import replace_ext, gdal from ..util import replace_ext, gdal, get_all_data_files
def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None): def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None):
""" Stack bands of the individual images """ Stack bands of the individual images
""" """
filenames = [] filenames = get_all_data_files(source_dir, preprocessor_config)
for dataglob in preprocessor_config.get('data_file_globs', '*'):
for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not os.path.isdir(path)]:
filenames.append(p)
# check if we have a group_by regex. If yes, use the first # check if we have a group_by regex. If yes, use the first
# re-group to group by. # re-group to group by.
# Fallback is basename of file as groupname # Fallback is basename of file as groupname
......
import os import os
from os.path import join, splitext, basename, dirname, isdir from os.path import join, basename
from glob import glob
from typing import Dict from typing import Dict
from ..util import replace_ext, gdal from ..util import replace_ext, gdal, get_all_data_files
def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None): def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None):
filenames = [] filenames = get_all_data_files(source_dir, preprocessor_config)
for dataglob in preprocessor_config.get('data_file_globs', '*'):
for p in [path for path in glob(join(source_dir, '**', dataglob), recursive=True) if not isdir(path)]:
filenames.append(p)
if len(filenames) == 0: if len(filenames) == 0:
raise Exception('No datafiles were matched by the provided glob') raise Exception('No datafiles were matched by the provided glob')
for filename in datafiles: for filename in filenames:
extract_subdatasets( extract_subdatasets(
filename, filename,
target_dir, target_dir,
......
import os import os
from os.path import splitext from os.path import splitext, join
from contextlib import contextmanager from contextlib import contextmanager
from tempfile import TemporaryDirectory, mkdtemp from tempfile import TemporaryDirectory, mkdtemp
from time import time from time import time
from glob import glob
try: try:
from osgeo import gdal from osgeo import gdal
...@@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit): ...@@ -86,3 +87,14 @@ def get_size_in_bytes(file_path, unit):
""" Get size of file at given path in bytes""" """ Get size of file at given path in bytes"""
size = os.path.getsize(file_path) size = os.path.getsize(file_path)
return convert_unit(size, unit) return convert_unit(size, unit)
def get_all_data_files(source_dir, preprocessor_config):
    """ Based on 'data_file_globs' configuration, gets all unique data file paths from folder.

    Args:
        source_dir: directory searched recursively (via '**') for matching files.
        preprocessor_config: dict whose optional 'data_file_globs' entry is a
            list of glob patterns, or a single pattern string. Defaults to
            matching everything ('*').

    Returns:
        List of matching file paths (directories excluded), deduplicated
        while keeping first-seen order.
    """
    globs = preprocessor_config.get('data_file_globs', ['*'])
    if isinstance(globs, str):
        # guard: a bare pattern string would otherwise be iterated
        # character by character, silently producing wrong globs
        globs = [globs]
    file_paths = [
        path
        for dataglob in globs
        for path in glob(join(source_dir, '**', dataglob), recursive=True)
        if not os.path.isdir(path)
    ]
    # get only unique files to compensate for possibly bad glob yielding doubles, keeping order
    return list(dict.fromkeys(file_paths))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment