EOX GitLab Instance

Commit 057d07f8 authored by Lubomir Doležal
Browse files

[preprocessor] multiple sar related updates

parent 0d4f2ff7
......@@ -38,7 +38,7 @@ properties:
description: The local directory, where intermediary files are to be stored.
type: string
keep_temp:
description: Whether to keep temporary files for each step. DEPRECATED.
description: Whether to keep temporary files for each step.
type: boolean
metadata_glob:
description: A file glob to select metadata files from the downloaded archive.
......@@ -95,6 +95,8 @@ required:
- preprocessing
definitions:
steps:
data_file_globs:
description: Custom globs for filtering which files will be used for this step.
custom_preprocessor:
description: Definition of a custom preprocessor step
type: object
......@@ -112,6 +114,9 @@ definitions:
description: The definition of the subdataset extraction step.
type: object
properties:
data_file_globs:
description: Custom globs for filtering which files will be used for this step. Overrides data_file_globs configured for whole preprocessor.
type: array
subdataset_types:
description: Mapping of subdataset identifier to output filename postfix for subdatasets to be extracted for each data file.
type: object
......@@ -121,6 +126,9 @@ definitions:
georeference:
type: object
properties:
data_file_globs:
description: Custom globs for filtering which files will be used for this step. Overrides data_file_globs configured for whole preprocessor.
type: array
geotransforms:
description: A list of geotransform methods to use
type: array
......@@ -164,6 +172,9 @@ definitions:
description: Definition of a calculation step.
type: object
properties:
data_file_globs:
description: Custom globs for filtering which files will be used for this step. Overrides data_file_globs configured for whole preprocessor.
type: array
formulas:
description: A list of formulas to calculate
type: array
......@@ -187,7 +198,7 @@ definitions:
description: The output data type for the calculated file. (GDAL notation)
type: string
formula:
description: The formula to calculate. See gdal_calc.py for details.
description: "The formula to calculate. See gdal_calc.py for details. Can contain custom templates for getting band stats, like '${A_1_statistics_min}'."
type: string
output_postfix:
description: The filename postfix to append to the output filename. By default an enumeration is used.
......@@ -195,10 +206,16 @@ definitions:
nodata_value:
description: Use this nodata value in the calculation.
type: float
creationOptions:
description: List of creation options for gdal_calc.py output.
type: array
stack_bands:
description: Definition of a stack bands step.
type: object
properties:
data_file_globs:
description: Custom globs for filtering which files will be used for this step. Overrides data_file_globs configured for whole preprocessor.
type: array
group_by:
description: A regex to group the input datasets, if consisting of multiple file. The first regex group is used for the grouping.
type: string
......@@ -214,6 +231,9 @@ definitions:
description: Definition of an output step.
type: object
properties:
data_file_globs:
description: Custom globs for filtering which files will be used for this step. Overrides data_file_globs configured for whole preprocessor.
type: array
options:
description: "Options to be passed to `gdal.Warp`. See https://gdal.org/python/osgeo.gdal-module.html#WarpOptions for details"
type: object
......
import os
from os.path import basename, dirname, join, isfile
from os.path import basename, join, isfile
import subprocess
from typing import List
from glob import glob
import shutil
import logging
import re
from ..util import replace_ext
from ..util import replace_ext, get_all_data_files, gdal
logger = logging.getLogger(__name__)
def calc_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, formulas: List[dict]):
def calc_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, formulas: List[dict], data_file_globs: List[str]=[]):
for i, item in enumerate(formulas):
# get first filename as a base
filename = glob(join(source_dir, list(item['inputs'].values())[0]['glob']))[0]
# get first filename as a base, first looking into source_dir, then target_dir as fallback
filenames_source = glob(join(source_dir, list(item['inputs'].values())[0]['glob']))
if len(filenames_source) > 0:
filename = filenames_source[0]
else:
filenames_target = glob(join(target_dir, list(item['inputs'].values())[0]['glob']))
if len(filenames_target) > 0:
filename = filenames_target[0]
else:
raise Exception('No input file in source or target directory for calc: %s' % item)
target_filename = join(
target_dir,
replace_ext(basename(filename), item.get('output_postfix', '_proc%d' % i) + '.tif', False)
......@@ -24,33 +33,89 @@ def calc_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_con
if isfile(target_filename):
logger.warn('Calc output filename %s already exists' % target_filename)
calc_formula(source_dir, item['inputs'], target_filename, item['formula'], item.get('data_type', 'Float32'), item.get('nodata_value', None))
calc_formula(source_dir, target_dir, item['inputs'], target_filename, item['formula'], item.get('data_type', 'Float32'), item.get('nodata_value', None), item.get('creationOptions', {}))
# take all original files with from the last step
for filename in glob('%s/*' % source_dir):
# take all original files from the last step matching the data_file_glob
filenames = get_all_data_files(source_dir, preprocessor_config, data_file_globs)
for filename in filenames:
target_filename = join(target_dir, basename(filename))
if isfile(target_filename):
logger.warn('Calc output filename %s already exists' % target_filename)
shutil.copy(filename, target_filename)
def calc_formula(source_dir: os.PathLike, target_dir: os.PathLike, inputs: dict, target_filename: os.PathLike, formula: str, data_type: str = "Float32", nodata_value: float = None, creationOptions: List[str] = None):
    """ Run gdal_calc.py to evaluate `formula` over the configured inputs.

        For each named input the first file matching its glob is taken from
        `source_dir`, falling back to `target_dir`; an Exception is raised when
        neither directory contains a match. Statistics placeholders such as
        '${A_1_statistics_min}' are substituted via evaluate_formula before the
        command is run. The result is written to `target_filename`.

        :param inputs: mapping of gdal_calc input letter to a dict with a
            'glob' entry and an optional 'band' entry (defaults to band 1)
        :param nodata_value: optional nodata value passed as --NoDataValue
        :param creationOptions: optional list of --co creation options
    """
    used_formula = formula
    cmd = [
        "gdal_calc.py",
        "--outfile=%s" % target_filename,
        "--type", data_type,
    ]
    for input_name in inputs:
        # get first filename as a base, first looking into source_dir, then target_dir as fallback
        filenames_source = glob(join(source_dir, inputs[input_name]['glob']))
        if len(filenames_source) > 0:
            filename = filenames_source[0]
        else:
            filenames_target = glob(join(target_dir, inputs[input_name]['glob']))
            if len(filenames_target) > 0:
                filename = filenames_target[0]
            else:
                raise Exception('No input file in source or target directory for formula: %s' % formula)
        band_number = inputs[input_name].get('band', 1)
        cmd.extend([
            "-%s" % input_name, filename,
            "--%s_band=%d" % (input_name, band_number),
        ])
        # evaluate formula as a template; pass the partially evaluated formula
        # (not the original) so replacements made for earlier inputs are kept
        used_formula = evaluate_formula(used_formula, filename, input_name, band_number)
    cmd.extend([
        "--calc=%s" % used_formula,
    ])
    if nodata_value is not None:
        cmd.extend([
            "--NoDataValue=%s" % nodata_value,
        ])
    # `or []` also guards callers passing None explicitly
    for option in creationOptions or []:
        cmd.extend([
            "--co",
            option,
        ])
    process = subprocess.run(cmd, capture_output=True, text=True)
    logger.debug("gdal calc stderr: %s" % process.stderr)
def evaluate_formula(formula, filename, input_name, band):
    """ Tries to replace a few common placeholders in the calc formula.

        Placeholders have the form '${<input>_<band>_statistics_<stat>}', e.g.
        "10*log10((A.astype(float)-${A_1_statistics_max}+0.0001)/(${A_1_statistics_max}-${A_1_statistics_min}))".
        Only placeholders referring to the given input_name/band are replaced;
        the band statistics are read (force-recomputed) from `filename` via
        GDAL. Templates that do not follow the expected four-part shape are
        left untouched instead of raising.

        :returns: the formula with matching placeholders substituted
    """
    evaluated_formula = formula
    if "statistics" not in formula:
        return evaluated_formula
    # find all occurrences of templates in form of ${A_1_statistics_max},
    # keeping only unique ones in their original order
    found = list(dict.fromkeys(re.findall(r'(\${[0-9a-zA-Z_]*})', formula)))
    # band statistics of `filename`, computed lazily and at most once per call
    statistics = None
    # index of each supported statistic in GDAL's GetStatistics() result
    stat_indexes = {"min": 0, "max": 1, "mean": 2, "std": 3}
    for item in found:
        # strip the '${' prefix and '}' suffix, then split into parts
        parts = item[2:-1].split("_")
        # skip malformed templates instead of crashing on int()/indexing below
        if len(parts) != 4 or parts[2] != "statistics" or not parts[1].isdigit():
            continue
        # if current input is the one in template, replace it with statistics
        if parts[0] != input_name or int(parts[1]) != band:
            continue
        stat_index = stat_indexes.get(parts[3])
        if stat_index is None:
            logger.warning("Unknown statistics found in expression %s" % item)
            continue
        if statistics is None:
            src_ds = gdal.Open(filename)
            statistics = src_ds.GetRasterBand(band).GetStatistics(True, True)  # force recount
        # replace the template with actual value
        evaluated_formula = evaluated_formula.replace(item, "%s" % statistics[stat_index])
    return evaluated_formula
......@@ -11,7 +11,7 @@ from ..util import gdal, osr, replace_ext, get_all_data_files
logger = logging.getLogger(__name__)
def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, geotransforms: List[dict]):
def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, geotransforms: List[dict], data_file_globs: List[str]=[]):
success = False
for options in geotransforms:
type_name = options['type'].lower()
......@@ -31,7 +31,7 @@ def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, preproce
else:
raise Exception('Invalid georeference type %s' % type_name)
try:
filenames = get_all_data_files(source_dir, preprocessor_config)
filenames = get_all_data_files(source_dir, preprocessor_config, data_file_globs)
for filename in filenames:
target_filename = join(target_dir, basename(filename))
georef_func(filename, target_filename, **opts_dict)
......
import os
from os.path import join, basename
from uuid import uuid4
from typing import List
from ..util import replace_ext, gdal, get_all_data_files
import logging
......@@ -8,7 +9,7 @@ import logging
logger = logging.getLogger(__name__)
def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, options: dict=None):
def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, options: dict=None, data_file_globs: List[str]=[]):
# find out the driver to get the extension
options = options if options is not None else {}
frmt = options.get('format', 'GTiff')
......@@ -18,7 +19,7 @@ def output_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_c
extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0]
# warp each individual file
warped_files = []
filenames = get_all_data_files(source_dir, preprocessor_config)
filenames = get_all_data_files(source_dir, preprocessor_config, data_file_globs)
for filename in filenames:
target_filename = join(target_dir, replace_ext(basename(filename), extension))
logger.debug('Warping file %s' % filename)
......
......@@ -10,10 +10,10 @@ from ..util import replace_ext, gdal, get_all_data_files
logger = logging.getLogger(__name__)
def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None):
def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, group_by: str=None, sort_by: str=None, order: List[str]=None, data_file_globs: List[str]=[]):
""" Stack bands of the individual images
"""
filenames = get_all_data_files(source_dir, preprocessor_config)
filenames = get_all_data_files(source_dir, preprocessor_config, data_file_globs)
# check if we have a group_by regex. If yes, use the first
# re-group to group by.
# Fallback is basename of file as groupname
......
import os
from os.path import join, basename
from typing import Dict
from typing import Dict, List
import logging
from ..util import replace_ext, gdal, get_all_data_files
def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None):
filenames = get_all_data_files(source_dir, preprocessor_config)
def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, preprocessor_config: dict, subdataset_types: Dict[str, str]=None, data_file_globs: List[str]=[]):
filenames = get_all_data_files(source_dir, preprocessor_config, data_file_globs)
if len(filenames) == 0:
raise Exception('No datafiles were matched by the provided glob')
......
......@@ -91,13 +91,18 @@ def get_size_in_bytes(file_path, unit):
return convert_unit(size, unit)
def get_all_data_files(source_dir, preprocessor_config):
def get_all_data_files(source_dir, preprocessor_config, data_file_globs=[]):
""" Based on 'data_file_globs' configuration, gets all unique data file paths from folder matching any of the globs"""
# get all file paths recursively
file_paths = [p for p in glob(join(source_dir, '**'), recursive=True) if not os.path.isdir(p)]
# filter them by data_globs
file_paths_filt = []
for dataglob in preprocessor_config.get('data_file_globs', ['*']):
used_globs = preprocessor_config.get('data_file_globs', ['*'])
# override global data_file_globs by the provided one if possible
if len(data_file_globs) > 0:
used_globs = data_file_globs
for dataglob in used_globs:
file_paths_filt += filter_filenames(file_paths, dataglob, preprocessor_config.get('glob_case', False))
# get only unique files to compensate for possibly bad glob yielding doubles, keeping order
file_paths_filt = list(dict.fromkeys(file_paths_filt))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment