diff --git a/documentation/operator-guide/configuration.rst b/documentation/operator-guide/configuration.rst index e9ac61e2b5b2cd48ed15fbc6984e087df105eab6..fedf0a9569123792707496ee8ebb8b820a293744 100644 --- a/documentation/operator-guide/configuration.rst +++ b/documentation/operator-guide/configuration.rst @@ -291,6 +291,10 @@ metadata_glob This file glob is used to determine the main metadata file to extract the product type from. This file will be searched in the downloaded package. +glob_case + + Whether all file globs are matched case-sensitively. Defaults to false (case-insensitive matching). + type_extractor This setting configures how the product type is extracted from the previously diff --git a/preprocessor/preprocessor/archive.py b/preprocessor/preprocessor/archive.py index 7dedcd5f4a040384a2d224c65a6573c763fed3ff..57ccb973939d2c6799cc387b8759bc43f64372ad 100644 --- a/preprocessor/preprocessor/archive.py +++ b/preprocessor/preprocessor/archive.py @@ -4,17 +4,24 @@ import io from typing import List, Union, BinaryIO import tarfile import zipfile -from fnmatch import fnmatch, fnmatchcase +import logging +from fnmatch import translate +import re +logger = logging.getLogger(__name__) ARCHIVE_EXTENSIONS = ['ZIP', 'zip', 'TAR', 'tar', 'TAR.BZ2', 'tar.bz2', 'TAR.GZ', 'tar.gz'] -def filter_filenames(filenames: List[PathLike], glob: str, case=False) -> List[PathLike]: - cmp = fnmatchcase if case else fnmatch +def filter_filenames(filenames: List[PathLike], glob: str, case: bool=False) -> List[PathLike]: + regex = translate(glob) + if case: + reobj = re.compile(regex) + else: + reobj = re.compile(regex, re.IGNORECASE) return [ filename for filename in filenames - if cmp(filename, glob) + if reobj.match(filename) ] @@ -41,7 +48,7 @@ def open_tarfile(archive_file: Union[PathLike, BinaryIO]) -> tarfile.TarFile: return tarfile.open(archive_file) -def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, glob=None, filenames=None, recursive=False) -> List[PathLike]: +def 
unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, glob=None, case=None, filenames=None, recursive=False) -> List[PathLike]: """ Unpacks the contents of the specified ZIP or TAR archive to the given target directory. Optionally, only a given list of filenames will be extracted. @@ -67,7 +74,7 @@ def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, # filter the filenames when a glob is passed if glob: - filenames = filter_filenames(filenames, glob) + filenames = filter_filenames(filenames, glob, case) extracted_filenames = [] @@ -113,6 +120,7 @@ def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, sub_archive_filename, os.path.join(target_dir, sub_archive), glob, + case, filenames, recursive, ) diff --git a/preprocessor/preprocessor/config-schema.yaml b/preprocessor/preprocessor/config-schema.yaml index cff6590e90804ed0e3b4332aacb8a8053beda667..dd67fed643576da60143c38d16c930555d01444f 100644 --- a/preprocessor/preprocessor/config-schema.yaml +++ b/preprocessor/preprocessor/config-schema.yaml @@ -35,6 +35,9 @@ properties: metadata_glob: description: A file glob to select metadata files from the downloaded archive. type: string + glob_case: + description: Whether all file globs are matched case-sensitively. + type: boolean type_extractor: description: How the product type is to be extracted from the metadata file. 
type: object diff --git a/preprocessor/preprocessor/preprocess.py b/preprocessor/preprocessor/preprocess.py index ae5a3efb7a8e31e150f94105a207531f1b54a72b..696f614e8dac7150d89c3f9f5417225980d16c9f 100644 --- a/preprocessor/preprocessor/preprocess.py +++ b/preprocessor/preprocessor/preprocess.py @@ -162,7 +162,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N logger.info('Download dir already exists, skipping...') # fetch the metadata XML file from the downloaded archive - metadata_files = unpack_files(source_archive_path, 'extra', glob=config['metadata_glob']) + metadata_files = unpack_files(source_archive_path, 'extra', glob=config['metadata_glob'], case=config.get('glob_case', False)) # open the XML to retrieve the product type and level product_type, product_level = extract_product_type_and_level(metadata_files, config) @@ -186,6 +186,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N source_archive_path, 'unpack', glob=glob, + case=config.get('glob_case', False), recursive=preprocess_config.get('nested', False), ) for glob in preprocess_config['data_file_globs'] @@ -195,6 +196,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N source_archive_path, 'unpack', glob=glob, + case=config.get('glob_case', False), recursive=preprocess_config.get('nested', False), ) for glob in preprocess_config.get('additional_file_globs', [])