EOX GitLab Instance

Skip to content
Snippets Groups Projects
Commit 9899cf18 authored by Lubomir Dolezal's avatar Lubomir Dolezal
Browse files

add option for case-insensitive match in unpack

parent 0d808c15
Branches
Tags
No related merge requests found
......@@ -291,6 +291,10 @@ metadata_glob
This file glob is used to determine the main metadata file to extract the
product type from. This file will be searched in the downloaded package.
glob_case
Whether all file globs are matched case-sensitively. Defaults to false (case-insensitive matching).
type_extractor
This setting configures how the product type is extracted from the previously
......
......@@ -4,17 +4,24 @@ import io
from typing import List, Union, BinaryIO
import tarfile
import zipfile
from fnmatch import fnmatch, fnmatchcase
import logging
from fnmatch import translate
import re
logger = logging.getLogger(__name__)
ARCHIVE_EXTENSIONS = ['ZIP', 'zip', 'TAR', 'tar', 'TAR.BZ2', 'tar.bz2', 'TAR.GZ', 'tar.gz']
def filter_filenames(filenames: List[PathLike], glob: str, case: bool=False) -> List[PathLike]:
    """ Returns the entries of ``filenames`` that match the given file glob.

    The glob is translated to a regular expression once and then applied
    to every filename, so the case handling is the same on every platform.

    :param filenames: the candidate filenames; the matching ones are
                      returned unchanged and in their original order
    :param glob: the shell-style wildcard pattern to match against
    :param case: when ``True`` the match is case-sensitive, otherwise
                 (the default) upper/lower case differences are ignored
    :returns: the list of matching filenames
    """
    # compile the pattern a single time, not once per filename
    flags = 0 if case else re.IGNORECASE
    reobj = re.compile(translate(glob), flags)
    return [
        filename
        for filename in filenames
        # os.fspath lets path objects be matched as well as plain strings
        if reobj.match(os.fspath(filename))
    ]
......@@ -41,7 +48,7 @@ def open_tarfile(archive_file: Union[PathLike, BinaryIO]) -> tarfile.TarFile:
return tarfile.open(archive_file)
def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, glob=None, filenames=None, recursive=False) -> List[PathLike]:
def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, glob=None, case=None, filenames=None, recursive=False) -> List[PathLike]:
""" Unpacks the contents of the specified ZIP or TAR archive to the
given target directory. Optionally, only a given list of filenames
will be extracted.
......@@ -67,7 +74,7 @@ def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike,
# filter the filenames when a glob is passed
if glob:
filenames = filter_filenames(filenames, glob)
filenames = filter_filenames(filenames, glob, case)
extracted_filenames = []
......@@ -113,6 +120,7 @@ def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike,
sub_archive_filename,
os.path.join(target_dir, sub_archive),
glob,
case,
filenames,
recursive,
)
......
......@@ -35,6 +35,9 @@ properties:
metadata_glob:
description: A file glob to select metadata files from the downloaded archive.
type: string
glob_case:
description: Whether all file globs are matched case-sensitively (defaults to false).
type: boolean
type_extractor:
description: How the product type is to be extracted from the metadata file.
type: object
......
......@@ -162,7 +162,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N
logger.info('Download dir already exists, skipping...')
# fetch the metadata XML file from the downloaded archive
metadata_files = unpack_files(source_archive_path, 'extra', glob=config['metadata_glob'])
metadata_files = unpack_files(source_archive_path, 'extra', glob=config['metadata_glob'], case=config.get('glob_case', False))
# open the XML to retrieve the product type and level
product_type, product_level = extract_product_type_and_level(metadata_files, config)
......@@ -186,6 +186,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N
source_archive_path,
'unpack',
glob=glob,
case=config.get('glob_case', False),
recursive=preprocess_config.get('nested', False),
)
for glob in preprocess_config['data_file_globs']
......@@ -195,6 +196,7 @@ def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=N
source_archive_path,
'unpack',
glob=glob,
case=config.get('glob_case', False),
recursive=preprocess_config.get('nested', False),
)
for glob in preprocess_config.get('additional_file_globs', [])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment