EOX GitLab Instance

Commit 5c1d26c0 authored by Nikola Jankovic

added tests

refactored functions
added fixtures and skeleton
parent 7c98e7a6
Pipeline #18465 failed with stage in 19 seconds
include ./harvester/config-schema.yaml
cli.py

"""
cli.py
======
Contains command line interface
"""
import logging.config

import click

from .daemon import run_daemon, init_client
from .config import load_config, validate_config
from .harvester import main


def setup_logging(debug=False):
    logging.config.dictConfig(
        {
            "version": 1,
            "disable_existing_loggers": False,
            "formatters": {"brief": {"format": "%(levelname)s %(name)s: %(message)s"}},
            "handlers": {
                "console": {
                    "class": "logging.StreamHandler",
                    "level": "DEBUG" if debug else "INFO",
                    "formatter": "brief",
                }
            },
            "root": {
                "handlers": ["console"],
                "level": "DEBUG" if debug else "INFO",
            },
        }
    )


@click.group()
@@ -45,14 +40,21 @@ def cli():
    pass


@cli.command(help="Run the harvester daemon, attaching to a Redis queue")
@click.option("--host", type=str, required=True)
@click.option("--port", type=int, required=True)
@click.option("--listen-queue", type=str, required=True)
@click.option("--config-file", type=click.File("r"), required=True)
@click.option("--validate/--no-validate", default=False)
@click.option("--debug/--no-debug", default=False)
def daemon(
    host,
    port,
    listen_queue,
    config_file=None,
    validate=False,
    debug=False,
):
    setup_logging(debug)
    config = load_config(config_file)
    if validate:
@@ -60,21 +62,29 @@ def daemon(config_file=None, validate=False, host=None, port=None, listen_queue=
    run_daemon(config, host, port, listen_queue)


@cli.command(help="Run a single, one-off harvest")
@click.argument("harvester_name", type=str)
@click.option("--host", type=str, required=True)
@click.option("--port", type=int, required=True)
@click.option("--config-file", type=click.File("r"), required=True)
@click.option("--validate/--no-validate", default=False)
@click.option("--debug/--no-debug", default=False)
def harvest(
    harvester_name: str,
    host: str,
    port: int,
    config_file: str = None,
    validate: bool = False,
    debug: bool = False,
):
    setup_logging(debug)
    config = load_config(config_file)
    if validate:
        validate_config(config)
    client = init_client(host, port)
    main(config, harvester_name, client)


if __name__ == "__main__":
    cli()
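Since the commit message mentions added tests, the reworked commands can be exercised with click's CliRunner. A minimal sketch, assuming the package is importable as harvester.cli; the harvester name, host, port, and config contents below are all made up:

from click.testing import CliRunner

from harvester.cli import cli  # import path assumed

runner = CliRunner()
with runner.isolated_filesystem():
    with open("config.yml", "w") as f:
        f.write("harvesters: []\n")  # placeholder config
    result = runner.invoke(
        cli,
        [
            "harvest",
            "some-harvester",  # hypothetical harvester name
            "--host", "localhost",
            "--port", "6379",
            "--config-file", "config.yml",
        ],
    )
print(result.exit_code, result.output)

Note that --host, --port, and --config-file are now required, so omitting them makes the invocation fail before any Redis connection is attempted.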
config-schema.yaml

@@ -2,16 +2,6 @@ $id: https://example.com/address.schema.json
$schema: http://json-schema.org/draft-07/schema#
type: object
properties:
  harvesters:
    description: List of harvesters
    type: array
@@ -20,7 +10,7 @@ properties:
      type: object
      properties:
        name:
          description: Name of the harvester. Should be unique
          type: string
        queue:
          description: Name of queue to send queried data to
@@ -38,7 +28,7 @@ properties:
        type:
          description: type of the endpoint
          type: string
          enum:
            - STACAPI
            - STACCatalog
            - OpenSearch
@@ -50,11 +40,11 @@ properties:
        time_property:
          description: what time to extract from queried results.
          type: string
          enum:
            - sensed
            - updated
            - created
            - modified
        bbox:
          description: Bounding box to be queried
          type: string
@@ -69,6 +59,6 @@ properties:
      required:
        - begin
    required:
      - queue
      - name
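For reference, a minimal config that should satisfy the schema after this change (the redis block is gone, so host and port now come from the CLI, not the config file). All names are invented and only fields visible in this diff are used, so a real config likely needs the elided endpoint keys as well:

import io

from harvester.config import load_config, validate_config  # import path assumed

config = load_config(io.StringIO(
    """
harvesters:
  - name: my-harvester   # hypothetical
    queue: register      # hypothetical
"""
))
validate_config(config)  # raises jsonschema.ValidationError on mismatch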
config.py

@@ -3,52 +3,58 @@ import re
import datetime
from typing import TextIO

import jsonschema
import yaml

ENV_PATTERN = re.compile(r".*?\${(\w+)}.*?")
LOADER = yaml.SafeLoader


def constructor_env_variables(loader, node):
    """
    Extracts the environment variable from the node's value
    :param yaml.Loader loader: the yaml loader
    :param node: the current node in the yaml
    :return: the parsed string that contains the value of the environment
        variable
    """
    value = loader.construct_scalar(node)
    match = ENV_PATTERN.findall(value)  # to find all env variables in line
    if match:
        full_value = value
        for g in match:
            env_variable = os.environ.get(g)
            if env_variable is not None:
                full_value = full_value.replace(f"${{{g}}}", env_variable)
            else:
                return None
        return full_value
    return value


def constructor_now(loader, node):
    return datetime.datetime.now(tz=datetime.timezone.utc).isoformat()


# the tag will be used to mark where to start searching for the pattern
# e.g. somekey: !env somestring${MYENVVAR}blah blah blah
LOADER.add_implicit_resolver("!env", ENV_PATTERN, None)
LOADER.add_constructor("!env", constructor_env_variables)
# this tag resolves !now to datetime.now
LOADER.add_constructor("!now", constructor_now)


def load_config(input_file: TextIO) -> dict:
    return yaml.load(input_file, Loader=LOADER)


def validate_config(config):
    with open(
        os.path.join(os.path.dirname(__file__), "config-schema.yaml"), encoding="utf-8"
    ) as file:
        schema = yaml.load(file, Loader=yaml.SafeLoader)
    jsonschema.validate(config, schema)
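The custom tags can be smoke-tested directly; a short sketch, where the environment variable name and values are invented:

import io
import os

from harvester.config import load_config  # import path assumed

os.environ["MY_BUCKET"] = "eox-data"  # hypothetical variable
config = load_config(io.StringIO(
    "bucket: !env s3://${MY_BUCKET}/items\n"
    "started: !now\n"
))
# config["bucket"] == "s3://eox-data/items"
# config["started"] is an ISO 8601 UTC timestamp

Because ENV_PATTERN is registered as an implicit resolver, a plain scalar containing ${...} is resolved even without an explicit !env tag.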
"""
daemon.py
==========
Contains functionality related to running the daemon
"""
import logging import logging
import redis from redis import Redis
from .harvester import main from .harvester import main
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def init_client(host: str, port: int) -> Redis:
redis = Redis(host=host, port=port, charset="utf-8", decode_responses=True)
return redis
def run_daemon(config: dict, host: str, port: str, listen_queue: str): def run_daemon(config: dict, host: str, port: str, listen_queue: str):
""" Run the harvester daemon, listening on a redis queue """Run the harvester daemon, listening on a redis queue
for harvest jobs. for harvest jobs.
""" """
# initialize the queue client # initialize the queue client
client = redis.Redis( client = init_client(host, port)
host=host, port=port, charset="utf-8", decode_responses=True logger.debug("waiting for redis queue '%s'", listen_queue)
)
logger.debug("waiting for redis queue '%s'..." % listen_queue)
while True: while True:
# fetch an item from the queue to be harvested # fetch an item from the queue to be harvested
_, value = client.brpop(listen_queue) # first param which queue ther result came from _, value = client.brpop(listen_queue)
# start the harvesting # start the harvesting
try: main(config, value, client)
main(config, value, client)
except Exception as e:
logger.exception(e)
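The daemon blocks on BRPOP, so submitting work is just a push onto the listen queue from any other process; a sketch with an invented queue and harvester name:

from redis import Redis

client = Redis(host="localhost", port=6379, decode_responses=True)
# the daemon BRPOPs this value and hands it to harvester.main()
client.lpush("harvest-jobs", "my-harvester")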
Endpoint.py

import logging
from abc import ABC, abstractmethod
from typing import List

from pygeofilter.parsers.cql_json import parse as json_parse
from pygeofilter.backends.native.evaluate import NativeEvaluator

@@ -9,14 +9,16 @@ from ..query import Query

logger = logging.getLogger(__name__)


class Endpoint(ABC):
    type = None

    def __init__(self, url: str, query: dict, filter: dict, *args, **kwargs):
        self.url = url
        self.query = Query(**query)
        self.filter = json_parse(filter)

    @abstractmethod
    def harvest(self) -> list:
        # All endpoints should extend this function to do the following:
        # 1. prepare query
@@ -24,24 +26,20 @@ class Endpoint:
        # 3. convert to stac items
        # 4. filter data if necessary
        # 5. return list of stac items
        raise NotImplementedError()

    @classmethod
    def from_config(cls, endpoint_config: dict) -> "Endpoint":
        subclass_map = {subclass.type: subclass for subclass in cls.__subclasses__()}
        endpoint_type = endpoint_config.pop("type", None)
        subclass = subclass_map[endpoint_type]
        return subclass(**endpoint_config)

    def filter_data(self, data: List[dict]) -> List[dict]:
        attr_map = {"point_attr": "geometry", "*": "properties.*"}
        nat_eval = NativeEvaluator(attribute_map=attr_map, use_getattr=False)
        evaluator = nat_eval.evaluate(self.filter)
        result = list(filter(evaluator, data))
        return result
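The from_config() classmethod dispatches on the type class attribute via __subclasses__(). The same mechanism in isolation, with toy classes standing in for the real endpoints:

class Base:
    type = None

    @classmethod
    def from_config(cls, config: dict) -> "Base":
        # map each registered subclass by its declared type tag
        subclass_map = {sub.type: sub for sub in cls.__subclasses__()}
        return subclass_map[config.pop("type", None)](**config)


class FTP(Base):
    type = "FTP"

    def __init__(self, url: str):
        self.url = url


endpoint = Base.from_config({"type": "FTP", "url": "ftp://example.com"})
assert isinstance(endpoint, FTP)

One caveat of this pattern: only direct subclasses are found, and every concrete endpoint must be imported before from_config() runs, which is what the package __init__ below takes care of.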
__init__.py

from ..query import Query
from .ftp import FTPEndpoint
from .oads import OADSEndpoint
from .ogcapi import OGCAPIEndpoint
from .opensearch import OpenSearchEndpoint
from .s3 import S3Endpoint
from .stacapi import STACAPIEndpoint
from .stac_catalog import STACCatalogEndpoint
from .swift import SwiftEndpoint
from .Endpoint import Endpoint

__all__ = [
    "FTPEndpoint",
    "OADSEndpoint",
    "OGCAPIEndpoint",
    "OpenSearchEndpoint",
    "S3Endpoint",
    "STACAPIEndpoint",
    "STACCatalogEndpoint",
    "SwiftEndpoint",
    "Endpoint",
]
ftp.py

from .Endpoint import Endpoint


class FTPEndpoint(Endpoint):
    type = "FTP"


oads.py

from .Endpoint import Endpoint


class OADSEndpoint(Endpoint):
    type = "OADS"


ogcapi.py

from .Endpoint import Endpoint


class OGCAPIEndpoint(Endpoint):
    type = "OGCAPI"
opensearch.py

from abc import abstractmethod, ABC
import logging
from typing import List
from dataclasses import dataclass, field

import requests
import lxml.etree as ET
import pystac

from .Endpoint import Endpoint
from ..stac import STACItemComposer

logger = logging.getLogger(__name__)


@dataclass
class SearchPage:
    index: int
    total: int
    records: List[dict] = field(default_factory=list)


class OpenSearchFormat(ABC):
    mimetype = None

    @classmethod
    def from_config(cls, config: dict) -> "OpenSearchFormat":
        subclass_map = {
            subclass.mimetype: subclass for subclass in cls.__subclasses__()
        }