Commit 8b45e89b authored by Nikola Jankovic

implemented generators for better data streaming

parent 88aa7553
Pipeline #19049 failed in 42 seconds
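The recurring pattern in this commit, reduced to a sketch with hypothetical names: every `harvest()` that used to build up a `result` list now yields records as soon as a page is parsed, so downstream filtering, stringifying, and queueing can process one item at a time instead of holding the full result set in memory.

```python
from typing import Generator

# Before: accumulate every record, then hand the whole list downstream.
def harvest_list(pages: list) -> list:
    result = []
    for page in pages:
        result.extend(page)
    return result

# After: stream records as each page arrives; memory stays flat
# regardless of how large the result set grows.
def harvest_generator(pages: list) -> Generator[dict, None, None]:
    for page in pages:
        yield from page
```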
@@ -37,14 +37,6 @@ properties:
       - Swift
       - OGCAPI
       - OADS
-  time_property:
-    description: what time to extract from queried results.
-    type: string
-    enum:
-      - sensed
-      - updated
-      - created
-      - modified
   bbox:
     description: Bounding box to be queried
     type: string
 from abc import abstractmethod, ABC
 import logging
-from typing import Any, Dict, List
+from typing import Any, Dict, Generator, List
 from dataclasses import dataclass, field
 import requests
@@ -127,13 +127,12 @@ class OpenSearchEndpoint(Endpoint):
         self.format = get_format(format_config)
         self.query = OpenSearchQuery(**query)

-    def harvest(self) -> list:
+    def harvest(self) -> Generator[dict, None, None]:
         logger.info("Starting OpenSearch harvesting")
         parser = ET.XMLParser(recover=True)
         data = ET.fromstring(requests.get(self.url).content, parser)
         urls = self._get_url_param_mapping(data)
-        result = []
         params = self.query.prepare_params(urls=urls, mimetype=self.format.mimetype)
@@ -148,12 +147,10 @@ class OpenSearchEndpoint(Endpoint):
                 search_params[index_keyword] += index_value
             page = self.format.parse(response)
             if page.records:
-                result.extend(page.records)
+                yield from page.records
             else:
                 break
-        return result

     def _get_url_param_mapping(self, data: ET._Element) -> dict:
         urls: Dict[str, Any] = {}
         for url in data.findall("opensearch:Url", self.NS):
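A side effect of the generator rewrite worth spelling out: pagination now stops as soon as the consumer does. A self-contained toy (`fake_harvest` is hypothetical, standing in for the paged requests above):

```python
import itertools
from typing import Generator

def fake_harvest() -> Generator[dict, None, None]:
    # Each "page" is only produced when the consumer asks for more.
    for page_no in itertools.count(1):
        print(f"fetching page {page_no}")  # would be an HTTP request
        yield {"page": page_no}

# Only two pages are ever "fetched", however large the result set is.
first_two = list(itertools.islice(fake_harvest(), 2))
```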
@@ -10,17 +10,16 @@ class Query(ABC):
         bbox: str = None,
         collection: str = None,
     ):
-        self.time_begin, self.time_end, self.time_property = self._time(time)
+        self.time_begin, self.time_end = self._time(time)
         self.bbox = bbox
         self.collection = collection

     @staticmethod
-    def _time(time: Dict[str, Any]) -> Tuple[datetime, datetime, str]:
+    def _time(time: Dict[str, Any]) -> Tuple[datetime, datetime]:
         time_begin = time.pop("begin", datetime.now(timezone.utc))
         time_end = time.pop("end", datetime.now(timezone.utc))
-        time_property = time.pop("property", None)
-        return time_begin, time_end, time_property
+        return time_begin, time_end

     @abstractmethod
     def prepare_params(self, *args, **kwargs):
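For reference, the slimmed-down `_time` in isolation: with `property` gone it only resolves the interval, and both bounds fall back to the current UTC time when the config omits them, which makes an empty `time` block an effectively instantaneous window.

```python
from datetime import datetime, timezone
from typing import Any, Dict, Tuple

def _time(time: Dict[str, Any]) -> Tuple[datetime, datetime]:
    # Missing bounds default to "now", mirroring the code above.
    time_begin = time.pop("begin", datetime.now(timezone.utc))
    time_end = time.pop("end", datetime.now(timezone.utc))
    return time_begin, time_end

print(_time({}))  # (now, now)
```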
 import logging
+from typing import Generator
 import requests
@@ -15,28 +16,25 @@ class STACAPIEndpoint(Endpoint):
         super().__init__(*args, **kwargs)
         self.query = StacAPIQuery(**query)

-    def harvest(self) -> list:
+    def harvest(self) -> Generator[dict, None, None]:
         logger.info("Starting STACAPI harvesting")
         main = requests.get(self.url).json()
         search_url = next(
             link["href"] for link in main["links"] if link["rel"] == "search"
         )
         search_params = self.query.prepare_params(root_response=main)
-        result = []
         logger.debug("querying %s with %s", search_url, search_params)
         query_data = requests.get(search_url, params=search_params).json()
         context = query_data["context"]
         while context["returned"]:
             for feature in query_data["features"]:
-                result.append(feature)
+                yield feature
             search_params["page"] += 1
             logger.debug("querying %s with %s", search_url, search_params)
             query_data = requests.get(search_url, params=search_params).json()
             context = query_data["context"]
-        return result


 class StacAPIQuery(Query):
     def __init__(self, *args, **kwargs):
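The STACAPI loop generalizes to a small pagination helper; a sketch under the same response shape (`paginate` and `fetch` are hypothetical, the `context`/`returned`/`features` keys follow the STAC API responses used above):

```python
from typing import Callable, Generator

def paginate(fetch: Callable[[int], dict]) -> Generator[dict, None, None]:
    # Fetch a page, stream its features, and advance until the API
    # reports that nothing was returned.
    page = 1
    data = fetch(page)
    while data["context"]["returned"]:
        yield from data["features"]
        page += 1
        data = fetch(page)
```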
 import json
 import logging
-from typing import List, Optional
+from typing import Generator, Optional

 from redis import Redis
@@ -15,15 +15,14 @@ logger = logging.getLogger(__name__)


 def stringify(
-    result: List[dict], mode: str = "item", extract_property: Optional[str] = None
-):
-    encoded: List[str] = []
+    result: Generator[dict, None, None],
+    mode: str = "item",
+    extract_property: Optional[str] = None,
+) -> Generator[str, None, None]:
     if mode == "item":
-        encoded.extend((json.dumps(item, default=str) for item in result))
+        yield from (json.dumps(item, default=str) for item in result)
     elif mode == "property":
-        encoded.extend((item["properties"][extract_property] for item in result))
-    return encoded
+        yield from (item["properties"][extract_property] for item in result)


 def init_resource(harvest_config: dict) -> Resource:
@@ -64,4 +63,5 @@ def main(config: dict, value: str, client: Redis):
     )

     # Send to queue
-    client.lpush(harvest_config["queue"], *encoded)
+    for value in encoded:
+        client.lpush(harvest_config["queue"], value)
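The queueing change follows from the streaming: `client.lpush(queue, *encoded)` would unpack, and therefore fully materialize, the generator, and LPUSH with zero values is a Redis error when a harvest comes back empty. Pushing per item keeps memory flat at the cost of one round trip per record; if that ever matters, chunking is a possible middle ground (a sketch, `batched` is hypothetical):

```python
from itertools import islice
from typing import Iterable, Iterator, List

def batched(items: Iterable[str], size: int) -> Iterator[List[str]]:
    # Group a stream into fixed-size chunks without materializing it.
    it = iter(items)
    while chunk := list(islice(it, size)):
        yield chunk

# One LPUSH per chunk instead of per item:
# for chunk in batched(encoded, 100):
#     client.lpush(harvest_config["queue"], *chunk)
```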
 from abc import abstractmethod, ABC
-from typing import List
+from typing import Generator


 class Resource(ABC):
@@ -12,9 +12,8 @@ class Resource(ABC):
     # 1. prepare harvest
     # 2. harvest resource
     # 3. convert to stac items
-    # 4. filter data if necessary
-    # 5. return list of stac items as dictionaries
+    # 4. return list of stac items as dictionaries

     @abstractmethod
-    def harvest(self) -> List[dict]:
+    def harvest(self) -> Generator[dict, None, None]:
         pass
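A note on the annotation chosen for the abstract `harvest`: `Generator[dict, None, None]` spells out all three type parameters, and since nothing is sent into or returned from these generators, `Iterator[dict]` would be an equivalent, shorter alternative.

```python
from typing import Generator, Iterator

# Generator[YieldType, SendType, ReturnType]; both annotations below
# describe the same runtime behavior.
def harvest_a() -> Generator[dict, None, None]:
    yield {"id": "a"}

def harvest_b() -> Iterator[dict]:
    yield {"id": "b"}
```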
 import re
 import logging

 from dateutil.parser import isoparse
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Generator

 if TYPE_CHECKING:
     from mypy_boto3_s3.client import S3Client
@@ -39,12 +39,11 @@ class S3Source(Source):
         bucket = self.parameters["url"].strip("https://").split(".")[0]
         return bucket

-    def harvest(self) -> list:
+    def harvest(self) -> Generator[dict, None, None]:
         logger.info("Starting S3 harvesting")
         paginator = self.client.get_paginator("list_objects_v2")
         pages = paginator.paginate(Bucket=self.bucket, Prefix=self.parameters["prefix"])
-        result = []
         time_regex: str = self.parameters["time_regex"]
         for page in pages:
@@ -52,9 +51,7 @@ class S3Source(Source):
             if match := re.search(time_regex, file["Key"]):
                 dt = isoparse(match[0])
                 item = self._create_item(file, dt, self.parameters["url"])
-                result.append(item.to_dict())
-        return result
+                yield item.to_dict()

     def _create_item(self, data, dt, url):
         identifier = dt.strftime("%Y%m%d_%H%M%S")
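boto3 paginators are themselves lazy iterators, so with the intermediate list removed the S3 path streams end to end. The timestamp-extraction step in isolation, with a made-up object key and pattern (the real ones come from the harvester parameters):

```python
import re
from dateutil.parser import isoparse

time_regex = r"\d{8}T\d{6}"                    # example pattern
key = "products/S2A_20200926T101031_tile.tif"  # example object key
if match := re.search(time_regex, key):
    print(isoparse(match[0]))  # 2020-09-26 10:10:31
```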
-from typing import List
+from typing import Generator

 from pygeofilter.backends.native.evaluate import NativeEvaluator
 from pygeofilter.parsers.cql_json import parse as json_parse


-def cql_filter(_filter: dict, data: List[dict]) -> List[dict]:
+def cql_filter(
+    _filter: dict, data: Generator[dict, None, None]
+) -> Generator[dict, None, None]:
     if not _filter:
-        return data
+        yield from data
+        return

     _filter = json_parse(_filter)
     attr_map = {"point_attr": "geometry", "*": "properties.*"}
     nat_eval = NativeEvaluator(attribute_map=attr_map, use_getattr=False)
     evaluator = nat_eval.evaluate(_filter)
-    data = list(filter(evaluator, data))
-    return data
+    yield from filter(evaluator, data)
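The empty-filter branch is the subtle part of this change: once a function body contains `yield`, a plain `return data` no longer hands the data back, it merely stops the generator; hence the `yield from data` followed by a bare `return`. A minimal demonstration with toy functions:

```python
def wrong(_filter, data):
    if not _filter:
        return data  # in a generator, return only ends iteration;
                     # the value never reaches the caller
    yield from filter(None, data)

def right(_filter, data):
    if not _filter:
        yield from data  # forward the stream, then stop
        return
    yield from filter(None, data)

print(list(wrong({}, [1, 2])))  # [] — records silently dropped
print(list(right({}, [1, 2])))  # [1, 2]
```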
@@ -48,11 +48,11 @@ def data_map():
             json_load(DATA_PATH / "stac_e84_root.json"),
         ),
         (
-            """https://earth-search.aws.element84.com/v0/search?datetime=2019-09-10T00%3A00%3A00%2B00%3A00%2F2019-09-11T00%3A00%3A00%2B00%3A00&limit=100&page=1&bbox=%5B14.9%2C47.7%2C16.4%2C48.7%5D&collections=%5B%22sentinel-s2-l2a-cogs%22%5D""",
+            """https://earth-search.aws.element84.com/v0/search?datetime=2020-09-26T00%3A00%3A00%2B00%3A00%2F2020-09-27T00%3A00%3A00%2B00%3A00&limit=100&page=1&bbox=%5B14.9%2C47.7%2C16.4%2C48.7%5D&collections=%5B%22sentinel-s2-l2a-cogs%22%5D""",
             json_load(DATA_PATH / "stac_e84_data.json"),
         ),
         (
-            "https://earth-search.aws.element84.com/v0/search?datetime=2019-09-10T00%3A00%3A00%2B00%3A00%2F2019-09-11T00%3A00%3A00%2B00%3A00&limit=100&page=2&bbox=%5B14.9%2C47.7%2C16.4%2C48.7%5D&collections=%5B%22sentinel-s2-l2a-cogs%22%5D",
+            "https://earth-search.aws.element84.com/v0/search?datetime=2020-09-26T00%3A00%3A00%2B00%3A00%2F2020-09-27T00%3A00%3A00%2B00%3A00&limit=100&page=2&bbox=%5B14.9%2C47.7%2C16.4%2C48.7%5D&collections=%5B%22sentinel-s2-l2a-cogs%22%5D",
             json_load(DATA_PATH / "stac_e84_empty.json"),
         ),
     ],
@@ -5,17 +5,11 @@ harvesters:
     type: STACAPI # STACAPI, STACCatalog, OpenSearch, FTP, S3, Swift, OGCAPI, OADS
     query:
       time:
-        begin: 2019-09-10T00:00:00+00:00
-        end: 2019-09-11T00:00:00+00:00
-        property: sensed
+        begin: 2020-09-26T00:00:00+00:00
+        end: 2020-09-27T00:00:00+00:00
       collection: sentinel-s2-l2a-cogs
       bbox: 14.9,47.7,16.4,48.7
-    filter:
-      and:
-        - during:
-            - property: updated
-            - - P5D
-              - !now
+    filter: {}
     queue: register # register, ingest, delete, update, preprocess
     mode: item # item, property
     extract_property: null
@@ -40,7 +34,6 @@ harvesters:
     type: OpenSearch
     query:
       time:
-        property: sensed # sensed eox:updated
         begin: 2019-09-10T00:00:00+00:00
         end: 2019-09-11T00:00:00+00:00
       bbox: 14.9,47.7,16.4,48.7
@@ -52,7 +45,6 @@ harvesters:
     type: OpenSearch
     query:
       time:
-        property: sensed
         begin: 2019-09-10T00:00:00+00:00
         end: 2019-09-11T00:00:00+00:00
       collection: null
@@ -19,7 +19,7 @@ def test_stacapi(
     requests_mocker.get(url=url, json=mock_json)

     main(config, value, client)
-    client.lpush.assert_called_once()
+    client.lpush.assert_called()


 @pytest.mark.parametrize("value", [("Creodias-Opensearch")])
@@ -36,7 +36,7 @@ def test_opensearch(
     requests_mocker.get(url=mock_data["empty"][0], json=mock_data["empty"][1])

     main(config, value, client)
-    client.lpush.assert_called_once()
+    client.lpush.assert_called()


 @pytest.mark.parametrize("value", [("Fusion-data")])
@@ -52,4 +52,4 @@ def test_s3(
     session_mock().client().get_paginator().paginate.return_value = mock_data

     main(config, value, client)
-    client.lpush.assert_called_once()
+    client.lpush.assert_called()
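The test updates follow directly from the per-item pushes in `main()`: `assert_called_once` would now fail for any harvest yielding more than one record, so the assertions relax to `assert_called`. In miniature:

```python
from unittest.mock import Mock

client = Mock()
for value in ["a", "b", "c"]:       # main() now pushes per item
    client.lpush("register", value)

client.lpush.assert_called()        # passes
assert client.lpush.call_count == 3
# client.lpush.assert_called_once() would raise AssertionError here
```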