diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 25357e679999f4de870690aa1c2428faa5492691..658937aad7216c60be89e3761c8e8b82192f273b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -33,8 +33,11 @@ build-master: - IMAGE_5="$CI_REGISTRY_IMAGE/fluentd" - docker pull "$IMAGE_5":latest || true - docker build --cache-from "$IMAGE_5":latest -t "$IMAGE_5":latest -t "$IMAGE_5":$VERSION_5 fluentd/ - - chmod +x gitlab_test.sh - - ./gitlab_test.sh + - VERSION_6=`grep 'version="*"' ingestor/Dockerfile | cut -d '"' -f2` + - IMAGE_6="$CI_REGISTRY_IMAGE/pvs_ingestor" + - docker pull "$IMAGE_6":latest || true + - docker build --cache-from "$IMAGE_6":latest -t "$IMAGE_6":latest -t "$IMAGE_6":$VERSION_6 ingestor/ + - cd ./testing && ./gitlab_test.sh && cd - - docker push "$IMAGE_1":$VERSION_1 - docker push "$IMAGE_1":latest - docker push "$IMAGE_2":$VERSION_2 @@ -45,6 +48,8 @@ build-master: - docker push "$IMAGE_4":latest - docker push "$IMAGE_5":$VERSION_5 - docker push "$IMAGE_5":latest + - docker push "$IMAGE_6":$VERSION_6 + - docker push "$IMAGE_6":latest only: - master @@ -76,8 +81,10 @@ build: - docker pull "$IMAGE":latest || true - docker build --cache-from "$IMAGE":latest -t "$IMAGE":$CI_COMMIT_REF_SLUG -t "$IMAGE":$CI_COMMIT_REF_NAME fluentd/ - docker tag "$IMAGE:$CI_COMMIT_REF_NAME" "$IMAGE:latest" - - chmod +x gitlab_test.sh - - ./gitlab_test.sh + - IMAGE="$CI_REGISTRY_IMAGE/pvs_ingestor" + - docker pull "$IMAGE":$CI_COMMIT_REF_NAME || true + - docker build --cache-from "$IMAGE":latest -t "$IMAGE":$CI_COMMIT_REF_SLUG -t "$IMAGE":$CI_COMMIT_REF_NAME ingestor/ + - cd ./testing && ./gitlab_test.sh && cd - except: - master - + diff --git a/README.md b/README.md index 09e0cf3f2ba7e33316619592d04751723e523c3e..4e74a1aab9fd822b5250f9492434ecf25c4eed57 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,8 @@ docker build cache/ --cache-from registry.gitlab.eox.at/esa/prism/vs/pvs_cache - docker build preprocessor/ --cache-from registry.gitlab.eox.at/esa/prism/vs/pvs_preprocessor -t registry.gitlab.eox.at/esa/prism/vs/pvs_preprocessor docker build client/ --cache-from registry.gitlab.eox.at/esa/prism/vs/pvs_client -t registry.gitlab.eox.at/esa/prism/vs/pvs_client docker build fluentd/ --cache-from registry.gitlab.eox.at/esa/prism/vs/fluentd -t registry.gitlab.eox.at/esa/prism/vs/fluentd +docker build ingestor/ --cache-from registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor -t registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor + ``` Or pull them from the registry: ``` @@ -207,11 +209,9 @@ docker stack rm vhr18-pvs # stop stack docker volume rm vhr18-pvs_db-data # delete volumes docker volume rm vhr18-pvs_redis-data docker volume rm vhr18-pvs_traefik-data -docker volume rm vhr18-pvs_cache-db docker volume rm vhr18-pvs_instance-data ``` - ### Setup logging To access the logs, navigate to http://localhost:5601 . Ignore all of the fancy enterprise capabilities and select Kibana > Discover in the hamburger menu. @@ -221,6 +221,21 @@ Since we only have fluentd, you can just use `*` as index pattern. Select `@timestamp` as time field ([see also](https://www.elastic.co/guide/en/kibana/current/tutorial-define-index.html)). + +### Setup SFTP + +The `SFTP` image allows remote access to the 2 logging folders. You can define (edit/add) users, passwords and UID/GID in the respective configuration file (e.g. *config/vhr18_sftp_users.conf*).
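Entries in that file follow the `users.conf` convention of the `atmoz/sftp` image used by the stack, i.e. `user:password:uid:gid`. A purely illustrative entry (the real credentials are deliberately kept out of the repository) could look like:

```
eox:choose-a-password:1001:100
```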
+ +The default username is `eox`, once the stack is deployed you can sftp into the logging folders through port 2222 on -if you rn the dev stack- localhost : + +```bash +sftp -P 2222 eox@127.0.0.1 +``` +You will log in into`/home/eox/data` directory which contains the 2 logging directories : `to/panda` and `from/fepd` + + **NOTE:** The mounted directory that you are directed into is *`/home/user`*, where `user` is the username, hence when changing the username in the `.conf` file, the `sftp` mounted volumes path in `docker-compse.<collection>.yml` must change respectivly. + + # Documentation ## Installation diff --git a/cache/configure.sh b/cache/configure.sh index d2d3c3ceb60556161c26a26f033e4a59afe2045c..e362050f8d11d2ed6805113b927d90cae05667c6 100755 --- a/cache/configure.sh +++ b/cache/configure.sh @@ -25,8 +25,6 @@ cd - chown -R www-data:www-data "${INSTALL_DIR}" -mkdir -p "/cache-db/${COLLECTION}" - if [ ! -f "${APACHE_CONF}" ] ; then echo "Adding Apache configuration" diff --git a/cache/install.sh b/cache/install.sh index d7c73c713b74893b196c6f9acaeedbf504fd0de6..64859f909e9e87b78c30e282597597fe9178af7f 100755 --- a/cache/install.sh +++ b/cache/install.sh @@ -10,7 +10,7 @@ add-apt-repository -y ppa:schpidi2/eox apt update echo "Installing packages" -VERSION=1.8.0-1~bionic1eox6 +VERSION=1.10.0-1~bionic0eox2 DEBIAN_FRONTEND=noninteractive apt install -y \ libmapcache1=${VERSION} libapache2-mod-mapcache=${VERSION} mapcache-tools=${VERSION} \ sqlite3 curl apache2 python3-dateutil python3-redis wait-for-it postgresql-client diff --git a/config/dem_mapcache-dev.xml b/config/dem_mapcache-dev.xml index 42332ca2a01f47508c3cafd718e065f3039619b5..6a96f676635350eec2f45ba8c758d12f6c80dde2 100644 --- a/config/dem_mapcache-dev.xml +++ b/config/dem_mapcache-dev.xml @@ -1,4 +1,7 @@ <mapcache> + <connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/dem_mapcache-ops.xml b/config/dem_mapcache-ops.xml index a9f2a2aa67b9a3dcf96cb48384f9c85a00314b6e..9fb0f007876fb62570d586609609935b74866043 100644 --- a/config/dem_mapcache-ops.xml +++ b/config/dem_mapcache-ops.xml @@ -1,4 +1,7 @@ <mapcache> + <connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/dem_preprocessor-config.yml b/config/dem_preprocessor-config.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ae30fe3a7b0cf59ba735aabe6e05db529ff4fb1 --- /dev/null +++ b/config/dem_preprocessor-config.yml @@ -0,0 +1,55 @@ +source: + type: swift + kwargs: + username: !env '${OS_USERNAME_DOWNLOAD}' + password: !env '${OS_PASSWORD_DOWNLOAD}' + tenant_name: !env '${OS_TENANT_NAME_DOWNLOAD}' + tenant_id: !env '${OS_TENANT_ID_DOWNLOAD}' + region_name: !env '${OS_REGION_NAME_DOWNLOAD}' + auth_url: !env '${OS_AUTH_URL_DOWNLOAD}' + auth_version: !env '${ST_AUTH_VERSION_DOWNLOAD}' + user_domain_name: !env '${OS_USER_DOMAIN_NAME_DOWNLOAD}' +target: + type: swift + replace: false + kwargs: + username: !env '${OS_USERNAME}' + password: !env '${OS_PASSWORD}' + tenant_name: !env '${OS_TENANT_NAME}' + tenant_id: !env '${OS_TENANT_ID}' + region_name: !env '${OS_REGION_NAME}' + auth_version: !env '${ST_AUTH_VERSION}' + auth_url: !env '${OS_AUTH_URL}' + user_domain_name: !env '${OS_USER_DOMAIN_NAME}' + container: !env '${UPLOAD_CONTAINER}' +workdir: /tmp +keep_temp: 
false +metadata_glob: '*GSC*.xml' +type_extractor: + xpath: + - /gsc:report/gsc:sar_metadata/gml:metaDataProperty/gsc:EarthObservationMetaData/eop:productType/text() +level_extractor: + # xpath can also be a list of xpaths to be tried one after another + xpath: substring-after(substring-after(/gsc:report/gsc:opt_metadata/gml:metaDataProperty/gsc:EarthObservationMetaData/eop:parentIdentifier/text(), '/'), '/') +preprocessing: + defaults: + move_files: true + data_file_globs: # needs to be taken only from DEM sub-folder, otherwise previews get in + - '**/DEM/*.dt2' + - '**/DEM/*.dt1' + - '**/DEM/*.dt0' + - '**/DEM/*.tif' + output: + options: + format: COG + dstSRS: 'EPSG:4326' + dstNodata: 0 + creationOptions: + - BLOCKSIZE=512 + - COMPRESS=DEFLATE + - NUM_THREADS=8 + - BIGTIFF=IF_SAFER + - OVERVIEWS=AUTO + types: + SAR_DGE_30: # just to pass validation + nested: true diff --git a/config/dem_sftp_users.conf b/config/dem_sftp_users.conf new file mode 100644 index 0000000000000000000000000000000000000000..a26dadbcb59ae265463143670502c500c72f06c1 --- /dev/null +++ b/config/dem_sftp_users.conf @@ -0,0 +1 @@ +***REMOVED***:1001:100 \ No newline at end of file diff --git a/config/emg_init-db.sh b/config/emg_init-db.sh index 73f9c1129b1767c66ccd09c6e920e143190b9328..5944a22d495d77b97a2b2e7dfa9cee068742c984 100644 --- a/config/emg_init-db.sh +++ b/config/emg_init-db.sh @@ -92,7 +92,7 @@ if python3 manage.py id check "${COLLECTION}"; then --blue-range 0 255 \ --red-nodata 0 \ --green-nodata 0 \ - --blue-nodata 0 + --blue-nodata 0 # EQ02_3 python3 manage.py producttype create "${COLLECTION}"_Product_EQ02_3 --traceback \ --coverage-type "RGB" @@ -125,7 +125,7 @@ if python3 manage.py id check "${COLLECTION}"; then --blue-range 0 500 \ --red-nodata 0 \ --green-nodata 0 \ - --blue-nodata 0 + --blue-nodata 0 # EQ02_4 python3 manage.py producttype create "${COLLECTION}"_Product_EQ02_4 --traceback \ --coverage-type "RGBNir" @@ -204,7 +204,7 @@ if python3 manage.py id check "${COLLECTION}"; then --blue-range 0 800 \ --red-nodata 0 \ --green-nodata 0 \ - --blue-nodata 0 + --blue-nodata 0 # EW02_4 python3 manage.py producttype create "${COLLECTION}"_Product_EW02_4 --traceback \ --coverage-type "RGBNir" @@ -307,7 +307,7 @@ if python3 manage.py id check "${COLLECTION}"; then --blue-range 0 800 \ --red-nodata 0 \ --green-nodata 0 \ - --blue-nodata 0 + --blue-nodata 0 # EW03_4 python3 manage.py producttype create "${COLLECTION}"_Product_EW03_4 --traceback \ --coverage-type "RGBNir" @@ -851,19 +851,6 @@ if python3 manage.py id check "${COLLECTION}"; then python3 manage.py browsetype create "${COLLECTION}"_Product_SP07 "NDVI" --traceback \ --grey "(nir-red)/(nir+red)" --grey-range -1 1 # PH1A - python3 manage.py producttype create "${COLLECTION}"_Product_PH1A --traceback \ - --coverage-type "RGBNir" - python3 manage.py browsetype create "${COLLECTION}"_Product_PH1A --traceback \ - --red "red" \ - --green "green" \ - --blue "blue" \ - --red-range 1 1000 \ - --green-range 1 1000 \ - --blue-range 1 1000 \ - --red-nodata 0 \ - --green-nodata 0 \ - --blue-nodata 0 - # PH1A python3 manage.py producttype create "${COLLECTION}"_Product_PH1A --traceback \ --coverage-type "RGBNir" python3 manage.py browsetype create "${COLLECTION}"_Product_PH1A --traceback \ diff --git a/config/emg_mapcache-dev.xml b/config/emg_mapcache-dev.xml index c768c922323784283a61e9111860b00c28b8f4b8..279ecd3118ef90a7e0acaacb4526d8b10a0b50cb 100644 --- a/config/emg_mapcache-dev.xml +++ b/config/emg_mapcache-dev.xml @@ -1,4 +1,7 @@ <mapcache> + 
<connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/emg_mapcache-ops.xml b/config/emg_mapcache-ops.xml index bb4c17ab7c975b2a75de27aec62e19f4bf274f0f..02cd349edadae751c895b0e0d0fd36e848261ed4 100644 --- a/config/emg_mapcache-ops.xml +++ b/config/emg_mapcache-ops.xml @@ -1,4 +1,7 @@ <mapcache> + <connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/emg_preprocessor-config.yml b/config/emg_preprocessor-config.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc6fc7b08763e5069c0839ea9409f36c6336c271 --- /dev/null +++ b/config/emg_preprocessor-config.yml @@ -0,0 +1,173 @@ +source: + type: swift + kwargs: + username: !env{{OS_USERNAME_DOWNLOAD}} + password: "!env{{OS_PASSWORD_DOWNLOAD}}" + tenant_name: "!env{{OS_TENANT_NAME_DOWNLOAD}}" + tenant_id: "!env{{OS_TENANT_ID_DOWNLOAD}}" + region_name: "!env{{OS_REGION_NAME_DOWNLOAD}}" + auth_url: "!env{{OS_AUTH_URL_DOWNLOAD}}" + auth_version: "!env{{ST_AUTH_VERSION_DOWNLOAD}}" + user_domain_name: !env{{OS_USER_DOMAIN_NAME_DOWNLOAD}} +# target: +# type: swift +# kwargs: +# auth_version: !env{{ST_AUTH_VERSION}} +# auth_url: "!env{{OS_AUTH_URL}}" +# username: "!env{{OS_USERNAME}}" +# password: "!env{{OS_PASSWORD}}" +# tenant_name: !env{{OS_TENANT_NAME}} +# tenant_id: !env{{OS_TENANT_ID}} +# region_name: !env{{OS_REGION_NAME}} +# user_domain_name: !env{{OS_USER_DOMAIN_NAME}} +target: + type: local + replace: true + kwargs: + storage_path: /mnt/data/target + +workdir: /mnt/data/workdir +keep_temp: true + +# metadata file to look for in downloaded tar/zip file +metadata_glob: "*GSC*.xml" + +# extractors for Product type / level +type_extractor: + # xpath can also be a list of xpaths to be tried one after another + xpath: + - /gsc:report/gsc:opt_metadata/gml:using/eop:EarthObservationEquipment/eop:platform/eop:Platform/eop:shortName/text() + - /gsc:report/gsc:sar_metadata/gml:using/eop:EarthObservationEquipment/eop:platform/eop:Platform/eop:shortName/text() + map: # optional mapping from extracted type name to used product type name + PHR_FUS__3: PH00 + +level_extractor: + # xpath can also be a list of xpaths to be tried one after another + xpath: substring-after(substring-after(/gsc:report/gsc:opt_metadata/gml:metaDataProperty/gsc:EarthObservationMetaData/eop:parentIdentifier/text(), '/'), '/') + map: # optional mapping + + +preprocessing: + defaults: + output: + options: + # WarpOptions (see https://gdal.org/python/osgeo.gdal-module.html#WarpOptions) + format: COG + dstSRS: "EPSG:4326" + dstNodata: 0 + creationOptions: + - BLOCKSIZE=512 + - COMPRESS=DEFLATE + - NUM_THREADS=8 + - BIGTIFF=IF_SAFER + - OVERVIEWS=AUTO + types: + KS03: + nested: true + data_file_globs: + - "*.tif" + additional_file_globs: + - "*.rpc" + georeference: + stack_bands: + # stack all bands for each scene in the product + group_by: ".*/(.*)_P..tif" + sort_by: ".*_P(R|G|B|N).tif" + order: + - R + - G + - B + - N + + + + + RS02: # as extracted/translated above + # whether the package can contain sub-packages of TARs/ZIPs + nested: true + # glob selectors to look for source images in the source package + data_file_globs: + - "*.TIF" + additional_file_globs: + - "*.rpc" + + # a custom preprocessor function to be called on all selected files + # custom_preprocessor: + # 
path: "path.to.some.module:attribute" + # # TODO: specify args/kwargs and pass meaningful parameters + + georeference: + # georeference each file individually + # - type: geotransform # one of geotransform, RPC, GCP, world file + # - type: GCP + + + stack_bands: + # stack all bands for each scene in the product + group_by: # TODO: figure out a way to get a grouping. e.g: part of the filename using regex? + + output: + + # define a custom postprocessor function to be called on the processed file + # custom_postprocessor: + # path: "path.to.some.module:attribute" + # # TODO: specify args/kwargs and pass meaningful parameters + + CS00: + nested: true + data_file_globs: + - "*.h5" + + subdatasets: + data_file_glob: '*/*/*.h5' + subdataset_types: + '//S01/SBI': 'S01_SBI' + + georeference: + type: corners + corner_names: ["S01_SBI_Bottom_Left_Geodetic_Coordinates", "S01_SBI_Bottom_Right_Geodetic_Coordinates", "S01_SBI_Top_Left_Geodetic_Coordinates", "S01_SBI_Top_Right_Geodetic_Coordinates"] + orbit_direction_name: Orbit_Direction + force_north_up: false + # gcp_srid: + + calc: + formulas: + - inputs: + A: + glob: '*.tif' + band: 1 + B: + glob: '*.tif' + band: 2 + data_type: Float32 + formula: sqrt(A.astype(float)*A.astype(float)+B.astype(float)*B.astype(float)) + output_postfix: _proc + nodata_value: 0 + output: + options: + # WarpOptions (see https://gdal.org/python/osgeo.gdal-module.html#WarpOptions) + format: "COG" + dstSRS: "EPSG:3857" + dstNodata: 0 + creationOptions: + - BLOCKSIZE=512 + - COMPRESS=DEFLATE + - LEVEL=6 + - OVERVIEWS=AUTO + - NUM_THREADS=8 + - BIGTIFF=IF_SAFER + - RESAMPLING=CUBIC + CS01: + nested: true + data_file_globs: + - "*.h5" + + subdatasets: + data_file_glob: '*/*.h5' + subdataset_types: + '//S01/SBI': 'S01_SBI' + + georeference: + type: corners + +# this configuration is still a stub diff --git a/config/emg_sftp_users.conf b/config/emg_sftp_users.conf new file mode 100644 index 0000000000000000000000000000000000000000..a26dadbcb59ae265463143670502c500c72f06c1 --- /dev/null +++ b/config/emg_sftp_users.conf @@ -0,0 +1 @@ +***REMOVED***:1001:100 \ No newline at end of file diff --git a/config/vhr18_mapcache-dev.xml b/config/vhr18_mapcache-dev.xml index 7cf52772adc81ad07ec72616f6650f95404e175c..0935921a46bd44e8a948a088b8d9d2fb0a699984 100644 --- a/config/vhr18_mapcache-dev.xml +++ b/config/vhr18_mapcache-dev.xml @@ -1,4 +1,7 @@ <mapcache> + <connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/vhr18_mapcache-ops.xml b/config/vhr18_mapcache-ops.xml index b5afd384f6b2e1bb6c0e5d5eff82170266ce0a21..91ab89d0a27c0ba4a499f5bd4f1a2407c5b55c04 100644 --- a/config/vhr18_mapcache-ops.xml +++ b/config/vhr18_mapcache-ops.xml @@ -1,4 +1,7 @@ <mapcache> + <connection_pool> + <max_connections>15</max_connections> + </connection_pool> <default_format>mixed</default_format> <format name="mypng" type="PNG"> <compression>fast</compression> diff --git a/config/vhr18_preprocessor-config.yml b/config/vhr18_preprocessor-config.yml new file mode 100644 index 0000000000000000000000000000000000000000..359c52da54a21f67d2b38ab45d88671dbe404d23 --- /dev/null +++ b/config/vhr18_preprocessor-config.yml @@ -0,0 +1,53 @@ +source: + type: swift + kwargs: + username: !env '${OS_USERNAME_DOWNLOAD}' + password: !env '${OS_PASSWORD_DOWNLOAD}' + tenant_name: !env '${OS_TENANT_NAME_DOWNLOAD}' + tenant_id: !env '${OS_TENANT_ID_DOWNLOAD}' + region_name: !env 
'${OS_REGION_NAME_DOWNLOAD}' + auth_url: !env '${OS_AUTH_URL_DOWNLOAD}' + auth_version: !env '${ST_AUTH_VERSION_DOWNLOAD}' + user_domain_name: !env '${OS_USER_DOMAIN_NAME_DOWNLOAD}' +target: + type: swift + replace: false + kwargs: + username: !env '${OS_USERNAME}' + password: !env '${OS_PASSWORD}' + tenant_name: !env '${OS_TENANT_NAME}' + tenant_id: !env '${OS_TENANT_ID}' + region_name: !env '${OS_REGION_NAME}' + auth_version: !env '${ST_AUTH_VERSION}' + auth_url: !env '${OS_AUTH_URL}' + user_domain_name: !env '${OS_USER_DOMAIN_NAME}' + container: !env '${UPLOAD_CONTAINER}' +workdir: /tmp +keep_temp: false +metadata_glob: '*GSC*.xml' +type_extractor: + xpath: + - /gsc:report/gsc:opt_metadata/gml:using/eop:EarthObservationEquipment/eop:platform/eop:Platform/eop:shortName/text() +level_extractor: + # xpath can also be a list of xpaths to be tried one after another + xpath: substring-after(substring-after(/gsc:report/gsc:opt_metadata/gml:metaDataProperty/gsc:EarthObservationMetaData/eop:parentIdentifier/text(), '/'), '/') +preprocessing: + defaults: + move_files: true + data_file_globs: + - '*.tif' + - '*.jp2' + output: + options: + format: COG + dstSRS: 'EPSG:4326' + dstNodata: 0 + creationOptions: + - BLOCKSIZE=512 + - COMPRESS=DEFLATE + - NUM_THREADS=8 + - BIGTIFF=IF_SAFER + - OVERVIEWS=AUTO + types: + PH1B: # just to pass validation + nested: true diff --git a/config/vhr18_sftp_users.conf b/config/vhr18_sftp_users.conf new file mode 100644 index 0000000000000000000000000000000000000000..a26dadbcb59ae265463143670502c500c72f06c1 --- /dev/null +++ b/config/vhr18_sftp_users.conf @@ -0,0 +1 @@ +***REMOVED***:1001:100 \ No newline at end of file diff --git a/core/Dockerfile b/core/Dockerfile index 85a0d289043ed7fcbaf7437332fc68598051f381..33baf310685828ebc93ededfc5b9816786491945 100644 --- a/core/Dockerfile +++ b/core/Dockerfile @@ -70,7 +70,8 @@ ENV INSTANCE_ID="prism-view-server_core" \ REDIS_REGISTER_QUEUE_KEY= \ REDIS_REGISTERED_SET_KEY= \ INIT_SCRIPTS="/configure.sh" \ - COLLECT_STATIC="false" + COLLECT_STATIC="false" \ + REGISTRAR_REPLACE= ADD rgbnir_definition.json \ configure.sh \ diff --git a/core/configure.sh b/core/configure.sh index 943475d4bc273ce64e6f5814babfe8679f9d1d80..3966e886e6f9e09c4c1a41c3fe2b1542466db9e5 100644 --- a/core/configure.sh +++ b/core/configure.sh @@ -51,4 +51,4 @@ chmod g+w -R . chgrp users -R . -} 1> &2 \ No newline at end of file +} 1>&2 diff --git a/core/entrypoint.sh b/core/entrypoint.sh index 8d82bbf7d54a26f1e90d89ec61f4beec452e216b..a8e57f88358df480cba3f7e99640f455eb84f427 100644 --- a/core/entrypoint.sh +++ b/core/entrypoint.sh @@ -5,7 +5,7 @@ TIMEOUT=${WAIT_TIMEOUT:='15'} if [[ ! 
-z $SERVICES ]] ; then for service in $SERVICES ; do - wait-for-it -t $TIMEOUT $service > &2 + wait-for-it -t $TIMEOUT $service >&2 done fi diff --git a/core/initialized.sh b/core/initialized.sh index f0fdcf300b33b7eaf55ee431b0e9120af632b2e7..07fc196a6d0925e93e78227c33bfef0b53635832 100644 --- a/core/initialized.sh +++ b/core/initialized.sh @@ -1,4 +1,4 @@ #!/bin/bash -e touch "${INSTANCE_DIR}/.initialized" -echo "Instance ${INSTANCE_ID} is initialized" > &2 +echo "Instance ${INSTANCE_ID} is initialized" >&2 diff --git a/core/registrar.py b/core/registrar.py index c501b6cd22e2de7f695dba6018f2ee41e288b179..cb052a74a3433ecd6a261397525c28ca89f61c19 100644 --- a/core/registrar.py +++ b/core/registrar.py @@ -35,14 +35,17 @@ import argparse import textwrap import logging import traceback -import redis +from xml.sax.saxutils import escape +import subprocess +import redis import lxml.etree from swiftclient.service import SwiftService import django from django.db import transaction from django.contrib.gis.geos import GEOSGeometry +from osgeo import gdal path = os.path.join(os.getenv('INSTALL_DIR', "/var/www/pvs"), "pvs_instance") if path not in sys.path: @@ -52,7 +55,9 @@ os.environ.setdefault("DJANGO_SETTINGS_MODULE", "pvs_instance.settings") django.setup() from eoxserver.backends import access +from eoxserver.contrib import vsi from eoxserver.backends import models as backends +from eoxserver.core.util.timetools import isoformat from eoxserver.resources.coverages import models from eoxserver.resources.coverages.registration.product import ( ProductRegistrator @@ -63,7 +68,6 @@ from eoxserver.resources.coverages.registration.registrators.gdal import ( logger = logging.getLogger(__name__) - def setup_logging(verbosity): # start logging setup # get command line level @@ -87,6 +91,16 @@ def setup_logging(verbosity): # finished logging setup +def set_gdal_swift_auth(): + # parsing command line output of swift auth + auth_keys = subprocess.check_output(["swift", "auth"]).decode(sys.stdout.encoding).split("\n") + storage_url = auth_keys[0].split("OS_STORAGE_URL=")[1] + auth_token = auth_keys[1].split("OS_AUTH_TOKEN=")[1] + # setting gdal config + gdal.SetConfigOption("SWIFT_STORAGE_URL", storage_url) + gdal.SetConfigOption("SWIFT_AUTH_TOKEN", auth_token) + + def add_mask(product): metadata_item = product.metadata_items.all()[0] with access.vsi_open(metadata_item) as f: @@ -98,6 +112,7 @@ def add_mask(product): )[0] geometry = GEOSGeometry(wkt) mask_type = models.MaskType.objects.get(product_type=product.product_type) + logger.debug("Adding mask") models.Mask.objects.create( product=product, mask_type=mask_type, @@ -115,7 +130,7 @@ def get_product_type_and_level(metadata_item): xp = '//gml:using/eop:EarthObservationEquipment/eop:platform/eop:Platform/eop:shortName/text()' product_type_name = tree.xpath(xp, namespaces=root.nsmap)[0] except Exception as e: - logger.warning( + logger.debug( 'Failed to determine product type of %s, error was %s' % (metadata_item.location, e) ) @@ -124,7 +139,6 @@ def get_product_type_and_level(metadata_item): xp = '//gml:metaDataProperty/gsc:EarthObservationMetaData/eop:parentIdentifier/text()' parent_identifier = tree.xpath(xp, namespaces=root.nsmap)[0] - print("parent identifier --->", parent_identifier) if parent_identifier.endswith('Level_1'): level = 'Level_1' if parent_identifier.endswith('Level_3'): @@ -132,7 +146,7 @@ def get_product_type_and_level(metadata_item): else: raise Exception('Invalid parent identifier type name %s' % parent_identifier) except Exception as 
e: - logger.warning( + logger.debug( 'Failed to determine product level of %s, error was %s' % (metadata_item.location, e) ) @@ -140,36 +154,72 @@ def get_product_type_and_level(metadata_item): return product_type_name, level +def get_product_collection(metadata_file): + # in case collection needs to be determined from metadata + try: + if metadata_file.startswith("/vsiswift"): + set_gdal_swift_auth() + with vsi.open(metadata_file, "r") as f: + tree = lxml.etree.parse(f) + root = tree.getroot() + xp = '//gml:metaDataProperty/gsc:EarthObservationMetaData/eop:parentIdentifier/text()' + product_type_name = tree.xpath(xp, namespaces=root.nsmap) + extracted = product_type_name[0].split('/')[0] + return extracted + except Exception as e: + logger.debug( + 'Failed to determine product collection for metadata file %s, error was %s' + % (metadata_file, e) + ) + + +def get_product_type_from_band_count(product_type_name, file_path): + # get raster band count via gdal + logger.debug("Opening file using GDAL: %s" % file_path) + if file_path.startswith("/vsiswift"): + set_gdal_swift_auth() + src_ds = gdal.Open(file_path) + if src_ds is None: + raise RegistrationError("Band check: failed to open dataset: %s " % file_path) + # try to fetch product model with _bandcount + product_type_name_upd = "%s_%s" % (product_type_name, src_ds.RasterCount) + try: + product_type_model = models.ProductType.objects.get(name=product_type_name_upd) + return product_type_model + except models.ProductType.DoesNotExist: + raise RegistrationError("Product Type: '%s' was not found" % product_type_name_upd) + + class RegistrationError(Exception): pass @transaction.atomic def registrar( - collection, - objects_prefix, upload_container, replace=False, client=None, registered_set_key=None + collection_stack, + objects_prefix, upload_container=None, replace=False, client=None, registered_set_key=None, + reporting_dir=None, service_url=None + ): logger.info("Starting registration of product '%s'." % objects_prefix) - metadata_package, data_package, has_vrt = None, None, None - + metadata_package, data_package = None, None + if not upload_container: + # assuming objects_prefix = bucket/itemname + upload_container = objects_prefix.partition("/")[0] + objects_prefix = objects_prefix.partition("/")[2] with SwiftService() as swift: list_parts_gen = swift.list( container=upload_container, options={"prefix": objects_prefix}, ) for page in list_parts_gen: - print(page) if page["success"]: for item in page["listing"]: if item["name"].endswith(".xml"): metadata_package = item["name"] elif item["name"].endswith(".TIF") or \ item["name"].endswith(".tif"): - if has_vrt is not True: - data_package = item["name"] - elif item["name"].endswith(".vrt"): data_package = item["name"] - has_vrt = True elif not item["name"].endswith(".tar"): raise RegistrationError( "Product with objects prefix '%s' has " @@ -187,58 +237,95 @@ def registrar( "Product with objects prefix '%s' has missing content." % objects_prefix ) + logger.debug("Found objects '%s' and '%s'." 
% (data_package, metadata_package)) storage = backends.Storage.objects.get(name=upload_container) metadata_item = models.MetaDataItem(storage=storage, location=metadata_package) product_type, level = get_product_type_and_level(metadata_item) + if collection_stack == 'DEM': + # special for DEM files, collection name === product_type + gdal_metadata_file_path = "/vsiswift/%s/%s" % (upload_container, metadata_package) + product_type = get_product_collection(gdal_metadata_file_path) + logger.debug("Registering product") + product_type_name = "%s_Product_%s" % (collection_stack, product_type) + + try: + # first find product type by name from path + product_type_model = models.ProductType.objects.get(name=product_type_name) + except models.ProductType.DoesNotExist: + # if not found, maybe there are more product types with _bandcount suffix + gdal_file_path = "/vsiswift/%s/%s" % (upload_container, data_package) + product_type_model = get_product_type_from_band_count(product_type_name, gdal_file_path) + product_type_name = product_type_model.name + coverage_type_names = product_type_model.allowed_coverage_types.all() + if len(coverage_type_names) > 1: + logger.warning("More available 'CoverageType' found, selecting the first one.") + coverage_type_name = coverage_type_names[0].name product, replaced = ProductRegistrator().register( metadata_locations=[[upload_container, metadata_package, ], ], - type_name="%s_Product_%s" % (collection, product_type), + type_name=product_type_name, replace=replace, extended_metadata=True, mask_locations=None, package_path=None, + simplify_footprint_tolerance=0.0001, # ~10meters overrides={}, ) + if product.footprint.empty: + product.delete() + raise RegistrationError("No footprint was extracted. full product: %s" % product) collection = models.Collection.objects.get( - identifier=collection + identifier=collection_stack ) + logger.debug("Inserting product into collection %s" % collection_stack) models.collection_insert_eo_object(collection, product) + if collection_stack == "DEM": + # also insert it to its own collection + collection_own = models.Collection.objects.get( + identifier="%s_%s" % (collection, product_type) + ) + logger.debug("Inserting product to collection %s_%s" % (collection, product_type)) + models.collection_insert_eo_object(collection_own, product) + if level == 'Level_1': collection_level_1 = models.Collection.objects.get( identifier="%s_Level_1" % collection ) + logger.debug("Inserting product to collection %s_Level_1" % collection) models.collection_insert_eo_object(collection_level_1, product) elif level == 'Level_3': collection_level_3 = models.Collection.objects.get( identifier="%s_Level_3" % collection ) + logger.debug("Inserting product to collection %s_Level_3" % collection) models.collection_insert_eo_object(collection_level_3, product) + logger.debug("Registering coverage") report = GDALRegistrator().register( data_locations=[[upload_container, data_package, ], ], metadata_locations=[[upload_container, metadata_package, ], ], - coverage_type_name="RGBNir", + coverage_type_name=coverage_type_name, overrides={ "identifier": "%s__coverage" % product.identifier, "footprint": None, }, replace=replace, ) + logger.debug("Adding coverage to product") models.product_add_coverage(product, report.coverage) try: add_mask(product) except Exception as e: - logger.info("Couldn't add mask.") + logger.debug("Couldn't add mask.") logger.debug(traceback.format_exc()) - logger.warning("%s: %s\n" % (type(e).__name__, str(e))) + logger.debug("%s: %s\n" % 
(type(e).__name__, str(e))) if client is not None: logger.debug( @@ -252,6 +339,40 @@ def registrar( ) ) + timestamp = product.inserted.strftime("%Y%m%dT%H%M%S") + + if reporting_dir is not None: + with open(os.path.join(reporting_dir, 'item_%s_%s.xml' % (timestamp, product.identifier)),'w') as f: + f.write(textwrap.dedent(""" + <?xml version="1.0" encoding="UTF-8"?> + <DataAccessItem + xsi:schemaLocation="http://www.telespazio.com/CSCDA/CDD/PDAS PDAS_interfaces%2020190924_1916.xsd" + xmlns="http://www.telespazio.com/CSCDA/CDD/PDAS" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <identifier>{identifier}</identifier> + <BROWSE_AVAILABILITY_DATETIME>{availability_time}</BROWSE_AVAILABILITY_DATETIME> + <URL> + <Service>WCS</Service> + <URL>{wms_capabilities_url}</URL> + </URL> + <URL> + <Service>WMS</Service> + <URL>{wcs_capabilities_url}</URL> + </URL> + </DataAccessItem> + """.format( + identifier=escape(product.identifier), + availability_time=escape(isoformat(product.inserted)), + wcs_capabilities_url=escape( + '%s/ows?service=wcs&request=GetCapabilities&cql=identifier="%s"' + % (service_url, product.identifier) + ), + wms_capabilities_url=escape( + '%s/ows?service=wms&request=GetCapabilities&cql=identifier="%s"' + % (service_url, product.identifier) + ), + ))) + logger.info( "Successfully finished registration of product '%s'." % objects_prefix ) @@ -263,6 +384,8 @@ def registrar_redis_wrapper( replace=False, host="localhost", port=6379, register_queue_key="register_queue", registered_set_key="registered_set", + reporting_dir=None, + service_url=None, ): client = redis.Redis( host=host, port=port, charset="utf-8", decode_responses=True @@ -277,7 +400,9 @@ def registrar_redis_wrapper( upload_container, replace=replace, client=client, - registered_set_key=registered_set_key + registered_set_key=registered_set_key, + reporting_dir=reporting_dir, + service_url=service_url, ) except Exception as e: logger.debug(traceback.format_exc()) @@ -321,6 +446,12 @@ if __name__ == "__main__": parser.add_argument( "--redis-port", type=int, default=6379 ) + parser.add_argument( + "--reporting-dir", + ) + parser.add_argument( + "--service-url", + ) parser.add_argument( "-v", "--verbosity", type=int, default=3, choices=[0, 1, 2, 3, 4], @@ -341,8 +472,7 @@ if __name__ == "__main__": upload_container = os.environ.get('UPLOAD_CONTAINER') if upload_container is None: - logger.critical("UPLOAD_CONTAINER environment variable not set.") - sys.exit(1) + logger.warn("UPLOAD_CONTAINER environment variable not set. 
Assuming part of path bucket/item") if arg_values.mode == "standard": registrar( @@ -350,6 +480,8 @@ if __name__ == "__main__": arg_values.objects_prefix, upload_container, replace=arg_values.replace, + reporting_dir=arg_values.reporting_dir, + service_url=arg_values.service_url, ) else: registrar_redis_wrapper( @@ -360,4 +492,6 @@ if __name__ == "__main__": port=arg_values.redis_port, register_queue_key=arg_values.redis_register_queue_key, registered_set_key=arg_values.redis_registered_set_key, + reporting_dir=arg_values.reporting_dir, + service_url=arg_values.service_url, ) diff --git a/core/run-httpd.sh b/core/run-httpd.sh index 30f876a5eb22d6b07381e9be28258a79b3b57b65..44498e3c1f6338dd0ecb4a49941038e544f44a49 100644 --- a/core/run-httpd.sh +++ b/core/run-httpd.sh @@ -2,4 +2,4 @@ echo "Running gunicorn" -exec gunicorn --chdir ${INSTALL_DIR}/pvs_instance/ --bind :80 pvs_instance.wsgi:application --workers 8 --max-requests 10 --max-requests-jitter 3 --worker-class sync --timeout 120 --access-logfile - --error-logfile - --log-level warning --disable-redirect-access-to-syslog 2> &1 +exec gunicorn --chdir ${INSTALL_DIR}/pvs_instance/ --bind :80 pvs_instance.wsgi:application --workers 8 --max-requests 10 --max-requests-jitter 3 --worker-class sync --timeout 120 --access-logfile - --error-logfile - --log-level warning --disable-redirect-access-to-syslog 2>&1 diff --git a/core/run-registrar.sh b/core/run-registrar.sh index 088f4bfa4b6cefb06e868ae6159021361d1de1bc..348b4f75081870185eb370e84b568f3f46254cd6 100644 --- a/core/run-registrar.sh +++ b/core/run-registrar.sh @@ -1,5 +1,18 @@ #!/bin/sh -echo "Running registrar" > &2 +echo "Running registrar" >&2 +replace="" +if test "$REGISTRAR_REPLACE" = true; then + replace="--replace" +fi -python3 /registrar.py --mode redis --redis-host ${REDIS_HOST} --redis-port ${REDIS_PORT} --redis-register-queue-key ${REDIS_REGISTER_QUEUE_KEY} --redis-registered-set-key ${REDIS_REGISTERED_SET_KEY} > &2 +python3 /registrar.py \ + --mode redis \ + --redis-host ${REDIS_HOST} \ + --redis-port ${REDIS_PORT} \ + --redis-register-queue-key ${REDIS_REGISTER_QUEUE_KEY} \ + --redis-registered-set-key ${REDIS_REGISTERED_SET_KEY} \ + --redis-registered-set-key ${REDIS_REGISTERED_SET_KEY} \ + --reporting-dir ${REPORTING_DIR} \ + --service-url ${SERVICE_URL} \ + ${replace} >&2 diff --git a/core/wait-initialized.sh b/core/wait-initialized.sh index da9746eeb2f90aec198cb15c97580baa2117421a..95afa5bde5cc5642492125f53e914d0d8ddfa9a1 100644 --- a/core/wait-initialized.sh +++ b/core/wait-initialized.sh @@ -1,7 +1,7 @@ #!/bin/bash -e until [ -f "${INSTANCE_DIR}/.initialized" ] ; do - echo "Waiting until instance ${INSTANCE_ID} is initialized" > &2 + echo "Waiting until instance ${INSTANCE_ID} is initialized" >&2 sleep 3 # TODO: timeout? 
done diff --git a/docker-compose.dem.dev.yml b/docker-compose.dem.dev.yml index 3e213c68cebc843d445f5ec9c4110bd6cb44b28b..8ac49a6c078ec3cdc21e82686f8aee3543ea070f 100644 --- a/docker-compose.dem.dev.yml +++ b/docker-compose.dem.dev.yml @@ -9,8 +9,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" client: ports: - "80:80" @@ -21,8 +19,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" renderer: ports: - "81:80" @@ -31,8 +27,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" registrar: volumes: - type: bind @@ -53,8 +47,6 @@ services: configs: - source: mapcache-dev target: /mapcache-template.xml - logging: - driver: "fluentd" preprocessor: volumes: - type: tmpfs @@ -65,8 +57,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" networks: extnet: name: dem-extnet diff --git a/docker-compose.dem.yml b/docker-compose.dem.yml index 7b496f1be491dd5c2f8d9989ddfab8c611b96cfd..16de9b1d7b9d8baf45e384ccc1aaf6d24c720558 100644 --- a/docker-compose.dem.yml +++ b/docker-compose.dem.yml @@ -14,6 +14,11 @@ services: constraints: [node.role == manager] networks: - intnet + command: ["postgres", "-c", "max_connections=300"] + sysctls: + net.ipv4.tcp_keepalive_time: 600 + net.ipv4.tcp_keepalive_probes: 5 + net.ipv4.tcp_keepalive_intvl: 10 redis: image: redis volumes: @@ -28,9 +33,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db - type: volume source: instance-data target: /var/www/pvs @@ -60,9 +62,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/dem.env - env/dem_db.env @@ -83,9 +82,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/dem.env - env/dem_obs.env @@ -106,10 +102,12 @@ services: - env/dem.env - env/dem_obs.env - env/dem_redis.env - - env/dem_preprocessor.env environment: INSTANCE_ID: "prism-view-server_preprocessor" WAIT_SERVICES: "redis:6379" + configs: + - source: preprocessor-config + target: /config.yaml deploy: replicas: 1 networks: @@ -124,6 +122,9 @@ services: - type: volume source: instance-data target: /var/www/pvs + - type: volume + source: report-data + target: /mnt/reports/ env_file: - env/dem.env - env/dem_db.env @@ -138,6 +139,7 @@ services: INIT_SCRIPTS: "/configure.sh /init-db.sh /initialized.sh" STARTUP_SCRIPTS: "/wait-initialized.sh" WAIT_SERVICES: "redis:6379 database:5432" + REPORTING_DIR: '/mnt/reports/' configs: - source: init-db target: /init-db.sh @@ -151,7 +153,36 @@ services: image: registry.gitlab.eox.at/esa/prism/vs/pvs_client:latest deploy: replicas: 1 + sftp: + image: atmoz/sftp:latest + volumes: + - type: volume + source: report-data + target: /home/eox/data/to/panda + - type: volume + source: from-fepd + target: /home/eox/data/from/fepd + configs: + - source: sftp-users + target: /etc/sftp/users.conf + + ports: + - "2222:22" + deploy: + replicas: 1 + ingestor: + image: registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor:latest + env_file: + - env/dem_redis.env + environment: + INSTANCE_ID: "prism-view-server_ingestor" + deploy: + replicas: 1 + networks: + - intnet configs: + sftp-users: + file: ./config/dem_sftp_users.conf init-db: file: ./config/dem_init-db.sh mapcache-dev: @@ -162,10 +193,13 @@ configs: file: ./config/dem_index-dev.html client-ops: file: ./config/dem_index-ops.html + preprocessor-config: + file: 
./config/dem_preprocessor-config.yml volumes: db-data: redis-data: - cache-db: instance-data: + from-fepd: + report-data: networks: intnet: diff --git a/docker-compose.emg.dev.yml b/docker-compose.emg.dev.yml index 8e8c7d65bfbddd9875a8e4e9441ab53510c040d9..af436d2e83c8e6a65fe69ad5473896b4ec523bd1 100644 --- a/docker-compose.emg.dev.yml +++ b/docker-compose.emg.dev.yml @@ -9,8 +9,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" client: ports: - "80:80" @@ -21,8 +19,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" renderer: ports: - "81:80" @@ -31,8 +27,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" registrar: volumes: - type: bind @@ -41,8 +35,6 @@ services: - type: bind source: ./core/ target: /core/ - logging: - driver: "fluentd" cache: ports: - "83:80" @@ -53,8 +45,6 @@ services: configs: - source: mapcache-dev target: /mapcache-template.xml - logging: - driver: "fluentd" preprocessor: volumes: - type: tmpfs @@ -65,8 +55,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" networks: extnet: name: emg-extnet diff --git a/docker-compose.emg.yml b/docker-compose.emg.yml index 807f1e48fa2fe19dbf0ec94e649653e81a215abb..4fcd5168c7b88c936527fe5412ec83d8e6fd1add 100644 --- a/docker-compose.emg.yml +++ b/docker-compose.emg.yml @@ -14,6 +14,11 @@ services: constraints: [node.role == manager] networks: - intnet + command: ["postgres", "-c", "max_connections=300"] + sysctls: + net.ipv4.tcp_keepalive_time: 600 + net.ipv4.tcp_keepalive_probes: 5 + net.ipv4.tcp_keepalive_intvl: 10 redis: image: redis volumes: @@ -28,9 +33,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db - type: volume source: instance-data target: /var/www/pvs @@ -60,9 +62,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/emg.env - env/emg_db.env @@ -83,9 +82,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/emg.env - env/emg_obs.env @@ -100,16 +96,28 @@ services: - intnet command: ["/run-seeder.sh"] + ingestor: + image: registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor:latest + env_file: + - env/emg_redis.env + environment: + INSTANCE_ID: "prism-view-server_ingestor" + deploy: + replicas: 1 + networks: + - intnet preprocessor: image: registry.gitlab.eox.at/esa/prism/vs/pvs_preprocessor:latest env_file: - env/emg.env - env/emg_obs.env - env/emg_redis.env - - env/emg_preprocessor.env environment: INSTANCE_ID: "prism-view-server_preprocessor" WAIT_SERVICES: "redis:6379" + configs: + - source: preprocessor-config + target: /config.yaml deploy: replicas: 1 networks: @@ -124,6 +132,9 @@ services: - type: volume source: instance-data target: /var/www/pvs + - type: volume + source: report-data + target: /mnt/reports/ env_file: - env/emg.env - env/emg_db.env @@ -138,6 +149,7 @@ services: INIT_SCRIPTS: "/configure.sh /init-db.sh /initialized.sh" STARTUP_SCRIPTS: "/wait-initialized.sh" WAIT_SERVICES: "redis:6379 database:5432" + REPORTING_DIR: '/mnt/reports/' configs: - source: init-db target: /init-db.sh @@ -151,7 +163,26 @@ services: image: registry.gitlab.eox.at/esa/prism/vs/pvs_client:latest deploy: replicas: 1 + sftp: + image: atmoz/sftp:latest + volumes: + - type: volume + source: report-data + target: /home/eox/data/to/panda + - type: volume + source: from-fepd + target: 
/home/eox/data/from/fepd + configs: + - source: sftp-users + target: /etc/sftp/users.conf + + ports: + - "2222:22" + deploy: + replicas: 1 configs: + sftp-users: + file: ./config/emg_sftp_users.conf init-db: file: ./config/emg_init-db.sh mapcache-dev: @@ -162,10 +193,13 @@ configs: file: ./config/emg_index-dev.html client-ops: file: ./config/emg_index-ops.html + preprocessor-config: + file: ./config/emg_preprocessor-config.yml volumes: db-data: redis-data: - cache-db: instance-data: + from-fepd: + report-data: networks: intnet: diff --git a/docker-compose.logging.dev.yml b/docker-compose.logging.dev.yml index 6ce3f23db24f96836bfef5b3f17bc2af06d83931..d749cb97edd584b85c8bbe46b708d06e2653ee1f 100644 --- a/docker-compose.logging.dev.yml +++ b/docker-compose.logging.dev.yml @@ -11,6 +11,24 @@ services: resources: limits: memory: 500M + database: + logging: + driver: "fluentd" + client: + logging: + driver: "fluentd" + renderer: + logging: + driver: "fluentd" + registrar: + logging: + driver: "fluentd" + cache: + logging: + driver: "fluentd" + preprocessor: + logging: + driver: "fluentd" kibana: ports: - "5601:5601" diff --git a/docker-compose.vhr18.dev.yml b/docker-compose.vhr18.dev.yml index 1a576b4abd0748a1d93811616801719a3568df97..e7c46c3f2348a66e5fa40df62c5c86dc5557d3de 100644 --- a/docker-compose.vhr18.dev.yml +++ b/docker-compose.vhr18.dev.yml @@ -9,8 +9,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" client: ports: - "80:80" @@ -21,8 +19,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" renderer: ports: - "81:80" @@ -31,8 +27,6 @@ services: - type: bind source: ./data/ target: /data/ - logging: - driver: "fluentd" registrar: volumes: - type: bind @@ -41,8 +35,6 @@ services: - type: bind source: ./core/ target: /core/ - logging: - driver: "fluentd" cache: ports: - "83:80" @@ -53,8 +45,6 @@ services: configs: - source: mapcache-dev target: /mapcache-template.xml - logging: - driver: "fluentd" preprocessor: volumes: - type: tmpfs @@ -62,8 +52,6 @@ services: - type: bind source: ./preprocessor/ target: /preprocessor/ - logging: - driver: "fluentd" networks: extnet: name: vhr18-extnet diff --git a/docker-compose.vhr18.yml b/docker-compose.vhr18.yml index 870e34271a010f8c0689fd2e3afd9d491a0dca41..dbaa05bf63427e90deaf1fa82f1f81488214bb5b 100644 --- a/docker-compose.vhr18.yml +++ b/docker-compose.vhr18.yml @@ -14,6 +14,11 @@ services: constraints: [node.role == manager] networks: - intnet + command: ["postgres", "-c", "max_connections=300"] + sysctls: + net.ipv4.tcp_keepalive_time: 600 + net.ipv4.tcp_keepalive_probes: 5 + net.ipv4.tcp_keepalive_intvl: 10 redis: image: redis volumes: @@ -28,9 +33,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db - type: volume source: instance-data target: /var/www/pvs @@ -60,9 +62,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/vhr18.env - env/vhr18_db.env @@ -86,9 +85,6 @@ services: volumes: - type: tmpfs target: /tmp - - type: volume - source: cache-db - target: /cache-db env_file: - env/vhr18.env - env/vhr18_obs.env @@ -103,16 +99,28 @@ services: - intnet command: ["/run-seeder.sh"] + ingestor: + image: registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor:latest + env_file: + - env/vhr18_redis.env + environment: + INSTANCE_ID: "prism-view-server_ingestor" + deploy: + replicas: 1 + networks: + - intnet preprocessor: image: 
registry.gitlab.eox.at/esa/prism/vs/pvs_preprocessor:latest env_file: - env/vhr18.env - env/vhr18_obs.env - env/vhr18_redis.env - - env/vhr18_preprocessor.env environment: INSTANCE_ID: "prism-view-server_preprocessor" WAIT_SERVICES: "redis:6379" + configs: + - source: preprocessor-config + target: /config.yaml deploy: replicas: 1 networks: @@ -127,6 +135,9 @@ services: - type: volume source: instance-data target: /var/www/pvs + - type: volume + source: report-data + target: /mnt/reports/ env_file: - env/vhr18.env - env/vhr18_db.env @@ -141,6 +152,7 @@ services: INIT_SCRIPTS: "/configure.sh /init-db.sh /initialized.sh" STARTUP_SCRIPTS: "/wait-initialized.sh" WAIT_SERVICES: "redis:6379 database:5432" + REPORTING_DIR: '/mnt/reports/' configs: - source: init-db target: /init-db.sh @@ -154,7 +166,30 @@ services: image: registry.gitlab.eox.at/esa/prism/vs/pvs_client:latest deploy: replicas: 1 + sftp: + image: atmoz/sftp:latest + volumes: + - type: volume + source: report-data + target: /home/eox/data/to/panda + - type: volume + source: from-fepd + target: /home/eox/data/from/fepd + configs: + - source: sftp-users + target: /etc/sftp/users.conf + deploy: + replicas: 1 + + ports: + - "2222:22" + ingestor: + image: registry.gitlab.eox.at/esa/prism/vs/pvs_ingestor:latest + deploy: + replicas: 1 configs: + sftp-users: + file: ./config/vhr18_sftp_users.conf init-db: file: ./config/vhr18_init-db.sh mapcache-dev: @@ -165,10 +200,13 @@ configs: file: ./config/vhr18_index-dev.html client-ops: file: ./config/vhr18_index-ops.html + preprocessor-config: + file: ./config/vhr18_preprocessor-config.yml volumes: db-data: redis-data: - cache-db: instance-data: + from-fepd: + report-data: networks: intnet: diff --git a/documentation/operator-guide/configuration.rst b/documentation/operator-guide/configuration.rst index 327832e46be3b71202293e4df20d59354518231b..578bd680b47cbe2e61596c6357ca9edbf89c2866 100644 --- a/documentation/operator-guide/configuration.rst +++ b/documentation/operator-guide/configuration.rst @@ -117,8 +117,6 @@ The following ``.env`` files are typically used: django admin user to be used with the admin GUI. * ``<stack-name>_obs.env``: This contains access parameters for the object storage(s). -* ``<stack-name>_preprocessor.env``: Preprocessor related environment - variables * ``<stack-name>_redis.env``: Redis access credentials and queue names @@ -173,6 +171,7 @@ retrieve the original product files: * ``OS_REGION_NAME_DOWNLOAD`` * ``OS_AUTH_URL_DOWNLOAD`` * ``ST_AUTH_VERSION_DOWNLOAD`` +* ``OS_USER_DOMAIN_NAME_DOWNLOAD`` VS Environment Variables ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -193,6 +192,8 @@ parts. to. * ``DJANGO_USER``, ``DJANGO_MAIL``, ``DJANGO_PASSWORD`` - The Django admin user account credentials to use the Admin GUI. +* ``REPORTING_DIR`` - + This sets the directory to write the reports of the registered products to. .. note:: These variables are used during the initial stack setup. When these @@ -240,18 +241,256 @@ such a configuration file is defined and the used in a service: The following configuration files are used throughout the VS: -* ``<stack-name>_init-db.sh``: This shell script file's purpose is to set up - the EOxServer instance used by both the renderer and registrar. -* ``<stack-name>_index-dev.html``/``<stack-name>_index-ops.html``: The - clients main HTML page, containing various client settings. The ``dev`` one - is used for development only, whereas the ``ops`` one is used for operational - deployment. 
-* ``<stack-name>_mapcache-dev.xml``/``<stack-name>_mapcache-ops.xml``: The - configuration file for MapCache, the software powering the cache service. - Similarly to the client configuration files, the ``dev`` and ``ops`` files - used for development and operational usage respectively. Further - documentation can be found at `the official site - <https://mapserver.org/mapcache/config.html>`_. +``<stack-name>_init-db.sh`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This shell script file's purpose is to set up the EOxServer instance used by +both the renderer and registrar. + +``<stack-name>_index-dev.html``/``<stack-name>_index-ops.html`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The clients main HTML page, containing various client settings. The ``dev`` one +is used for development only, whereas the ``ops`` one is used for operational +deployment. + +``<stack-name>_mapcache-dev.xml``/``<stack-name>_mapcache-ops.xml`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The configuration file for MapCache, the software powering the cache service. +Similarly to the client configuration files, the ``dev`` and ``ops`` files +used for development and operational usage respectively. Further +documentation can be found at `the official site +<https://mapserver.org/mapcache/config.html>`_. + +``<stack-name>_preprocessor-config.yaml`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The configuration for the proprocessing service to use to process to be +ingested files. + +The files are using YAML as a format and are structured in the following +fashion: + +source/target + + Here, the source file storage and the target file storage are configured. + This can either be a local directory or an OpenStack Swift object storage. + +workdir + + The workdir can be configured, to determine where the intermediate files are + placed. This can be convenient for debugging and development. + +keep_temp + + This boolean decides if the temporary directory for the preprocessing will be + cleaned up after being finished. Also, convenient for development. + +metadata_glob + + This file glob is used to determine the main metadata file to extract the + product type from. This file will be searched in the downloaded package. + +glob_case + + If all globs will be used in a case-sensitive way. + +type_extractor + + This setting configures how the product type is extracted from the previously + extracted metadata. In the ``xpath`` setting one or more XPath expressions + can supplied to fetch the product type. Each XPath will be tried until one is + found that produces a result. These results can then be mapped using the + ``map`` dictionary. + +level_extractor + + This section works very similar to the ``type_extractor`` but only for the + product level. The product level is currently not used. + +preprocessing + + This is the actual preprocessing configuration setting. It is split in + defaults and product type specific settings. The defaults are applied + where there is no setting supplied for that specific type. The product type + is the one extracted earlier. + + defaults + + This section allows to configure any one of the available steps. Each step + configuration can be overridden in a specific product type configuration. + + The available steps are as follows: + + custom_preprocessor + + A custom python function to be called. + + path + + The Python module path to the function to call. + + args + + A list of arguments to pass to the function. 
+ + kwargs + + A dictionary of keyword arguments to pass to the function. + + subdatasets + + What subdatasets to extract and how to name them. + + data_file_glob + + A file glob pattern to select files to extract from. + + subdataset_types + + Mapping of subdataset identifier to output filename postfix for + subdatasets to be extracted for each data file. + + georeference + + How the extracted files shall be georeferenced. + + type + + The type of georeferencing to apply. One of ``gcp``, ``rpc``, + ``corner``, ``world``. + + options + + Additional options for the georeferencing. Depends on the type of + georeferencing. + + order + + The polynomial order to use for GCP related georeferencing. + + projection + + The projection to use for ungeoreferenced images. + + rpc_file_template + + The file glob template to use to find the RPC file. Template + parameters are {filename}, {fileroot}, and {extension}. + + warp_options + + Warp options. See + https://gdal.org/python/osgeo.gdal-module.html#WarpOptions for + details + + corner_names + + The metadata field name including the corner names. Tuple of four: + bottom-left, bottom-right, top-left and top-right + + orbit_direction_name + + The metadata field name containing the orbit direction + + force_north_up + + TODO + + tps + + Whether to use TPS transformation instead of GCP polynomials. + + calc + + Calculate derived data using formulas. + + formulas + + A list of formulas to use to calculate derived data. Each has the + following fields + + inputs + + A map of characters in the range of A-Z to respective inputs. Each + has the following properties + + glob + + The input file glob + + band + + The input file band index (1-based) + + data_type + + The GDAL data type name for the output + + formula + + The formula to apply. See + https://gdal.org/programs/gdal_calc.html#cmdoption-calc for details. + + output_postfix + + The postfix to apply for the filename of the created file. + + nodata_value + + The nodata value to be used. + + stack_bands + + Concatenate bands and arrange them in a single file. + + group_by + + A regex to group the input datasets, if consisting of multiple file. + The first regex group is used for the grouping. + + sort_by + + A regex to select a portion of the filename to be used for sorting. The + first regex group is used. + + order + + The order of the extracted item used in 'sort_by'. When the value + extracted by ``sort_by`` is missing, then that file will be dropped. + + output + + Final adjustments to generate an output file. Add overviews, reproject to + a common projection, etc. + + options + + Options to be passed to `gdal.Warp`. See + https://gdal.org/python/osgeo.gdal-module.html#WarpOptions for details. + + custom_preprocessor + + A custom python function to be called. + + path + + The Python module path to the function to call. + + args + + A list of arguments to pass to the function. + + kwargs + + A dictionary of keyword arguments to pass to the function. + + types + + This mapping of product type identifier to step configuration allows to + define specific step settings, even overriding the values from the + defaults. The next section :ref:`management` describes how an operator interacts with a deployed VS stack. 
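A condensed example of such a configuration, abridged from the ``config/vhr18_preprocessor-config.yml`` file added in this change (values and globs are illustrative only):

.. code-block:: yaml

    source:
      type: swift
      kwargs:
        username: !env '${OS_USERNAME_DOWNLOAD}'
        password: !env '${OS_PASSWORD_DOWNLOAD}'
        auth_url: !env '${OS_AUTH_URL_DOWNLOAD}'
    target:
      type: swift
      replace: false
      kwargs:
        username: !env '${OS_USERNAME}'
        password: !env '${OS_PASSWORD}'
        container: !env '${UPLOAD_CONTAINER}'
    workdir: /tmp
    keep_temp: false
    metadata_glob: '*GSC*.xml'
    type_extractor:
      xpath:
        - /gsc:report/gsc:opt_metadata/gml:using/eop:EarthObservationEquipment/eop:platform/eop:Platform/eop:shortName/text()
    preprocessing:
      defaults:
        data_file_globs:
          - '*.tif'
          - '*.jp2'
        output:
          options:
            format: COG
            dstSRS: 'EPSG:4326'
            creationOptions:
              - BLOCKSIZE=512
              - COMPRESS=DEFLATE
      types:
        PH1B:   # product type name as extracted via type_extractor
          nested: true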
diff --git a/documentation/operator-guide/ingestion.rst b/documentation/operator-guide/ingestion.rst index dc1b31aaab837ab4514ec8949c3047d389a7cb34..16be9b8cf531ba9df398debe962f7c313d9b7c27 100644 --- a/documentation/operator-guide/ingestion.rst +++ b/documentation/operator-guide/ingestion.rst @@ -146,9 +146,9 @@ registrar can be accomplished. Preprocessing ~~~~~~~~~~~~~ -In this section all command examples are assumed to be run from within a running -preprocessor container. To open a shell on a preprocessor, the following -command can be used. +In this section all command examples are assumed to be run from within a +running preprocessor container. To open a shell on a preprocessor, the +following command can be used. .. code-block:: bash diff --git a/documentation/operator-guide/setup.rst b/documentation/operator-guide/setup.rst index fbafa38bda800a7f858170a7776c605f87862885..bb91533c06ab1e442c2ed932fd03f589be0d94d2 100644 --- a/documentation/operator-guide/setup.rst +++ b/documentation/operator-guide/setup.rst @@ -116,6 +116,25 @@ Now the relevant images can be pulled: .. # TODO: ingestor image? + +Logging +------- + +For production, the docker images in the compose files use the default logging +driver. Therefore we configure the default logging driver for the docker daemon to +be fluent by createing the file ``/etc/docker/daemon.json`` with the following content: + +.. code-block:: json + + { + "log-driver": "fluentd" + } + +For development, we don't want to redirect all of the docker logging output, +so the respective compose files for dev configure the logging driver for each +container. + + Stack Deployment ---------------- diff --git a/env/dem.env b/env/dem.env index 687b7c830b448883a8e3a146645cf804f67ecf6a..a1d307f435679da7324a0ec1115bb99e98523345 100644 --- a/env/dem.env +++ b/env/dem.env @@ -3,3 +3,4 @@ UPLOAD_CONTAINER=dem-data GDAL_DISABLE_READDIR_ON_OPEN=TRUE CPL_VSIL_CURL_ALLOWED_EXTENSIONS=.TIF,.tif,.xml +SERVICE_URL=dem.pass.copernicus.eu diff --git a/env/dem_preprocessor.env b/env/dem_preprocessor.env deleted file mode 100644 index 9b650c648b40c6e947507f1dff7fb83af90d974a..0000000000000000000000000000000000000000 --- a/env/dem_preprocessor.env +++ /dev/null @@ -1,5 +0,0 @@ -SPLIT_PARTS_CHECK=False -DATA_FILE_SIZE_LIMIT=100000 -FILENAME_PART_SKIP=auxraster,preview,support,annotation,auxfiles -PREPROCESSOR_REPLACE=TRUE -FORCE_NO_DATA_VALUE=0 \ No newline at end of file diff --git a/env/dem_redis.env b/env/dem_redis.env index 45dcfdf80c5dbf4b1dcdfccaba2671ee2b57d7ef..3eff4afd5a700d498d26fadb791632909d8b5f30 100644 --- a/env/dem_redis.env +++ b/env/dem_redis.env @@ -4,6 +4,7 @@ REDIS_PORT=6379 REDIS_QUEUE_KEY=seed_queue REDIS_PREPROCESS_QUEUE_KEY=preprocess_queue +REDIS_PREPROCESS_MD_QUEUE_KEY=preprocess-md_queue REDIS_REGISTER_QUEUE_KEY=register_queue REDIS_REGISTERED_SET_KEY=registered_set REDIS_SET_KEY=registered_set diff --git a/env/emg.env b/env/emg.env index d056fa2885245bb91d5d250f4fed083dcc54eafe..f3af09603b2a3b96912d5fe0c0ebcb128aadc92e 100644 --- a/env/emg.env +++ b/env/emg.env @@ -3,3 +3,4 @@ UPLOAD_CONTAINER=emg-data GDAL_DISABLE_READDIR_ON_OPEN=TRUE CPL_VSIL_CURL_ALLOWED_EXTENSIONS=.TIF,.tif,.xml +SERVICE_URL=emg.pass.copernicus.eu diff --git a/env/emg_preprocessor.env b/env/emg_preprocessor.env deleted file mode 100644 index c25407773d5186db820c33ce89bd2d35ab47fce7..0000000000000000000000000000000000000000 --- a/env/emg_preprocessor.env +++ /dev/null @@ -1,2 +0,0 @@ -SPLIT_PARTS_CHECK=False -ENFORCE_FOUR_BANDS=True diff --git a/env/emg_redis.env 
b/env/emg_redis.env index 45dcfdf80c5dbf4b1dcdfccaba2671ee2b57d7ef..3eff4afd5a700d498d26fadb791632909d8b5f30 100644 --- a/env/emg_redis.env +++ b/env/emg_redis.env @@ -4,6 +4,7 @@ REDIS_PORT=6379 REDIS_QUEUE_KEY=seed_queue REDIS_PREPROCESS_QUEUE_KEY=preprocess_queue +REDIS_PREPROCESS_MD_QUEUE_KEY=preprocess-md_queue REDIS_REGISTER_QUEUE_KEY=register_queue REDIS_REGISTERED_SET_KEY=registered_set REDIS_SET_KEY=registered_set diff --git a/env/vhr18.env b/env/vhr18.env index 33c93caf76fbc1d6cfc995e939bf40a66cfce66d..12eacf3ff0adc299d6f800b2cd58414e7c904aae 100644 --- a/env/vhr18.env +++ b/env/vhr18.env @@ -3,3 +3,4 @@ UPLOAD_CONTAINER=vhr18-data GDAL_DISABLE_READDIR_ON_OPEN=TRUE CPL_VSIL_CURL_ALLOWED_EXTENSIONS=.TIF,.tif,.xml +SERVICE_URL=vhr18.pass.copernicus.eu diff --git a/env/vhr18_preprocessor.env b/env/vhr18_preprocessor.env deleted file mode 100644 index c25407773d5186db820c33ce89bd2d35ab47fce7..0000000000000000000000000000000000000000 --- a/env/vhr18_preprocessor.env +++ /dev/null @@ -1,2 +0,0 @@ -SPLIT_PARTS_CHECK=False -ENFORCE_FOUR_BANDS=True diff --git a/env/vhr18_redis.env b/env/vhr18_redis.env index 45dcfdf80c5dbf4b1dcdfccaba2671ee2b57d7ef..3eff4afd5a700d498d26fadb791632909d8b5f30 100644 --- a/env/vhr18_redis.env +++ b/env/vhr18_redis.env @@ -4,6 +4,7 @@ REDIS_PORT=6379 REDIS_QUEUE_KEY=seed_queue REDIS_PREPROCESS_QUEUE_KEY=preprocess_queue +REDIS_PREPROCESS_MD_QUEUE_KEY=preprocess-md_queue REDIS_REGISTER_QUEUE_KEY=register_queue REDIS_REGISTERED_SET_KEY=registered_set REDIS_SET_KEY=registered_set diff --git a/env_setup.sh b/env_setup.sh deleted file mode 100644 index 9f2bb95bc7ffe5d2cb9e26b7676bf2c464ba304d..0000000000000000000000000000000000000000 --- a/env_setup.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -cat $vhr18_db > ./env/vhr18_db.env -cat $vhr18_django > ./env/vhr18_django.env -cat $vhr18_obs > ./env/vhr18_obs.env - -cat $emg_db > ./env/emg_db.env -cat $emg_django > ./env/emg_django.env -cat $emg_obs > ./env/emg_obs.env - - -set -o allexport - -source ./env/emg_db.env -source ./env/vhr18_db.env - -set +o allexport - - -sed -i -e 's/emg-data/pvs_testing/g' ./env/emg.env -sed -i -e 's/vhr18-data/pvs_testing/g' ./env/vhr18.env - -sed -i -e 's/emg-cache/pvs_testing/g' ./env/emg_obs.env -sed -i -e 's/vhr18-cache/pvs_testing/g' ./env/vhr18_obs.env diff --git a/ingestor/.dockerignore b/ingestor/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..c72a38af9c876136701d901b31f2c9e058641b14 --- /dev/null +++ b/ingestor/.dockerignore @@ -0,0 +1,5 @@ +tests +lib +bin +__pycache__ +.pytest_cache diff --git a/ingestor/.gitignore b/ingestor/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..758d3e612a2674122446b61b101f78a23b6a9bd8 --- /dev/null +++ b/ingestor/.gitignore @@ -0,0 +1,139 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# venv stuff +bin +include +pyvenv.cfg diff --git a/ingestor/Dockerfile b/ingestor/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..6490ddff5c31c8f9eb2e9dc2bbfa452e0088434f --- /dev/null +++ b/ingestor/Dockerfile @@ -0,0 +1,61 @@ +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+#----------------------------------------------------------------------------- + +FROM ubuntu:18.04 + +MAINTAINER EOX +LABEL name="prism view server cache" \ + vendor="EOX IT Services GmbH <https://eox.at>" \ + license="MIT Copyright (C) 2020 EOX IT Services GmbH <https://eox.at>" \ + type="prism view server ingestor" \ + version="0.0.1-dev" + +USER root +ADD install.sh requirements.txt \ + / +RUN /install.sh +RUN mkdir /ingestor +COPY app.py config.py filedaemon.py / +COPY ingestor/ /ingestor + +ENV COLLECTION_ID= \ + INSTANCE_ID="prism-view-server_ingestor" \ + RENDERER_HOST= \ + COLLECTION= \ + REDIS_HOST= \ + REDIS_PORT="6379" \ + REDIS_PREPROCESS_MD_QUEUE_KEY="preprocess-md_queue" \ + INOTIFY_WATCH_DIR="/mnt/data" \ + ST_AUTH_VERSION=3 \ + OS_AUTH_URL= \ + OS_USERNAME= \ + OS_PASSWORD= \ + OS_TENANT_NAME= \ + OS_TENANT_ID= \ + OS_REGION_NAME= + +CMD ["gunicorn3", "-c", "config.py", "app"] diff --git a/ingestor/Readme.md b/ingestor/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..01d9c75b00a38a20190a7f59c900af337c530b03 --- /dev/null +++ b/ingestor/Readme.md @@ -0,0 +1,16 @@ +# Ingestor + +## Set up + +```bash +python3 -m venv . +source bin/activate +pip install -r requirements.txt +pip install pytest +``` + +## Testing + +```bash +pytest ingestor/ +``` diff --git a/ingestor/app.py b/ingestor/app.py new file mode 100644 index 0000000000000000000000000000000000000000..6fde0df409198fd618ff77f3fd3752a4841bb0d3 --- /dev/null +++ b/ingestor/app.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+#----------------------------------------------------------------------------- + +import os +import logging +import logging.config +import json + +from flask import Flask, request, Response +import redis + +from ingestor.browse_report import parse_browse_report +from ingestor.util import converter + + +application = Flask(__name__) + +logger = logging.getLogger(__name__) + +logging.config.dictConfig({ + 'version': 1, + 'formatters': { + 'simple': { + 'format': '%(levelname)s: %(message)s', + }, + 'verbose': { + 'format': '[%(asctime)s][%(module)s] %(levelname)s: %(message)s', + } + }, + 'handlers': { + 'console': { + 'level': 'DEBUG', + 'class': 'logging.StreamHandler', + 'formatter': 'verbose', + } + }, + 'loggers': { + '': { + 'handlers': ['console'], + 'level': 'DEBUG', + 'propagate': False, + } + } +}) + +client = redis.Redis( + host=os.environ['REDIS_HOST'], + port=int(os.environ.get('REDIS_PORT', '6379')), + charset="utf-8", + decode_responses=True, +) + +queue_name = os.environ['REDIS_PREPROCESS_MD_QUEUE_KEY'] + +@application.route('/', methods=['POST']) +def ingest(): + try: + request.get_data() + browse_report = parse_browse_report(request.data) + logger.debug(browse_report) + client.lpush(queue_name, json.dumps( + browse_report, default=converter + )) + return Response(status=202) + + except Exception as e: + return Response(str(e), status=400) diff --git a/ingestor/config.py b/ingestor/config.py new file mode 100644 index 0000000000000000000000000000000000000000..069ada04df74d97773a3a1c935d3f3384b1ed4f6 --- /dev/null +++ b/ingestor/config.py @@ -0,0 +1 @@ +bind = ['0.0.0.0:8000'] diff --git a/ingestor/filedaemon.py b/ingestor/filedaemon.py new file mode 100644 index 0000000000000000000000000000000000000000..2b3bef423a489d6f004295382117cea3305a5fd7 --- /dev/null +++ b/ingestor/filedaemon.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
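For reference, the ``ingest`` endpoint defined in ``ingestor/app.py`` above accepts a raw browse report in the request body. A minimal client-side sketch, assuming the service is published on ``localhost:8000`` as in ``config.py`` and that a browse report file is available locally (the filename is illustrative):

```python
# Hypothetical smoke test for the ingestor endpoint (app.py above).
import urllib.request

with open("footprint_browse.xml", "rb") as f:  # any ngEO browse report / ingest instruction
    payload = f.read()

req = urllib.request.Request("http://localhost:8000/", data=payload, method="POST")
with urllib.request.urlopen(req) as resp:
    # app.py replies 202 once the report is parsed and pushed to Redis;
    # a parsing failure yields 400, which urlopen raises as an HTTPError.
    print(resp.status)
```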
+#----------------------------------------------------------------------------- + +import os +import logging +import logging.config +import json + +import pyinotify +import redis + +from ingestor.browse_report import parse_browse_report +from ingestor.util import converter + + +logger = logging.getLogger(__name__) + +logging.config.dictConfig({ + 'version': 1, + 'formatters': { + 'simple': { + 'format': '%(levelname)s: %(message)s', + }, + 'verbose': { + 'format': '[%(asctime)s][%(module)s] %(levelname)s: %(message)s', + } + }, + 'handlers': { + 'console': { + 'level': 'DEBUG', + 'class': 'logging.StreamHandler', + 'formatter': 'verbose', + } + }, + 'loggers': { + '': { + 'handlers': ['console'], + 'level': 'DEBUG', + 'propagate': False, + } + } +}) + +queue_name = os.environ['REDIS_PREPROCESS_MD_QUEUE_KEY'] +watch_dir = os.environ['INOTIFY_WATCH_DIR'] + +client = redis.Redis( + host=os.environ['REDIS_HOST'], + port=int(os.environ.get('REDIS_PORT', '6379')), + charset='utf-8', + decode_responses=True, +) + + +watchmanager = pyinotify.WatchManager() + +class EventHandler(pyinotify.ProcessEvent): + def process_IN_CREATE(self, event): + logger.info(f'Parsing browse file: {event.pathname}') + try: + with open(event.pathname) as f: + browse_report = parse_browse_report(f) + logger.debug(browse_report) + client.lpush(queue_name, json.dumps( + browse_report, default=converter + )) + except Exception as e: + logger.exception(e) + +handler = EventHandler() +notifier = pyinotify.Notifier(watchmanager, handler) + +wdd = watchmanager.add_watch(watch_dir, pyinotify.IN_CREATE, rec=True) + +notifier.loop() diff --git a/ingestor/ingestor/__init__.py b/ingestor/ingestor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ingestor/ingestor/browse_report.py b/ingestor/ingestor/browse_report.py new file mode 100644 index 0000000000000000000000000000000000000000..c3669bd77719f32609d4a8b6c344b1f30f3ed675 --- /dev/null +++ b/ingestor/ingestor/browse_report.py @@ -0,0 +1,161 @@ +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
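Both ``app.py`` and ``filedaemon.py`` above serialize the parsed browse report to JSON and ``lpush`` it onto the ``preprocess-md_queue`` Redis list. How the preprocessor side pops these messages is not shown here; a minimal consumer sketch, assuming the same environment variables as the containers, could look like this:

```python
# Sketch of a queue consumer for the messages produced above (assumed interface).
import json
import os

import redis

client = redis.Redis(
    host=os.environ.get("REDIS_HOST", "redis"),
    port=int(os.environ.get("REDIS_PORT", "6379")),
    charset="utf-8",
    decode_responses=True,
)

queue = os.environ.get("REDIS_PREPROCESS_MD_QUEUE_KEY", "preprocess-md_queue")

# blocking right-pop: returns a (queue, value) pair as soon as a report arrives
_, raw = client.brpop(queue)
browse_report = json.loads(raw)  # datetimes arrive as ISO strings (see util.converter)
print(browse_report["browse_type"], len(browse_report["browses"]))
```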
+#----------------------------------------------------------------------------- + +import io + +from lxml import etree +import dateutil.parser + +from .util import pairwise + + +class BrowseReportParserError(ValueError): + pass + +NS_REP = 'http://ngeo.eo.esa.int/ngEO/browseReport/1.0' +NS_REP_OLD = 'http://ngeo.eo.esa.int/schema/browseReport' +NS_BSI = 'http://ngeo.eo.esa.int/schema/browse/ingestion' + + +nsmap = { + 'rep': NS_REP, + 'bsi': NS_BSI +} + + +def rep(tag): + return f'{{{NS_REP}}}{tag}' + +def rep_old(tag): + return f'{{{NS_REP_OLD}}}{tag}' + +def bsi(tag): + return f'{{{NS_BSI}}}{tag}' + + +ALLOWED_ROOT_TAGS = {rep('browseReport'), rep_old('browseReport'), bsi('ingestBrowse')} + + +def parse_browse_report(input_file): + """ + :returns: list of browses + """ + if isinstance(input_file, bytes): + input_file = io.BytesIO(input_file) + + try: + tree = etree.parse(input_file) + except etree.XMLSyntaxError as e: + raise BrowseReportParserError('Failed to parse XML.') from e + root = tree.getroot() + + if not root.tag in ALLOWED_ROOT_TAGS: + raise BrowseReportParserError( + 'Document is not a browse report or an ingest browse instruction.' + ) + + if root.tag == rep_old('browseReport'): + used_rep = rep_old + else: + used_rep = rep + + return { + 'responsible_org_name': root.findtext(used_rep('responsibleOrgName')), + 'date_time': dateutil.parser.parse(root.findtext(used_rep('dateTime'))), + 'browse_type': root.findtext(used_rep('browseType')), + 'browses': [ + parse_browse(elem, used_rep) + for elem in root.iterfind(used_rep('browse')) + ], + } + + +def parse_browse(elem, used_rep): + browse = { + 'type': '', + 'browse_identifier': elem.findtext(used_rep('browseIdentifier')), + 'filename': elem.findtext(used_rep('fileName')), + 'image_type': elem.findtext(used_rep('imageType')), + 'reference_system_identifier': elem.findtext( + used_rep('referenceSystemIdentifier') + ), + 'start_time': dateutil.parser.parse(elem.findtext(used_rep('startTime'))), + 'end_time': dateutil.parser.parse(elem.findtext(used_rep('endTime'))), + } + + rectified_elem = elem.find(used_rep('rectifiedBrowse')) + footprint_elem = elem.find(used_rep('footprint')) + geotiff_elem = elem.find(used_rep('modelInGeotiff')) + regular_grid_browse = elem.find(used_rep('regularGrid')) + + if rectified_elem is not None: + browse['type'] = 'rectified_browse' + browse['rectified'] = { + 'coord_list': [ + (float(x), float(y)) + for x, y in pairwise( + rectified_elem.findtext(used_rep('coordList')).split() + ) + ], + } + + elif footprint_elem is not None: + browse['type'] = 'footprint_browse' + browse['footprint'] = { + 'col_row_list': [ + (int(x), int(y)) + for x, y in pairwise( + footprint_elem.findtext(used_rep('colRowList')).split() + ) + ], + 'coord_list': [ + (float(x), float(y)) + for x, y in pairwise( + footprint_elem.findtext(used_rep('coordList')).split() + ) + ], + } + + elif geotiff_elem is not None: + browse['type'] = 'model_in_geotiff_browse' + + elif regular_grid_browse is not None: + browse['type'] = 'regular_grid_browse' + browse['regular_grid'] = { + 'col_node_number': int(regular_grid_browse.findtext(used_rep('colNodeNumber'))), + 'row_node_number': int(regular_grid_browse.findtext(used_rep('rowNodeNumber'))), + 'col_step': float(regular_grid_browse.findtext(used_rep('colStep'))), + 'row_step': float(regular_grid_browse.findtext(used_rep('rowStep'))), + 'coord_lists': [ + [ + (float(x), float(y)) + for x, y in pairwise(elem.text.split()) + ] for elem in regular_grid_browse.iterfind(used_rep('coordList')) + ] + 
} + + return browse diff --git a/ingestor/ingestor/test_browse_report.py b/ingestor/ingestor/test_browse_report.py new file mode 100644 index 0000000000000000000000000000000000000000..d30cbbcaed92c4cda348858f0220ba57191dff4e --- /dev/null +++ b/ingestor/ingestor/test_browse_report.py @@ -0,0 +1,165 @@ +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +#----------------------------------------------------------------------------- + + +from os.path import dirname, join +from datetime import datetime +from dateutil import tz + +from ingestor.browse_report import parse_browse_report + + +TEST_DATA_DIR = join(dirname(dirname(__file__)), 'tests/data') + + +def test_parse_footprint_browse(): + with open(join(TEST_DATA_DIR, 'footprint_browse.xml')) as f: + browse_report = parse_browse_report(f) + + assert browse_report == { + 'responsible_org_name': 'Generated by Eoli 2 ngEO Converter V1.2.0', + 'date_time': datetime(2013, 9, 25, 14, 54, 38, 0, tz.UTC), + 'browse_type': 'SAR', + 'browses': [{ + 'type': 'footprint_browse', + 'browse_identifier': 'ERS-2-11040113373745-1507.SAR_IM0_0P.BP', + 'filename': 'ERS-2-11040113373745-1507.SAR_IM0_0P.BP.jpg', + 'image_type': 'Jpeg', + 'reference_system_identifier': 'EPSG:4326', + 'footprint': { + 'col_row_list': [ + (0, 0), + (500, 0), + (500, 250), + (0, 250), + (0, 0), + ], + 'coord_list': [ + (83.66, 42.31), + (84.53, 42.42), + (84.48, 51.28), + (83.61, 50.32), + (83.66, 42.31), + ] + }, + 'start_time': datetime(2011, 4, 1, 13, 37, 37, 0, tz.UTC), + 'end_time': datetime(2011, 4, 1, 13, 37, 52, 0, tz.UTC), + }] + } + + +def test_parse_model_in_geotiff_browse(): + with open(join(TEST_DATA_DIR, 'model_in_geotiff_browse.xml')) as f: + browse_report = parse_browse_report(f) + + assert browse_report == { + 'responsible_org_name': 'DMI', + 'date_time': datetime(2012, 7, 13, 11, 54, 26, 0, tz.UTC), + 'browse_type': 'SAR', + 'browses': [{ + 'type': 'model_in_geotiff_browse', + 'browse_identifier': 'ID_DODWH_MG2_CORE_09DM010001_1', + 'filename': 'ID_DEIMOS01-v2_DE0028bfp_L3R.tif', + 'image_type': 'TIFF', + 'reference_system_identifier': 'EPSG:4326', + 'start_time': datetime(2011, 2, 1, 11, 48, 1, 0, tz.UTC), + 'end_time': datetime(2011, 2, 1, 
11, 48, 27, 0, tz.UTC), + }] + } + +def test_parse_rectified_browse(): + with open(join(TEST_DATA_DIR, 'rectified_browse.xml')) as f: + browse_report = parse_browse_report(f) + + assert browse_report == { + 'responsible_org_name': 'SLAP 03.03', + 'date_time': datetime(2014, 7, 24, 11, 58, 24, 0, tz.UTC), + 'browse_type': 'NGEO-LIGHT', + 'browses': [{ + 'type': 'rectified_browse', + 'browse_identifier': 'LS05_RFUI_TM__GTC_1P_19910928T071939_19910928T072007_040292_0172_0031_B10D', + 'filename': 'http://landsat-ds.eo.esa.int/metadata/LandsatTMCloudFreeCoverage/1991/09/28/LS05_RFUI_TM__GTC_1P_19910928T071939_19910928T072007_040292_0172_0031_B10D.BP.PNG', + 'image_type': 'PNG', + 'reference_system_identifier': 'EPSG:4326', + 'rectified': { + 'coord_list': [ + (40.8395, 40.1005), + (42.6645, 42.7907), + ] + }, + 'start_time': datetime(1991, 9, 28, 7, 19, 39, 0, tz.UTC), + 'end_time': datetime(1991, 9, 28, 7, 20, 7, 0, tz.UTC), + }] + } + + +def test_parse_regular_grid_browse(): + with open(join(TEST_DATA_DIR, 'regular_grid_browse.xml')) as f: + browse_report = parse_browse_report(f) + + assert browse_report == { + 'responsible_org_name': 'Sentinel 1 PDGS', + 'date_time': datetime(2012, 11, 8, 17, 25, 49, tzinfo=tz.tzoffset(None, 3600)), + 'browse_type': 'SAR', + 'browses': [{ + 'type': 'regular_grid_browse', + 'browse_identifier': 'a20120101T043724405923', + 'filename': 'quick-look.png', + 'image_type': 'PNG', + 'reference_system_identifier': 'EPSG:4326', + 'regular_grid': { + 'col_node_number': 6, + 'row_node_number': 20, + 'col_step': 78.8, + 'row_step': 29.157894737, + 'coord_lists': [ + [(16.80678325439503, -156.5765611873593), (16.92753830505998, -156.6004684133847), (17.04829841199142, -156.6243747867503), (17.16905739541897, -156.6482792679816), (17.28981280532396, -156.6721814810342), (17.31890450297918, -156.6779396385797)], + [(16.81392675836918, -156.5375764998714), (16.93467625534994, -156.561457873508), (17.05543081391954, -156.5853381897874), (17.17618425492683, -156.609216410023), (17.2969341243137, -156.6330921572468), (17.32602449992995, -156.6388438384338)], + [(16.82106287728486, -156.498588896317), (16.94180676422443, -156.522444395707), (17.06255571800818, -156.5462986330286), (17.1833035600995, -156.5701505703374), (17.30404783238212, -156.593999829775), (17.33313687211055, -156.599745030351)], + [(16.82819160727274, -156.4595983781882), (16.94892982780319, -156.4834279815255), (17.06967312036884, -156.5072561180554), (17.19041530703756, -156.5310817505579), (17.31115392562071, -156.5549045002936), (17.34024161561499, -156.5606432159938)], + [(16.83531294446828, -156.420604946974), (16.95604544221135, -156.4444086325018), (17.07678301711796, -156.4682106464455), (17.19751949184693, -156.4920099523122), (17.31825240012606, -156.5158061704734), (17.34733872654218, -156.5215383970218)], + [(16.84242688501015, -156.381608604169), (16.96315360357771, -156.4053863501777), (17.08388540437526, -156.4291622197829), (17.20461611063721, -156.4529351772322), (17.32534325199815, -156.4767048419914), (17.35442820099433, -156.4824305751009)], + [(16.84953342503939, -156.3426093512784), (16.97025430803387, -156.3663611361029), (17.09098027826299, -156.3901108396617), (17.21170515952094, -156.4138574269578), (17.33242647733964, -156.437600516534), (17.36151003507623, -156.4433197519077)], + [(16.85663256069959, -156.3036071898168), (16.97734755171454, -156.327332991834), (17.09806763490601, -156.3510565076854), (17.21878663461358, -156.3747767031367), (17.33950207225592, 
-156.3984931957972), (17.36858422489525, -156.4042059291287)], + [(16.86372428813775, -156.2646021213034), (16.98443333075823, -156.2883019189305), (17.10514747043269, -156.311999225462), (17.22586053203457, -156.3356930074195), (17.34657003285606, -156.3593828814815), (17.37565076656231, -156.3650891084553)], + [(16.87080860350595, -156.225594147253), (16.99151164130902, -156.2492679189448), (17.11221978097664, -156.2729389945953), (17.23292684790889, -156.2966063414503), (17.35363035525441, -156.3202695752825), (17.38270965619353, -156.3259692915751)], + [(16.87424257900799, -156.2066700838774), (16.994942683509, -156.2303312224714), (17.11564789218319, -156.2539895656031), (17.23635203068512, -156.2776440808492), (17.35705261014584, -156.3012943835597), (17.38613121105771, -156.3069909399195)], + [(16.88495498268113, -156.1475694884959), (17.00564584155052, -156.1711911438214), (17.12634181169133, -156.1948096932411), (17.24703671955967, -156.2184241052198), (17.36772806994446, -156.2420339939122), (17.39680446385366, -156.2477206758413)], + [(16.89201703884214, -156.1085528066608), (17.01270172358132, -156.1321483716202), (17.13339152417947, -156.1557406258028), (17.25408026763801, -156.1793285380803), (17.37476545451566, -156.2029117219735), (17.40384037416505, -156.2085918802064)], + [(16.89907166764901, -156.0695332249886), (17.01975012180706, -156.0931026781663), (17.14043369632741, -156.1166686157758), (17.26111621878105, -156.1402300068893), (17.38179518545178, -156.1637864645731), (17.41086861701329, -156.1694600947586)], + [(16.90611886532681, -156.0305107447066), (17.02679103244686, -156.054054064714), (17.14746832434206, -156.0775936644758), (17.26814456918872, -156.1011285129948), (17.38881725894098, -156.124658223118), (17.41788918858779, -156.1303253208992)], + [(16.91315862812868, -155.991485366909), (17.03382445174815, -156.0150025323825), (17.15449540445841, -156.0385157730854), (17.27516531508931, -156.0620240576098), (17.39583167119955, -156.0855269988818), (17.42490208510597, -156.0911875598966)], + [(16.92019095234127, -155.9524570925252), (17.04085037599229, -155.9759480821245), (17.161514932945, -155.9994349426222), (17.28217845274488, -156.0229166417812), (17.40283841847734, -156.046392792973), (17.43190730281881, -156.0520468128543)], + [(16.92721583429081, -155.9134259222858), (17.04786880150054, -155.9368907146913), (17.16852690610996, -155.9603511739047), (17.28918397845739, -155.9838062663549), (17.40983749706388, -156.0072556063005), (17.43890483801674, -156.0129030806771)], + [(16.93423327034943, -155.8743918566859), (17.05487972464038, -155.8978304305973), (17.17553132030729, -155.9212644675158), (17.29618188857495, -155.9446929319396), (17.41682890329453, -155.9681154395373), (17.44589468703602, -155.973756364034)], + [(16.94103455650875, -155.8365177408673), (17.06167463789954, -155.8599308569188), (17.18231986439887, -155.8833392385752), (17.30296406819755, -155.9067418477961), (17.42360471850083, -155.9301383022012), (17.45266897820548, -155.9357728677357)], + ], + }, + 'start_time': datetime(2012, 1, 1, 4, 37, 24, 405923), + 'end_time': datetime(2012, 1, 1, 4, 37, 32, 890783), + }] + } diff --git a/ingestor/ingestor/util.py b/ingestor/ingestor/util.py new file mode 100644 index 0000000000000000000000000000000000000000..6d659c9ce70440242ae62d602c372e6e743c4658 --- /dev/null +++ b/ingestor/ingestor/util.py @@ -0,0 +1,39 @@ +#------------------------------------------------------------------------------ +# +# Project: prism view server +# Authors: 
Fabian Schindler <fabian.schindler@eox.at> +# +#------------------------------------------------------------------------------ +# Copyright (C) 2020 EOX IT Services GmbH <https://eox.at> +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies of this Software or works derived from this Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +#----------------------------------------------------------------------------- + +from datetime import datetime + + +def pairwise(iterable): + "s -> (s0,s1), (s2,s3), (s4, s5), ..." + a = iter(iterable) + return zip(a, a) + + +def converter(o): + if isinstance(o, datetime): + return o.isoformat() diff --git a/ingestor/install.sh b/ingestor/install.sh new file mode 100755 index 0000000000000000000000000000000000000000..c221fe0880d737796250e53db92aa60df39438c2 --- /dev/null +++ b/ingestor/install.sh @@ -0,0 +1,12 @@ +#!/bin/bash +echo "Running install.sh" + +apt update + +echo "Installing packages" +DEBIAN_FRONTEND=noninteractive apt install -y python3-flask python3-lxml python3-dateutil gunicorn3 python3-redis python3-pyinotify + + +# pip3 install -r /requirements.txt + +rm -rf /var/lib/apt/lists/* diff --git a/ingestor/requirements.txt b/ingestor/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc6bf8b47d6dedb12339879756582c3e51ae2440 --- /dev/null +++ b/ingestor/requirements.txt @@ -0,0 +1,5 @@ +flask +lxml +python-dateutil +gunicorn +redis diff --git a/ingestor/tests/data/footprint_browse.xml b/ingestor/tests/data/footprint_browse.xml new file mode 100644 index 0000000000000000000000000000000000000000..8ab11ec5333b8b612e8da391d38d560e39997b27 --- /dev/null +++ b/ingestor/tests/data/footprint_browse.xml @@ -0,0 +1,18 @@ +<?xml version='1.0' encoding='UTF-8'?> +<rep:browseReport xmlns:rep="http://ngeo.eo.esa.int/ngEO/browseReport/1.0" version="1.3"> + <rep:responsibleOrgName>Generated by Eoli 2 ngEO Converter V1.2.0</rep:responsibleOrgName> + <rep:dateTime>2013-09-25T14:54:38Z</rep:dateTime> + <rep:browseType>SAR</rep:browseType> + <rep:browse> + <rep:browseIdentifier>ERS-2-11040113373745-1507.SAR_IM0_0P.BP</rep:browseIdentifier> + <rep:fileName>ERS-2-11040113373745-1507.SAR_IM0_0P.BP.jpg</rep:fileName> + <rep:imageType>Jpeg</rep:imageType> + <rep:referenceSystemIdentifier>EPSG:4326</rep:referenceSystemIdentifier> + <rep:footprint nodeNumber="5"> + <rep:colRowList>0 0 500 0 500 250 0 250 0 0</rep:colRowList> + <rep:coordList>83.66 42.31 84.53 42.42 84.48 51.28 83.61 50.32 83.66 42.31</rep:coordList> + </rep:footprint> + <rep:startTime>2011-04-01T13:37:37Z</rep:startTime> + 
<rep:endTime>2011-04-01T13:37:52Z</rep:endTime> + </rep:browse> +</rep:browseReport> diff --git a/ingestor/tests/data/model_in_geotiff_browse.xml b/ingestor/tests/data/model_in_geotiff_browse.xml new file mode 100644 index 0000000000000000000000000000000000000000..624a01e9e142fb48200fd855440eace4a8b6858a --- /dev/null +++ b/ingestor/tests/data/model_in_geotiff_browse.xml @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="UTF-8"?> +<bsi:ingestBrowse xmlns:bsi="http://ngeo.eo.esa.int/schema/browse/ingestion" + xmlns:rep="http://ngeo.eo.esa.int/ngEO/browseReport/1.0" version="1.3"> + <rep:responsibleOrgName>DMI</rep:responsibleOrgName> + <rep:dateTime>2012-07-13T11:54:26Z</rep:dateTime> + <rep:browseType>SAR</rep:browseType> + <rep:browse xmlns:gsc="http://earth.esa.int/gsc" + xmlns:gml="http://www.opengis.net/gml" + xmlns:eop="http://earth.esa.int/eop" + xmlns:opt="http://earth.esa.int/opt"> + <rep:browseIdentifier>ID_DODWH_MG2_CORE_09DM010001_1</rep:browseIdentifier> + <rep:fileName>ID_DEIMOS01-v2_DE0028bfp_L3R.tif</rep:fileName> + <rep:imageType>TIFF</rep:imageType> + <rep:referenceSystemIdentifier>EPSG:4326</rep:referenceSystemIdentifier> + <rep:modelInGeotiff>true</rep:modelInGeotiff> + <rep:startTime>2011-02-01T11:48:01Z</rep:startTime> + <rep:endTime>2011-02-01T11:48:27Z</rep:endTime> + </rep:browse> +</bsi:ingestBrowse> diff --git a/ingestor/tests/data/rectified_browse.xml b/ingestor/tests/data/rectified_browse.xml new file mode 100644 index 0000000000000000000000000000000000000000..1daa44670f6c9808ec9c5b30a962b12a6c8aacf3 --- /dev/null +++ b/ingestor/tests/data/rectified_browse.xml @@ -0,0 +1,19 @@ +<?xml version='1.0' encoding='UTF-8'?> +<rep:browseReport version="1.1" xsi:schemaLocation="http://ngeo.eo.esa.int/schema/browseReport IF-ngEO-BrowseReport-1.1.xsd" + xmlns:rep="http://ngeo.eo.esa.int/schema/browseReport" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> + <rep:responsibleOrgName>SLAP 03.03</rep:responsibleOrgName> + <rep:dateTime>2014-07-24T11:58:24Z</rep:dateTime> + <rep:browseType>NGEO-LIGHT</rep:browseType> + <rep:browse> + <rep:browseIdentifier>LS05_RFUI_TM__GTC_1P_19910928T071939_19910928T072007_040292_0172_0031_B10D</rep:browseIdentifier> + <rep:fileName>http://landsat-ds.eo.esa.int/metadata/LandsatTMCloudFreeCoverage/1991/09/28/LS05_RFUI_TM__GTC_1P_19910928T071939_19910928T072007_040292_0172_0031_B10D.BP.PNG</rep:fileName> + <rep:imageType>PNG</rep:imageType> + <rep:referenceSystemIdentifier>EPSG:4326</rep:referenceSystemIdentifier> + <rep:rectifiedBrowse> + <rep:coordList>40.8395 40.1005 42.6645 42.7907</rep:coordList> + </rep:rectifiedBrowse> + <rep:startTime>1991-09-28T07:19:39Z</rep:startTime> + <rep:endTime>1991-09-28T07:20:07Z</rep:endTime> + </rep:browse> +</rep:browseReport> diff --git a/ingestor/tests/data/regular_grid_browse.xml b/ingestor/tests/data/regular_grid_browse.xml new file mode 100644 index 0000000000000000000000000000000000000000..dd4b7d8dbe003ca6b27b2f097b0783f4387b91fa --- /dev/null +++ b/ingestor/tests/data/regular_grid_browse.xml @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="UTF-8"?> +<bsi:ingestBrowse xmlns:bsi="http://ngeo.eo.esa.int/schema/browse/ingestion" xmlns:eop="http://earth.esa.int/eop" xmlns:gsc="http://earth.esa.int/gsc" xmlns:opt="http://earth.esa.int/opt" xmlns:rep="http://ngeo.eo.esa.int/ngEO/browseReport/1.0" xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://ngeo.eo.esa.int/ngEO/browseReport/1.0 IF-ngEO-BrowseReport.xsd" version="1.3"> + 
<rep:responsibleOrgName>Sentinel 1 PDGS</rep:responsibleOrgName> + <rep:dateTime>2012-11-08T17:25:49+01:00</rep:dateTime> + <rep:browseType>SAR</rep:browseType> + <rep:browse> + <rep:browseIdentifier>a20120101T043724405923</rep:browseIdentifier> + <rep:fileName>quick-look.png</rep:fileName> + <rep:imageType>PNG</rep:imageType> + <rep:referenceSystemIdentifier>EPSG:4326</rep:referenceSystemIdentifier> + <rep:regularGrid> + <rep:colNodeNumber>6</rep:colNodeNumber> + <rep:rowNodeNumber>20</rep:rowNodeNumber> + <rep:colStep>78.8</rep:colStep> <!-- 555x395px --> + <rep:rowStep>29.157894737</rep:rowStep> + <rep:coordList>1.680678325439503e+01 -1.565765611873593e+02 1.692753830505998e+01 -1.566004684133847e+02 1.704829841199142e+01 -1.566243747867503e+02 1.716905739541897e+01 -1.566482792679816e+02 1.728981280532396e+01 -1.566721814810342e+02 1.731890450297918e+01 -1.566779396385797e+02</rep:coordList> + <rep:coordList>1.681392675836918e+01 -1.565375764998714e+02 1.693467625534994e+01 -1.565614578735080e+02 1.705543081391954e+01 -1.565853381897874e+02 1.717618425492683e+01 -1.566092164100230e+02 1.729693412431370e+01 -1.566330921572468e+02 1.732602449992995e+01 -1.566388438384338e+02</rep:coordList> + <rep:coordList>1.682106287728486e+01 -1.564985888963170e+02 1.694180676422443e+01 -1.565224443957070e+02 1.706255571800818e+01 -1.565462986330286e+02 1.718330356009950e+01 -1.565701505703374e+02 1.730404783238212e+01 -1.565939998297750e+02 1.733313687211055e+01 -1.565997450303510e+02</rep:coordList> + <rep:coordList>1.682819160727274e+01 -1.564595983781882e+02 1.694892982780319e+01 -1.564834279815255e+02 1.706967312036884e+01 -1.565072561180554e+02 1.719041530703756e+01 -1.565310817505579e+02 1.731115392562071e+01 -1.565549045002936e+02 1.734024161561499e+01 -1.565606432159938e+02</rep:coordList> + <rep:coordList>1.683531294446828e+01 -1.564206049469740e+02 1.695604544221135e+01 -1.564444086325018e+02 1.707678301711796e+01 -1.564682106464455e+02 1.719751949184693e+01 -1.564920099523122e+02 1.731825240012606e+01 -1.565158061704734e+02 1.734733872654218e+01 -1.565215383970218e+02</rep:coordList> + <rep:coordList>1.684242688501015e+01 -1.563816086041690e+02 1.696315360357771e+01 -1.564053863501777e+02 1.708388540437526e+01 -1.564291622197829e+02 1.720461611063721e+01 -1.564529351772322e+02 1.732534325199815e+01 -1.564767048419914e+02 1.735442820099433e+01 -1.564824305751009e+02</rep:coordList> + <rep:coordList>1.684953342503939e+01 -1.563426093512784e+02 1.697025430803387e+01 -1.563663611361029e+02 1.709098027826299e+01 -1.563901108396617e+02 1.721170515952094e+01 -1.564138574269578e+02 1.733242647733964e+01 -1.564376005165340e+02 1.736151003507623e+01 -1.564433197519077e+02</rep:coordList> + <rep:coordList>1.685663256069959e+01 -1.563036071898168e+02 1.697734755171454e+01 -1.563273329918340e+02 1.709806763490601e+01 -1.563510565076854e+02 1.721878663461358e+01 -1.563747767031367e+02 1.733950207225592e+01 -1.563984931957972e+02 1.736858422489525e+01 -1.564042059291287e+02</rep:coordList> + <rep:coordList>1.686372428813775e+01 -1.562646021213034e+02 1.698443333075823e+01 -1.562883019189305e+02 1.710514747043269e+01 -1.563119992254620e+02 1.722586053203457e+01 -1.563356930074195e+02 1.734657003285606e+01 -1.563593828814815e+02 1.737565076656231e+01 -1.563650891084553e+02</rep:coordList> + <rep:coordList>1.687080860350595e+01 -1.562255941472530e+02 1.699151164130902e+01 -1.562492679189448e+02 1.711221978097664e+01 -1.562729389945953e+02 1.723292684790889e+01 -1.562966063414503e+02 1.735363035525441e+01 
-1.563202695752825e+02 1.738270965619353e+01 -1.563259692915751e+02</rep:coordList> + <rep:coordList>1.687424257900799e+01 -1.562066700838774e+02 1.699494268350900e+01 -1.562303312224714e+02 1.711564789218319e+01 -1.562539895656031e+02 1.723635203068512e+01 -1.562776440808492e+02 1.735705261014584e+01 -1.563012943835597e+02 1.738613121105771e+01 -1.563069909399195e+02</rep:coordList> + <rep:coordList>1.688495498268113e+01 -1.561475694884959e+02 1.700564584155052e+01 -1.561711911438214e+02 1.712634181169133e+01 -1.561948096932411e+02 1.724703671955967e+01 -1.562184241052198e+02 1.736772806994446e+01 -1.562420339939122e+02 1.739680446385366e+01 -1.562477206758413e+02</rep:coordList> + <rep:coordList>1.689201703884214e+01 -1.561085528066608e+02 1.701270172358132e+01 -1.561321483716202e+02 1.713339152417947e+01 -1.561557406258028e+02 1.725408026763801e+01 -1.561793285380803e+02 1.737476545451566e+01 -1.562029117219735e+02 1.740384037416505e+01 -1.562085918802064e+02</rep:coordList> + <rep:coordList>1.689907166764901e+01 -1.560695332249886e+02 1.701975012180706e+01 -1.560931026781663e+02 1.714043369632741e+01 -1.561166686157758e+02 1.726111621878105e+01 -1.561402300068893e+02 1.738179518545178e+01 -1.561637864645731e+02 1.741086861701329e+01 -1.561694600947586e+02</rep:coordList> + <rep:coordList>1.690611886532681e+01 -1.560305107447066e+02 1.702679103244686e+01 -1.560540540647140e+02 1.714746832434206e+01 -1.560775936644758e+02 1.726814456918872e+01 -1.561011285129948e+02 1.738881725894098e+01 -1.561246582231180e+02 1.741788918858779e+01 -1.561303253208992e+02</rep:coordList> + <rep:coordList>1.691315862812868e+01 -1.559914853669090e+02 1.703382445174815e+01 -1.560150025323825e+02 1.715449540445841e+01 -1.560385157730854e+02 1.727516531508931e+01 -1.560620240576098e+02 1.739583167119955e+01 -1.560855269988818e+02 1.742490208510597e+01 -1.560911875598966e+02</rep:coordList> + <rep:coordList>1.692019095234127e+01 -1.559524570925252e+02 1.704085037599229e+01 -1.559759480821245e+02 1.716151493294500e+01 -1.559994349426222e+02 1.728217845274488e+01 -1.560229166417812e+02 1.740283841847734e+01 -1.560463927929730e+02 1.743190730281881e+01 -1.560520468128543e+02</rep:coordList> + <rep:coordList>1.692721583429081e+01 -1.559134259222858e+02 1.704786880150054e+01 -1.559368907146913e+02 1.716852690610996e+01 -1.559603511739047e+02 1.728918397845739e+01 -1.559838062663549e+02 1.740983749706388e+01 -1.560072556063005e+02 1.743890483801674e+01 -1.560129030806771e+02</rep:coordList> + <rep:coordList>1.693423327034943e+01 -1.558743918566859e+02 1.705487972464038e+01 -1.558978304305973e+02 1.717553132030729e+01 -1.559212644675158e+02 1.729618188857495e+01 -1.559446929319396e+02 1.741682890329453e+01 -1.559681154395373e+02 1.744589468703602e+01 -1.559737563640340e+02</rep:coordList> + <rep:coordList>1.694103455650875e+01 -1.558365177408673e+02 1.706167463789954e+01 -1.558599308569188e+02 1.718231986439887e+01 -1.558833392385752e+02 1.730296406819755e+01 -1.559067418477961e+02 1.742360471850083e+01 -1.559301383022012e+02 1.745266897820548e+01 -1.559357728677357e+02</rep:coordList> + </rep:regularGrid> + <rep:startTime>2012-01-01T04:37:24.405923</rep:startTime> + <rep:endTime>2012-01-01T04:37:32.890783</rep:endTime> + </rep:browse> +</bsi:ingestBrowse> diff --git a/preprocessor/Dockerfile b/preprocessor/Dockerfile index 9294c83268a2d60ed15c497dbba838523dce7b8b..1d3cb90f96f4273318e6ef21d24ab061decf677a 100644 --- a/preprocessor/Dockerfile +++ b/preprocessor/Dockerfile @@ -25,7 +25,8 @@ # IN THE SOFTWARE. 
#----------------------------------------------------------------------------- -FROM osgeo/gdal:ubuntu-small-latest +FROM osgeo/gdal:ubuntu-full-3.1.2 + MAINTAINER EOX LABEL name="prism view server preprocessor" \ vendor="EOX IT Services GmbH <https://eox.at>" \ @@ -33,11 +34,14 @@ LABEL name="prism view server preprocessor" \ type="prism view server preprocessor" \ version="0.0.1-dev" +ENV LC_ALL=C.UTF-8 +ENV LANG=C.UTF-8 + USER root RUN apt update && \ apt install -y \ - python3-redis python3-keystoneclient python3-swiftclient wait-for-it && \ + python3-redis python3-keystoneclient python3-swiftclient python3-click python3-setuptools python3-jsonschema wait-for-it && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -46,32 +50,40 @@ ENV INSTANCE_ID="prism-data-access-server_preprocessor" \ COLLECTION= \ UPLOAD_CONTAINER= \ ST_AUTH_VERSION=3 \ - OS_AUTH_URL="https://auth.cloud.ovh.net/v3/" \ + OS_AUTH_URL= \ OS_USERNAME= \ OS_PASSWORD= \ OS_TENANT_NAME= \ OS_TENANT_ID= \ OS_REGION_NAME= \ + OS_USER_DOMAIN_NAME= \ OS_AUTH_URL_DOWNLOAD= \ ST_AUTH_VERSION_DOWNLOAD= \ OS_USERNAME_DOWNLOAD= \ OS_PASSWORD_DOWNLOAD= \ OS_TENANT_NAME_DOWNLOAD= \ OS_REGION_NAME_DOWNLOAD= \ + OS_USER_DOMAIN_NAME_DOWNLOAD= \ REDIS_HOST= \ REDIS_PORT= \ REDIS_PREPROCESS_QUEUE_KEY= \ - REDIS_REGISTER_QUEUE_KEY= + REDIS_PREPROCESS_MD_QUEUE_KEY= \ + REDIS_REGISTER_QUEUE_KEY= \ + PREPROCESSOR_DEBUG= ADD run-preprocessor.sh \ - preprocessor.py \ - get_min_max.py \ - transform_chain.py \ entrypoint.sh \ + setup.py \ / +RUN chmod +x /run-preprocessor.sh + +COPY preprocessor /preprocessor + +RUN cd / && \ + python3 setup.py install + RUN chmod -v +x \ - /run-preprocessor.sh \ /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] diff --git a/preprocessor/get_min_max.py b/preprocessor/get_min_max.py deleted file mode 100644 index a9ef3aa181b53eaf753f27cb6da99104d4722e67..0000000000000000000000000000000000000000 --- a/preprocessor/get_min_max.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# -# Project: get_min_max.py -# Authors: Stephan Meissl <stephan.meissl@eox.at> -# -# ----------------------------------------------------------------------------- -# Copyright (c) 2019 EOX IT Services GmbH -# -# Python script to retrieve min and max values of items. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies of this Software or works derived from this Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
-# ----------------------------------------------------------------------------- - - -import sys -import os -import argparse -import textwrap -import logging -import traceback -import subprocess -import re - -from swiftclient.service import SwiftError, SwiftService - - -logger = logging.getLogger(__name__) - - -def setup_logging(verbosity): - # start logging setup - # get command line level - verbosity = verbosity - if verbosity == 0: - level = logging.CRITICAL - elif verbosity == 1: - level = logging.ERROR - elif verbosity == 2: - level = logging.WARNING - elif verbosity == 3: - level = logging.INFO - else: - level = logging.DEBUG - logger.setLevel(level) - sh = logging.StreamHandler() - sh.setLevel(level) - formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") - sh.setFormatter(formatter) - logger.addHandler(sh) - # finished logging setup - - -def get_min_max(collection): - logger.info("Starting") - - try: - with SwiftService() as swift, open("min_max_out", "a+") as outfile: - p = subprocess.run( - ["swift", "auth"], capture_output=True, - timeout=600, check=True - ) - vars = p.stdout.decode("utf-8") - os.environ["SWIFT_STORAGE_URL"] = re.findall( - r"OS_STORAGE_URL=(.*)\n", vars - )[0] - os.environ["SWIFT_AUTH_TOKEN"] = re.findall( - r"OS_AUTH_TOKEN=(.*)\n", vars - )[0] - outfile.write("container,product_type,min/max\n") - try: - list_gen = swift.list() - for page in list_gen: - if page["success"]: - for item in page["listing"]: - list_gen2 = swift.list(container=item["name"]) - for page2 in list_gen2: - if page2["success"]: - for item2 in page2["listing"]: - if item2["name"].endswith(".TIF") or \ - item2["name"].endswith(".tif"): - gdalout = subprocess.run([ - "gdalinfo", "-mm", - "/vsiswift/%s/%s" % - (item["name"], item2["name"])], - capture_output=True, - timeout=600, check=True - ).stdout.decode("utf-8") - minmax = re.findall( - r"Computed Min/Max=(.*)\n", - gdalout - ) - outfile.write( - "%s,%s,%s\n" % - (item["name"], - item2["name"].split("/")[1], - minmax) - ) - else: - logger.error( - "No product found in container '%s'." - % item["name"] - ) - return(1) - else: - logger.error("No container found.") - return(1) - - except SwiftError as e: - logger.debug(traceback.format_exc()) - logger.error("%s: %s\n" % (type(e).__name__, str(e))) - return(1) - - except Exception as e: - logger.debug(traceback.format_exc()) - logger.error("%s: %s\n" % (type(e).__name__, str(e))) - return(1) - - logger.info("Successfully finished") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.description = textwrap.dedent("""\ - Get min/max of products. - """) - - parser.add_argument( - "-v", "--verbosity", type=int, default=3, choices=[0, 1, 2, 3, 4], - help=( - "Set verbosity of log output " - "(4=DEBUG, 3=INFO, 2=WARNING, 1=ERROR, 0=CRITICAL). 
(default: 3)" - ) - ) - - arg_values = parser.parse_args() - - setup_logging(arg_values.verbosity) - - collection = os.environ.get('Collection') - if collection is None: - logger.critical("Collection environment variable not set.") - sys.exit(1) - - get_min_max( - collection, - ) diff --git a/preprocessor/gsc_generator.py b/preprocessor/gsc_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..5a863d2f6fa8c8aa34a8908fca7aa10cb313fa79 --- /dev/null +++ b/preprocessor/gsc_generator.py @@ -0,0 +1,147 @@ +from textwrap import dedent + +from osgeo import gdal + + +def positions_to_poslist(positions, projection): + # TODO: maybe reproject if not lat, lon + return ' '.join([ + ' '.join(pair) + for pair in positions + ]) + + +def positions_from_corners(low, high): + minx, miny = low + maxx, maxy = high + + return [ + (minx, miny), + (maxx, miny), + (maxx, maxy), + (minx, maxy), + (minx, minx), + ] + + +def get_footprint_from_browse(data_file, browse): + btype = browse['browse_type'] + + if btype == 'rectified_browse': + low, high = browse['rectified']['coord_list'] + positions = positions_from_corners(low, high) + + elif btype == 'footprint_browse': + positions = browse['footprint'] + + elif btype == 'model_in_geotiff_browse': + ds = gdal.Open(data_file) + gt = ds.GetGeoTransform() + width, height = ds.RasterXSize, ds.RasterXSize + + low = (gt[0], gt[3] + gt[5] * height) + high = (gt[0] + gt[1] * width, gt[3]) + + positions = positions_from_corners(low, high) + + elif btype == 'regular_grid_browse': + raise NotImplementedError('Regular grid browses are not supported') + + return positions_to_poslist( + positions, + browse['reference_system_identifier'], + ) + + +def generate_gsc_metadata(metadata): + return dedent("""\ + <?xml version='1.0' encoding='UTF-8'?> + <gsc:report xmlns:sar="http://earth.esa.int/sar" + xmlns:gml="http://www.opengis.net/gml" + xmlns:eop="http://earth.esa.int/eop" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:opt="http://earth.esa.int/opt" + xmlns:gsc="http://earth.esa.int/gsc" + xmlns:atm="http://earth.esa.int/atm" + xmlns:xlink="http://www.w3.org/1999/xlink" version="2.0"> + <gsc:responsibleOrgName>EOX</gsc:responsibleOrgName> + <gsc:reportType>CIRCULATION</gsc:reportType> + <gsc:dateTime>{now_time}</gsc:dateTime> + <gsc:orderReference></gsc:orderReference> + <gsc:opt_metadata version="1.2.1"> + <gml:metaDataProperty> + <gsc:EarthObservationMetaData> + <eop:identifier>{identifier}</eop:identifier> + <!--<eop:parentIdentifier>D2_MG2b_FRTX_004a/other/FRSC_0059_001/SpotImage</eop:parentIdentifier>--> + <eop:acquisitionType>NOMINAL</eop:acquisitionType> + <eop:productType>{product_type}</eop:productType> + <eop:status>ARCHIVED</eop:status> + <eop:archivedIn> + <eop:ArchivingInformation> + <eop:archivingCenter>CDS</eop:archivingCenter> + <eop:archivingDate>{archiving_time}</eop:archivingDate> + </eop:ArchivingInformation> + </eop:archivedIn> + <gsc:deliveryInfo> + <gsc:deliveryDateTime>{delivery_time}</gsc:deliveryDateTime> + <gsc:deliveryMethod>ELECTRONIC</gsc:deliveryMethod> + </gsc:deliveryInfo> + </gsc:EarthObservationMetaData> + </gml:metaDataProperty> + <gml:validTime> + <gml:TimePeriod> + <gml:beginPosition>{begin_time}</gml:beginPosition> + <gml:endPosition>{end_time}</gml:endPosition> + </gml:TimePeriod> + </gml:validTime> + <gml:using> + <!--<eop:EarthObservationEquipment> + <eop:platform> + <eop:Platform> + <eop:shortName>PH1A</eop:shortName> + <eop:serialIdentifier>1A</eop:serialIdentifier> + </eop:Platform> + 
</eop:platform> + <eop:instrument> + <eop:Instrument> + <eop:shortName>HR</eop:shortName> + </eop:Instrument> + </eop:instrument> + <eop:sensor> + <eop:Sensor> + <eop:sensorType>OPTICAL</eop:sensorType> + <eop:operationalMode>FUS</eop:operationalMode> + <eop:resolution uom="m">0.5</eop:resolution> + </eop:Sensor> + </eop:sensor> + <eop:acquisitionParameters> + <opt:Acquisition> + <eop:orbitNumber>118</eop:orbitNumber> + <eop:orbitDirection>DESCENDING</eop:orbitDirection> + <eop:acrossTrackIncidenceAngle uom="deg">-4.070247073869651</eop:acrossTrackIncidenceAngle> + <eop:alongTrackIncidenceAngle uom="deg">2.304231907410827</eop:alongTrackIncidenceAngle> + <opt:illuminationAzimuthAngle uom="deg">164.3516878667332</opt:illuminationAzimuthAngle> + </opt:Acquisition> + </eop:acquisitionParameters> + </eop:EarthObservationEquipment>--> + </gml:using> + <gml:target> + <eop:Footprint> + <gml:multiExtentOf> + <gml:MultiSurface srsName="EPSG:4326"> + <gml:surfaceMembers> + <gml:Polygon> + <gml:exterior> + <gml:LinearRing> + <gml:posList>{footprint}</gml:posList> + </gml:LinearRing> + </gml:exterior> + </gml:Polygon> + </gml:surfaceMembers> + </gml:MultiSurface> + </gml:multiExtentOf> + </eop:Footprint> + </gml:target> + <gml:resultOf/> + </gsc:opt_metadata> + </gsc:report>""".format(**metadata)) \ No newline at end of file diff --git a/preprocessor/preprocessor.py b/preprocessor/preprocessor.py deleted file mode 100644 index a0d059797a0ab7e4b3663b4e423c3f34b4edef9c..0000000000000000000000000000000000000000 --- a/preprocessor/preprocessor.py +++ /dev/null @@ -1,417 +0,0 @@ -#!/usr/bin/env python -# ----------------------------------------------------------------------------- -# -# Project: preprocessor.py -# Authors: Stephan Meissl <stephan.meissl@eox.at> -# -# ----------------------------------------------------------------------------- -# Copyright (c) 2019 EOX IT Services GmbH -# -# Python script to preprocess product data. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies of this Software or works derived from this Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
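The ``generate_gsc_metadata`` helper added above fills a fixed set of template keys. A minimal sketch of calling it, assuming the module is importable as ``gsc_generator`` and using placeholder values throughout:

```python
# All values are placeholders; the template expects exactly these keys.
from datetime import datetime, timezone

from gsc_generator import generate_gsc_metadata

now = datetime.now(timezone.utc).isoformat()
report = generate_gsc_metadata({
    "identifier": "SOME_PRODUCT_ID",   # placeholder product identifier
    "product_type": "OPTICAL",         # placeholder product type
    "now_time": now,
    "archiving_time": now,
    "delivery_time": now,
    "begin_time": "2020-01-01T00:00:00Z",
    "end_time": "2020-01-01T00:01:00Z",
    # space-separated position list closing the footprint ring (illustrative values)
    "footprint": "42.31 83.66 42.42 84.53 51.28 84.48 50.32 83.61 42.31 83.66",
})
print(report.splitlines()[0])
```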
-# ----------------------------------------------------------------------------- - - -import sys -import os -import argparse -import textwrap -import logging -import traceback -import redis -import tempfile -import tarfile -import re -import subprocess - -from swiftclient.multithreading import OutputManager -from swiftclient.service import SwiftError, SwiftService, SwiftUploadObject - -import transform_chain - -SPLIT_PARTS_CHECK = os.environ.get('SPLIT_PARTS_CHECK') -ENFORCE_FOUR_BANDS = os.environ.get('ENFORCE_FOUR_BANDS') - -FILESIZE_LIMIT = 4 * (1024 ** 3) # swift 5GB limit for filesize (non-compressed), here less to have margin -swift_upload_options = { - 'use_slo': True -} - -logger = logging.getLogger("preprocessor") - - -def setup_logging(verbosity): - # start logging setup - # get command line level - verbosity = verbosity - if verbosity == 0: - level = logging.CRITICAL - elif verbosity == 1: - level = logging.ERROR - elif verbosity == 2: - level = logging.WARNING - elif verbosity == 3: - level = logging.INFO - else: - level = logging.DEBUG - logger.setLevel(level) - sh = logging.StreamHandler() - sh.setLevel(level) - formatter = logging.Formatter("%(asctime)s %(levelname)s: %(message)s") - sh.setFormatter(formatter) - logger.addHandler(sh) - # finished logging setup - - -def preprocessor( - collection, tar_object_path, upload_container, replace=False, - client=None, register_queue_key=None -): - logger.info("Starting preprocessing of '%s'." % (tar_object_path)) - - try: - container = tar_object_path.split("/")[1] - package = "/".join(tar_object_path.split("/")[2:]) - - with SwiftService() as swift, OutputManager(), \ - tempfile.TemporaryDirectory() as tmpdirname: - if not replace: - try: - list_parts_gen = swift.list( - container=upload_container, options={"prefix": tar_object_path}, - ) - for page in list_parts_gen: - if page["success"]: - logger.critical( - "Aborting, package '%s' already exists at " - "target container '%s'." 
% (package, container) - ) - return(1) - except SwiftError as e: - logger.debug(traceback.format_exc()) - logger.error("%s: %s\n" % (type(e).__name__, str(e))) - return(1) - - tmpfilename = os.path.join(tmpdirname, "tmp.tar") - - options = { - "os_username": os.environ.get('OS_USERNAME_DOWNLOAD'), - "os_password": os.environ.get('OS_PASSWORD_DOWNLOAD'), - "os_tenant_name": os.environ.get('OS_TENANT_NAME_DOWNLOAD'), - "os_tenant_id": os.environ.get('OS_TENANT_ID_DOWNLOAD'), - "os_region_name": os.environ.get('OS_REGION_NAME_DOWNLOAD'), - "os_auth_url": os.environ.get('OS_AUTH_URL_DOWNLOAD'), - "auth_version": os.environ.get('ST_AUTH_VERSION_DOWNLOAD'), - } - with SwiftService(options=options) as swift_down: - for down_res in swift_down.download( - container=container, - objects=[package, ], - options={"out_file": tmpfilename}, - ): - if down_res["success"]: - logger.debug( - "'%s' downloaded" % down_res["object"] - ) - else: - logger.error( - "'%s' download failed" % down_res["object"] - ) - return(1) - - tf = tarfile.open(tmpfilename, mode="r") - - data_files_ti = [ - m for m in tf.getmembers() if - m is not None and re.search(r"IMG.+\.(TIF|JP2)", m.name, re.IGNORECASE) - ] - metadata_file_ti = next( - m for m in tf.getmembers() if m is not None and re.search(r"GSC.+\.xml", m.name, re.IGNORECASE) - ) - world_files_ti = [ - m for m in tf.getmembers() if m is not None and - re.search(r"RPC.+\.xml", m.name, re.IGNORECASE) - ] - # add J2W files only if more than one files are present - # that signalizes that file was split into multiple or has panchromatic - if len(data_files_ti) > 1: - world_files_ti += [ - m for m in tf.getmembers() if m is not None and - re.search(r".+\.J2W", m.name, re.IGNORECASE) - ] - data_files = [ - member.name - for member in data_files_ti - ] - metadata_file = metadata_file_ti.name - members = data_files_ti + [metadata_file_ti] + world_files_ti - - if not data_files or not metadata_file: - logger.error( - "Aborting, not all needed files found in package." 
- ) - return(1) - - tf.extractall(path=tmpdirname, members=members) - - # cleanup after use to save space - tf.close() - os.remove(tmpfilename) - - source_name_first = os.path.join(tmpdirname, data_files[0]) - - # if there is more than one file, make a VRT to mosaic them - if len(data_files) > 1: - logger.debug("More files found, creating a VRT") - source_name_vrt = os.path.join(tmpdirname, 'tmp.vrt') - # open all datasets one by one and create an array of open datasets - dataset_array = [transform_chain.open_gdal_dataset(os.path.join(tmpdirname, data_file)) for data_file in data_files] - if ENFORCE_FOUR_BANDS: - # remove and close datasets with different number of bands than expected - dataset_array = list(filter(None, [transform_chain.validate_band_count(dataset, 4) for dataset in dataset_array])) - if len(dataset_array) == 0: - logger.error( - "Aborting, wrong number of bands for all datasets %s" % ",".join(data_files) - ) - return(1) - # try to fix geotransform for ortho images one by one before making a vrt, which fails otherwise - dataset_array = [transform_chain.correct_geo_transform(dataset_entity) for dataset_entity in dataset_array] - # create a vrt out of them - dataset = transform_chain.create_vrt_dataset(dataset_array, source_name_vrt) - # during creating of a vrt, reference to RPC is lost - # if there was rpc, set it to the vrt - dataset = transform_chain.set_rpc_metadata(dataset_array[0], dataset) - dataset_array = None - else: - # open file using gdal - dataset = transform_chain.open_gdal_dataset(source_name_first) - # close datasets with different number of bands than expected - if ENFORCE_FOUR_BANDS: - dataset = transform_chain.validate_band_count(dataset, 4) - if dataset is None: - logger.error( - "Aborting, wrong number of bands for dataset %s" % data_files[0] - ) - return(1) - # change RPC to geotransform if present - dataset = transform_chain.apply_rpc(dataset) - - # perform transformation correction if necessary - dataset = transform_chain.correct_geo_transform(dataset) - - # save file with given options - should use ENV - creation_options = ["BLOCKSIZE=512", "COMPRESS=DEFLATE", "LEVEL=6", "NUM_THREADS=8", - "BIGTIFF=IF_SAFER", "OVERVIEWS=AUTO", "RESAMPLING=CUBIC"] - - split_parts = transform_chain.split_check(dataset, FILESIZE_LIMIT) if SPLIT_PARTS_CHECK == True else 1 - - output_file_list = transform_chain.write_gdal_dataset_split(dataset, "COG", "%s.tif" % os.path.splitext( - source_name_first)[0], creation_options, split_parts) - dataset = None - objects = [] - # create vrt if file was split - if len(output_file_list) > 1: - logger.debug("Creating .vrt of previously split files.") - vrt_name = "%s.vrt" % os.path.splitext(source_name_first)[0] - subprocess.run( - ['gdalbuildvrt', '-quiet', os.path.basename(vrt_name)] + [ - os.path.basename(data_file) for data_file in output_file_list], - timeout=600, check=True, cwd=os.path.dirname(vrt_name) - ) # use cwd to create relative paths in vrt - # add vrt to files to be uploaded - objects.append( - SwiftUploadObject( - vrt_name, - object_name=os.path.join( - container, package, os.path.basename(vrt_name)) - ) - ) - - # add image files to files to be uploaded - for data_file in output_file_list: - # check if 5GB swift upload limit is exceeded by any of files, if yes, use segmentation - size = os.stat(data_file).st_size - if (size > 1024 * 1024 * 1024 * 5): - swift_upload_options["segment_size"] = 2 * 1024 * 1024 * 1024 # 2gb segments - - dest_object_name = os.path.join( - container, package, os.path.basename(data_file) - 
) - objects.append( - SwiftUploadObject( - data_file, - object_name=dest_object_name - ) - ) - - # add metadata to files to be uploaded after data files - objects.append( - SwiftUploadObject( - os.path.join(tmpdirname, metadata_file), - object_name=os.path.join(container, package, metadata_file) - ) - ) - - # upload files - for upload in swift.upload( - container=upload_container, - objects=objects, - options=swift_upload_options - ): - if upload["success"]: - if "object" in upload: - logger.info( - "'%s' successfully uploaded." % upload["object"] - ) - elif "for_object" in upload: - logger.debug( - "Successfully uploaded '%s' segment '%s'." - % (upload["for_object"], upload["segment_index"]) - ) - else: - logger.error( - "'%s' upload failed" % upload["error"] - ) - return(1) - - if client is not None: - logger.debug( - "Storing paths in redis queue '%s" % register_queue_key - ) - client.lpush( - register_queue_key, "%s" % tar_object_path - ) - - except Exception as e: - logger.debug(traceback.format_exc()) - logger.error("%s: %s\n" % (type(e).__name__, str(e))) - return(1) - - logger.info( - "Successfully finished preprocessing of '%s'." % (tar_object_path) - ) - - -def preprocessor_redis_wrapper( - collection, upload_container, replace=False, host="localhost", port=6379, - preprocess_queue_key="preprocess_queue", - register_queue_key="register_queue" -): - client = redis.Redis( - host=host, port=port, charset="utf-8", decode_responses=True - ) - while True: - logger.debug("waiting for redis queue '%s'..." % preprocess_queue_key) - value = client.brpop(preprocess_queue_key) - preprocessor( - collection, - value[1], - upload_container, - replace=replace, - client=client, - register_queue_key=register_queue_key - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.description = textwrap.dedent("""\ - Preprocess product data. - """) - - parser.add_argument( - "--mode", default="standard", choices=["standard", "redis"], - help=( - "The mode to run the preprocessor. Either one-off (standard) or " - "reading from a redis queue." - ) - ) - parser.add_argument( - "--tar-object-path", default=None, - help=( - "Path to object holding tar archive file of product." - ) - ) - parser.add_argument( - "--upload-container", default=None, - help=( - "The name of the swift container where the result is uploaded." - ) - ) - parser.add_argument( - "--replace", action="store_true", - help=( - "Replace existing products instead of skipping the preprocessing." - ) - ) - parser.add_argument( - "--redis-preprocess-queue-key", default="preprocess_queue" - ) - parser.add_argument( - "--redis-register-queue-key", default="register_queue" - ) - parser.add_argument( - "--redis-host", default="localhost" - ) - parser.add_argument( - "--redis-port", type=int, default=6379 - ) - - parser.add_argument( - "-v", "--verbosity", type=int, default=3, choices=[0, 1, 2, 3, 4], - help=( - "Set verbosity of log output " - "(4=DEBUG, 3=INFO, 2=WARNING, 1=ERROR, 0=CRITICAL). 
(default: 3)" - ) - ) - - arg_values = parser.parse_args() - - setup_logging(arg_values.verbosity) - - collection = os.environ.get('COLLECTION') - if collection is None: - logger.critical("Collection environment variable not set.") - sys.exit(1) - - upload_container = arg_values.upload_container - if upload_container is None: - upload_container = os.environ.get('UPLOAD_CONTAINER') - if upload_container is None: - logger.critical("UPLOAD_CONTAINER environment variable not set.") - sys.exit(1) - - if arg_values.mode == "standard": - preprocessor( - collection, - arg_values.tar_object_path, - upload_container, - replace=arg_values.replace, - ) - else: - preprocessor_redis_wrapper( - collection, - upload_container, - replace=arg_values.replace, - host=arg_values.redis_host, - port=arg_values.redis_port, - preprocess_queue_key=arg_values.redis_preprocess_queue_key, - register_queue_key=arg_values.redis_register_queue_key, - ) diff --git a/preprocessor/preprocessor/__init__.py b/preprocessor/preprocessor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/preprocessor/preprocessor/archive.py b/preprocessor/preprocessor/archive.py new file mode 100644 index 0000000000000000000000000000000000000000..57ccb973939d2c6799cc387b8759bc43f64372ad --- /dev/null +++ b/preprocessor/preprocessor/archive.py @@ -0,0 +1,130 @@ +from os import PathLike +import os.path +import io +from typing import List, Union, BinaryIO +import tarfile +import zipfile +import logging +from fnmatch import translate +import re + +logger = logging.getLogger(__name__) + +ARCHIVE_EXTENSIONS = ['ZIP', 'zip', 'TAR', 'tar', 'TAR.BZ2', 'tar.bz2', 'TAR.GZ', 'tar.gz'] + +def filter_filenames(filenames: List[PathLike], glob: str, case: bool=False) -> List[PathLike]: + regex = translate(glob) + if case: + reobj = re.compile(regex) + else: + reobj = re.compile(regex, re.IGNORECASE) + return [ + filename + for filename in filenames + if reobj.match(filename) + ] + + +def is_tarfile(archive_file: Union[PathLike, BinaryIO]) -> bool: + """ Helper to detect whether a path or a file object is + referencing a valid TAR file. + """ + try: + return tarfile.is_tarfile(archive_file) + except TypeError: + pass + + try: + tarfile.open(fileobj=archive_file) + return True + except (TypeError, tarfile.ReadError): + return False + +def open_tarfile(archive_file: Union[PathLike, BinaryIO]) -> tarfile.TarFile: + """ Open a TAR file from either a path or a file object. + """ + if isinstance(archive_file, (BinaryIO, io.BufferedReader)): + return tarfile.open(fileobj=archive_file) + return tarfile.open(archive_file) + + +def unpack_files(archive_path: Union[PathLike, BinaryIO], target_dir: PathLike, glob=None, case=None, filenames=None, recursive=False) -> List[PathLike]: + """ Unpacks the contents of the specified ZIP or TAR archive to the + given target directory. Optionally, only a given list of filenames + will be extracted. + When a glob is passed, all filenames (either given or from the archive) + will be filtered and only the matching files will be extracted. 
+ """ + iszip = False + istar = False + + # open the archive and extract a list of filenames + if is_tarfile(archive_path): + archive = open_tarfile(archive_path) + all_filenames = archive.getnames() + filenames = filenames or all_filenames + istar = True + elif zipfile.is_zipfile(archive_path): + archive = zipfile.ZipFile(archive_path) + all_filenames = archive.namelist() + filenames = filenames or all_filenames + iszip = True + else: + raise Exception('Cannot open archive %s' % archive_path) + + # filter the filenames when a glob is passed + if glob: + filenames = filter_filenames(filenames, glob, case) + + extracted_filenames = [] + + # extract the files to the target directory + if istar: + members = [ + member + for member in archive.getmembers() + if member.name in filenames + ] + archive.extractall(target_dir, members) + extracted_filenames.extend([ + os.path.join(target_dir, member.name) + for member in members + ]) + + elif iszip: + archive.extractall(target_dir, filenames) + extracted_filenames.extend([ + os.path.join(target_dir, filename) + for filename in filenames + ]) + + # go into the sub-archives to extract files + if recursive: + for extension in ARCHIVE_EXTENSIONS: + sub_archives = filter_filenames(all_filenames, '*.%s' % extension) + for sub_archive in sub_archives: + sub_archive_filename = os.path.join( + os.path.dirname(archive_path), + os.path.basename(sub_archive), + ) + if istar: + archive.extract( + archive.getmember(sub_archive) + ) + os.rename(sub_archive, sub_archive_filename) + if iszip: + archive.extract(sub_archive) + os.rename(sub_archive, sub_archive_filename) + + sub_filenames = unpack_files( + sub_archive_filename, + os.path.join(target_dir, sub_archive), + glob, + case, + filenames, + recursive, + ) + extracted_filenames.extend(sub_filenames) + + # return a list of files extracted + return extracted_filenames diff --git a/preprocessor/preprocessor/cli.py b/preprocessor/preprocessor/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..9836ffdc5f02928766a66ba7d043c2f5c0b07763 --- /dev/null +++ b/preprocessor/preprocessor/cli.py @@ -0,0 +1,91 @@ +from os.path import join, dirname +import logging.config +import json + +import click +import yaml +import jsonschema + +from .preprocess import preprocess_file, preprocess_browse +from .daemon import run_daemon +from .config import load_config + + +def setup_logging(debug=False): + logging.config.dictConfig({ + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'brief': { + 'format': '%(levelname)s %(name)s: %(message)s' + } + }, + 'handlers': { + 'console': { + 'class': 'logging.StreamHandler', + 'level': 'DEBUG' if debug else 'INFO', + 'formatter': 'brief', + } + }, + 'root': { + 'handlers': ['console'], + 'level': 'DEBUG' if debug else 'INFO', + } + }) + + +def validate_config(config): + with open(join(dirname(__file__), 'config-schema.yaml')) as f: + schema = yaml.load(f) + + jsonschema.validate(config, schema) + + +@click.group() +def cli(): + pass + + +@cli.command(help='Run the preprocess daemon, attaching to a Redis queue') +@click.option('--config-file', type=click.File('r')) +@click.option('--use-dir', type=str) # TODO: check dir +@click.option('--validate/--no-validate', default=False) +@click.option('--host', type=str) +@click.option('--port', type=int) +@click.option('--listen-queue', type=str) +@click.option('--listen-md-queue', type=str) +@click.option('--write-queue', type=str) +@click.option('--debug/--no-debug', default=False) +def 
daemon(config_file=None, use_dir=None, validate=False, host=None, port=None, listen_queue=None, listen_md_queue=None, write_queue=None, debug=False): + setup_logging(debug) + config = load_config(config_file) + if validate: + validate_config(config) + run_daemon(config, host, port, listen_queue, listen_md_queue, write_queue) + + +@cli.command(help='Run a single, one-off preprocessing') +@click.argument('file_path', type=str) +@click.option('--config-file', type=click.File('r')) +@click.option('--use-dir', type=str) # TODO: check dir +@click.option('--validate/--no-validate', default=False) +@click.option('--browse-report/--no-browse-report', default=False) +@click.option('--debug/--no-debug', default=False) +def preprocess(file_path, config_file=None, use_dir=None, validate=False, browse_report=False, debug=False): + setup_logging(debug) + config = load_config(config_file) + if validate: + validate_config(config) + + if browse_report: + with open(file_path) as f: + browse_report_data = json.load(f) + + browse_type = browse_report_data['browse_type'] + for browse in browse_report_data['browses']: + preprocess_browse(config, browse_type, browse_report_data, browse, use_dir) + else: + preprocess_file(config, file_path, use_dir) + +if __name__ == '__main__': + cli() diff --git a/preprocessor/preprocessor/config-schema.yaml b/preprocessor/preprocessor/config-schema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..93f3f9b4f972556cf362bdc5eb24a163772d6fe9 --- /dev/null +++ b/preprocessor/preprocessor/config-schema.yaml @@ -0,0 +1,227 @@ +$id: https://example.com/address.schema.json +$schema: http://json-schema.org/draft-07/schema# +type: object +properties: + source: + description: File source description. Either a local file system or an object storage. + type: object + properties: + type: + description: The type of the file source. + type: string + enum: [local, swift] + kwargs: + description: Extra arguments. Use depends on actual implementation. + type: object + # required: ['type'] + target: + description: File target description. Either a local file system or an object storage. + type: object + properties: + type: + description: The type of the file target. + type: string + enum: [local, swift] + kwargs: + description: Extra arguments. Use depends on actual implementation. + type: object + # required: [type] + replace: + description: If set to true, output replaces already existing files on target. If no existing are present, preprocessing does not start. + type: boolean + default: false + workdir: + description: The local directory, where intermediary files are to be stored. + type: string + keep_temp: + description: Whether to keep temporary files for each step. DEPRECATED. + type: boolean + metadata_glob: + description: A file glob to select metadata files from the downloaded archive. + type: string + glob_case: + description: If all file globs will use case-sensitive match. + type: boolean + type_extractor: + description: How the product type is to be extracted from the metadata file. + type: object + properties: + xpath: + description: Either a single XPath or multiple XPaths to the product type in the metadata file. Each is tried consecutively until the type could be extracted. + oneOf: + - type: string + - type: array + items: + type: string + map: + description: A simple mapping of the extracted type value to an identifier for later usage. This is useful when a preprocessing chain can be re-used for multiple product types. 
+ type: object + required: [xpath] + level_extractor: + description: How the product level is extracted. Currently unused. + type: object + # TODO + preprocessing: + description: The actual preprocessing definition. + type: object + properties: + defaults: + description: The default step settings to be applied. + $ref: '#/definitions/steps' + types: + description: Product type specific step config. + type: object + additionalProperties: + description: A mapping of product type -> steps configuration + $ref: '#/definitions/steps' + required: [types] + + browse_type_mapping: + description: Mapping of browse types to product types. Default is direct mapping. + type: object +required: + - source + - target + - workdir + - keep_temp + - metadata_glob + - type_extractor + - level_extractor + - preprocessing +definitions: + steps: + custom_preprocessor: + description: Definition of a custom preprocessor step + type: object + properties: + path: + description: "The python dotted path to the function to invoke. e.g: 'path.to.module.function'" + type: string + args: + description: The list of arguments to pass to that function + type: array + kwargs: + description: The map of keyword arguments to pass to that function. + type: object + subdatasets: + description: The definition of the subdataset extraction step. + type: object + properties: + data_file_glob: + description: The data file selector. + type: string + subdataset_types: + description: Mapping of subdataset identifier to output filename postfix for subdatasets to be extracted for each data file. + type: object + patternProperties: + ".*": + type: string + georeference: + description: The definition of a georeferencing step. + type: object + properties: + type: + description: The type of georeferencing to apply. + type: string + enum: [gcp, rpc, corner, world] # TODO: more + options: + description: Additional options for the georeferencing. Depends on the type of georeferencing. + type: object + properties: + order: + description: The polynomial order to use for GCP reprojection. + type: number + projection: + description: The projection to use for ungeoreferenced images. + type: string + rpc_file_template: + description: The file glob template to use to find the RPC file. Template parameters are {filename}, {fileroot}, and {extension}. + type: string + warp_options: + description: "Warp options. See https://gdal.org/python/osgeo.gdal-module.html#WarpOptions for details" + corner_names: + description: "The metadata field name including the corner names. Tuple of four: bottom-left, bottom-right, top-left and top-right" + type: array + items: + type: string + orbit_direction_name: + description: The metadata field name containing the orbit direction + type: string + force_north_up: + description: + type: boolean + tps: + description: Whether to use TPS transformation instead of GCP polynomials. + type: boolean + + required: [type] + calc: + description: Definition of a calculation step. + type: object + properties: + formulas: + description: A list of formulas to calculate + type: array + items: + type: object + properties: + inputs: + description: Input definition of this formula + type: object + patternProperties: + "[A-Z]": + type: object + properties: + glob: + description: The input file glob to find the input file. + type: string + band: + description: The band number of the input file. Defaults to 1. + type: integer + data_type: + description: The output data type for the calculated file. 
(GDAL notation) + type: string + formula: + description: The formula to calculate. See gdal_calc.py for details. + type: string + output_postfix: + description: The filename postfix to append to the output filename. By default an enumeration is used. + type: string + nodata_value: + description: Use this nodata value in the calculation. + type: float + stack_bands: + description: Definition of a stack bands step. + type: object + properties: + group_by: + description: A regex to group the input datasets, if consisting of multiple file. The first regex group is used for the grouping. + type: string + sort_by: + description: A regex to select a portion of the filename to be used for sorting. The first regex group is used. + type: string + order: + description: The order of the extracted item used in 'sort_by'. + type: array + items: + type: string + output: + description: Definition of an output step. + type: object + properties: + options: + description: "Options to be passed to `gdal.Warp`. See https://gdal.org/python/osgeo.gdal-module.html#WarpOptions for details" + type: object + custom_postprocessor: + description: Definition of a custom postprocessor step + type: object + properties: + path: + description: "The python dotted path to the function to invoke. e.g: 'path.to.module.function'" + type: string + args: + description: The list of arguments to pass to that function + type: array + kwargs: + description: The map of keyword arguments to pass to that function. + type: object diff --git a/preprocessor/preprocessor/config.py b/preprocessor/preprocessor/config.py new file mode 100644 index 0000000000000000000000000000000000000000..77534e9411ce9e598d059a53460d40565408870d --- /dev/null +++ b/preprocessor/preprocessor/config.py @@ -0,0 +1,39 @@ +import os +from typing import TextIO +import re + +import yaml + + +ENV_PATTERN = re.compile(r'.*?\${(\w+)}.*?') + +def constructor_env_variables(loader, node): + """ + Extracts the environment variable from the node's value + :param yaml.Loader loader: the yaml loader + :param node: the current node in the yaml + :return: the parsed string that contains the value of the environment + variable + """ + value = loader.construct_scalar(node) + match = ENV_PATTERN.findall(value) # to find all env variables in line + if match: + full_value = value + for g in match: + full_value = full_value.replace( + f'${{{g}}}', os.environ.get(g, g) + ) + return full_value + return value + + +def load_config(input_file: TextIO): + tag = '!env' + loader = yaml.SafeLoader + + # the tag will be used to mark where to start searching for the pattern + # e.g. somekey: !env somestring${MYENVVAR}blah blah blah + loader.add_implicit_resolver(tag, ENV_PATTERN, None) + loader.add_constructor(tag, constructor_env_variables) + + return yaml.load(input_file, Loader=loader) diff --git a/preprocessor/preprocessor/daemon.py b/preprocessor/preprocessor/daemon.py new file mode 100644 index 0000000000000000000000000000000000000000..e44a3c9079f341b59fcf3384be2bd97d7f811ca1 --- /dev/null +++ b/preprocessor/preprocessor/daemon.py @@ -0,0 +1,37 @@ +import redis +import logging +import json + +from .preprocess import preprocess_file, preprocess_browse + + +logger = logging.getLogger(__name__) + + +def run_daemon(config, host, port, listen_queue, listen_md_queue, write_queue): + """ Run the preprocessing daemon, listening on a redis queue + for files to be preprocessed. After preprocessing the filename + of the preprocessed files will be pushed to the output queue. 
+ """ + # initialize the queue client + client = redis.Redis( + host=host, port=port, charset="utf-8", decode_responses=True + ) + logger.debug("waiting for redis queue '%s'..." % listen_queue) + while True: + # fetch an item from the queue to be preprocessed + queue, value = client.brpop([listen_queue, listen_md_queue]) + file_paths = [] + # start the preprocessing on that file + if queue == listen_queue: + filename, file_path = preprocess_file(config, value) + file_paths.append(file_path) + elif queue == listen_md_queue: + browse_report_data = json.loads(value) + browse_type = browse_report_data['browse_type'] + for browse in browse_report_data['browses']: + filename, file_path = preprocess_browse(config, browse_type, browse_report_data, browse) + file_paths.append(file_path) + # TODO: convert to string, list, .... + for item in file_paths: + client.lpush(write_queue, item) diff --git a/preprocessor/preprocessor/metadata.py b/preprocessor/preprocessor/metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..083d585da796e6c75e2087548b94fe25fadc4205 --- /dev/null +++ b/preprocessor/preprocessor/metadata.py @@ -0,0 +1,47 @@ + +from lxml import etree + + +def evaluate_xpath(root, xpath): + """ + """ + result = root.xpath(xpath, namespaces=root.nsmap) + print(xpath, result) + if result: + if isinstance(result, list): + return result[0] + return result + return None + + +def extract_product_type_and_level(metadata_files, config): + """ + """ + product_type = None + product_level = None + for metadata_file in metadata_files: + with open(metadata_file) as f: + tree = etree.parse(f) + root = tree.getroot() + + if not product_type: + xpaths = config['type_extractor']['xpath'] + xpaths = [xpaths] if isinstance(xpaths, str) else xpaths + for xpath in xpaths: + product_type = evaluate_xpath(root, xpath) + if product_type: + break + + if not product_level: + xpaths = config['level_extractor']['xpath'] + if xpaths: + xpaths = [xpaths] if isinstance(xpaths, str) else xpaths + for xpath in xpaths: + product_level = evaluate_xpath(root, xpath) + if product_level: + break + + if product_type and product_level: + break + + return product_type, product_level \ No newline at end of file diff --git a/preprocessor/preprocessor/preprocess.py b/preprocessor/preprocessor/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..626099a5ab25ec001f907644fb1e856276bde68a --- /dev/null +++ b/preprocessor/preprocessor/preprocess.py @@ -0,0 +1,337 @@ +import os +import os.path +import itertools +import importlib +import logging +import shutil +from typing import List +from pprint import pformat +from urllib.parse import urlparse + +from .transfer import get_downloader, get_uploader +from .archive import unpack_files +from .metadata import extract_product_type_and_level +from .steps import ( + georeference_step, extract_subdataset_step, calc_step, stack_bands_step, output_step +) +from .steps.browse_report import browse_georeference +from .util import workdir, Timer + +logging.basicConfig() + +logger = logging.getLogger(__name__) + +# ----------------------------------------------------------------------------- + + +def copy_files(source, target, move=False): + for item in os.listdir(source): + src_path = os.path.join(source, item) + dst_path = os.path.join(target, item) + if move: + shutil.move(src_path, dst_path) + else: + if os.path.isdir(src_path): + shutil.copytree( + src_path, + dst_path + ) + else: + shutil.copy(src_path, dst_path) + + +def 
custom_preprocessor(source_dir, target_dir, path, args=None, kwargs=None): + """ Preprocessing step for a custom preprocessing. + """ + module_name, _, func_name = path.rpartition('.') + func = getattr(importlib.import_module(module_name), func_name) + func(source_dir, target_dir, *(args or []), **(kwargs or {})) + + +def custom_postprocessor(source_dir, target_dir, path, args=None, kwargs=None): + """ Preprocessing step for a custom preprocessing. + """ + module_name, _, func_name = path.rpartition('.') + func = getattr(importlib.import_module(module_name), func_name) + func(source_dir, target_dir, *(args or []), **(kwargs or {})) + + +STEP_FUNCTIONS = { + 'custom_preprocessor': custom_preprocessor, + 'subdatasets': extract_subdataset_step, + 'georeference': georeference_step, + 'calc': calc_step, + 'stack_bands': stack_bands_step, + 'output': output_step, + 'custom_postprocessor': custom_postprocessor, +} + + +def flatten(l): + return [item for sublist in l for item in sublist] + + +# ----------------------------------------------------------------------------- + + +def preprocess_internal(preprocess_config, previous_step='unpack'): + force_refresh = False + # make processing steps + for step in ['custom_preprocessor', 'subdatasets', 'georeference', 'calc', 'stack_bands', 'output', 'custom_postprocessor']: + step_config = preprocess_config.get(step) + if not step_config: + logger.debug('Skipping step %s as it is not configured.' % step) + continue + + # run the step if it was not already run + if not os.path.isdir(step) or force_refresh: + if os.path.isdir(step): + logger.info('Forcing refresh of existing directory %s' % step) + shutil.rmtree(step) + + logger.info('Running preprocessing step %s' % step) + os.mkdir(step) + preprocessor = STEP_FUNCTIONS[step] + + with Timer() as step_timer: + preprocessor(previous_step, step, **step_config) + + logger.info( + 'Finished preprocessing step %s after %.3f seconds.' + % (step, step_timer.elapsed) + ) + force_refresh = True + + else: + logger.info('%s dir already exists, skipping step...' % step) + + previous_step = step + + if not os.path.isdir('upload') or force_refresh: + try: + os.mkdir('upload') + except FileExistsError: + logger.debug('Upload folder already exists.') + + # copy or move files from previous step directory to upload directory + copy_files(previous_step, 'upload', move=preprocess_config.get('move_files', False)) + + + + +def preprocess_file(config: dict, file_path: os.PathLike, use_dir: os.PathLike=None): + """ Runs the preprocessing of a single file. + """ + with workdir(config, use_dir) as dirname, Timer() as preprocess_timer: + logger.info('Preprocessing %s in %s' % (file_path, dirname)) + target_config = config['target'] + # check if target.replace is configured and if not, check storage if files there + if not target_config['replace']: + uploader = get_uploader( + target_config['type'], target_config.get('args'), target_config.get('kwargs') + ) + if uploader.product_exists(file_path): + raise Exception('Target.replace configuration is not set to true and objects already exist in target %s.' % file_path) + else: + logger.debug('Product does not yet exist on target') + # check if we can reuse a previous download + if not os.path.isdir('download'): + os.mkdir('download') + logger.info('Downloading %s from %s...' 
% (file_path, dirname)) + # get the Downloader for the configured source archive to download the given source file + source_config = config['source'] + downloader = get_downloader( + source_config['type'], source_config.get('args'), source_config.get('kwargs') + ) + + with Timer() as download_timer: + source_archive_path = downloader.download(file_path, 'download') + + logger.info( + 'Downloaded file %s in %.3f seconds' + % (file_path, download_timer.elapsed) + ) + + else: + source_archive_path = os.path.join('download', os.path.basename(file_path)) + logger.info('Download dir already exists, skipping...') + + # fetch the metadata XML file from the downloaded archive + metadata_files = unpack_files(source_archive_path, 'extra', glob=config['metadata_glob'], case=config.get('glob_case', False)) + + # open the XML to retrieve the product type and level + product_type, product_level = extract_product_type_and_level(metadata_files, config) + logger.info('Detected product_type/level_type %s/%s' % (product_type, product_level)) + + # get a concrete configuration for the type, filled with the defaults + default_config = dict(config['preprocessing'].get('defaults', {})) + type_based_config = dict(config['preprocessing']['types'].get(product_type, {})) + default_config.update(type_based_config) + preprocess_config = default_config + logger.debug('Using preprocessing config %s' % pformat(preprocess_config)) + + if not os.path.isdir('unpack'): + os.mkdir('unpack') + logger.info('Unpacking original files...') + # select and unpack files according to configuration + + with Timer() as unpack_timer: + data_files = flatten([ + unpack_files( + source_archive_path, + 'unpack', + glob=glob, + case=config.get('glob_case', False), + recursive=preprocess_config.get('nested', False), + ) + for glob in preprocess_config['data_file_globs'] + ]) + metadata_files = flatten([ + unpack_files( + source_archive_path, + 'unpack', + glob=glob, + case=config.get('glob_case', False), + recursive=preprocess_config.get('nested', False), + ) + for glob in preprocess_config.get('additional_file_globs', []) + ]) + + logger.info( + 'Unpacked files: %s in %.3f seconds' + % (', '.join(metadata_files + data_files), unpack_timer.elapsed) + ) + else: + logger.info('Unpack dir already exists, skipping...') + + # actually perform the preprocessing from the downloaded file + preprocess_internal(preprocess_config, 'unpack') + + # get an uploader for the finalized images + uploader = get_uploader( + target_config['type'], target_config.get('args'), target_config.get('kwargs') + ) + if len(os.listdir('upload')) == 0: + # end here, so not only metadata file is uploaded + raise Exception('No data files to upload, aborting.') + + paths_for_upload = ['upload', 'extra'] + upload_filenames = [] + for path_to_upload in paths_for_upload: + upload_filenames.extend([ + os.path.join(dirpath, filename) + for dirpath, _, filenames in os.walk(path_to_upload) + for filename in filenames + ]) + + # send all files in the upload directory to the target storage + logger.info( + 'Starting uploading of %d files to %s' + % (len(upload_filenames), file_path) + ) + with Timer() as upload_timer: + uploader.upload(upload_filenames, file_path) + + logger.info( + 'Finished uploading after %.3f seconds.' + % (upload_timer.elapsed) + ) + + logger.info( + 'Finished preprocessing of %s after %.3f seconds.' 
+ % (file_path, preprocess_timer.elapsed) + ) + + return upload_filenames, file_path + + +def preprocess_browse(config: dict, browse_type: str, browse_report: dict, browse: dict, use_dir: os.PathLike=None): + with workdir(config, use_dir) as dirname, Timer() as preprocess_timer: + filename = browse['filename'] + logger.info('Preprocessing browse "%s" in %s' % (filename, dirname)) + + parsed = urlparse(filename) + + if not parsed.scheme: + # check if we can reuse a previous download + if not os.path.isdir('download'): + os.mkdir('download') + logger.info('Downloading %s from %s...' % (filename, dirname)) + # get the Downloader for the configured source archive to download the given source file + source_config = config['source'] + downloader = get_downloader( + source_config['type'], source_config.get('args'), source_config.get('kwargs') + ) + + with Timer() as download_timer: + source_filename_path = downloader.download(filename, 'download') + + logger.info( + 'Downloaded file %s in %.3f seconds' + % (filename, download_timer.elapsed) + ) + + else: + source_filename_path = os.path.join('download', os.path.basename(filename)) + logger.info('Download dir already exists, skipping...') + + elif parsed.scheme in ('http', 'https'): + # TODO: check if allowed and download from there + raise NotImplementedError + + if not os.path.isdir('unpack'): + os.mkdir('unpack') + if not os.path.isdir('extra'): + os.mkdir('extra') + + logger.info('Applying browse georeference to browse %s' % filename) + browse_georeference('download', 'unpack', 'extra', browse_report, browse) + + # fetch the product type from the browse_type + product_type = config.get('browse_type_mapping', {}).get(browse_type, browse_type) + logger.info('Detected product_type %s' % (product_type)) + + # get a concrete configuration for the type, filled with the defaults + default_config = dict(config['preprocessing'].get('defaults', {})) + type_based_config = dict(config['preprocessing']['types'].get(product_type, {})) + default_config.update(type_based_config) + preprocess_config = default_config + + logger.debug('Using preprocessing config %s' % pformat(preprocess_config)) + preprocess_internal(preprocess_config) + + # get an uploader for the finalized images + target_config = config['target'] + uploader = get_uploader( + target_config['type'], target_config.get('args'), target_config.get('kwargs') + ) + paths_for_upload = ['upload', 'extra'] + upload_filenames = [] + for path_to_upload in paths_for_upload: + upload_filenames.extend([ + os.path.join(dirpath, filename) + for dirpath, _, filenames in os.walk(path_to_upload) + for filename in filenames + ]) + + file_path = browse['browse_identifier'] or upload_filenames[0] + + # send all files in the upload directory to the target storage + logger.info( + 'Starting uploading of %d files to %s' + % (len(upload_filenames), file_path) + ) + with Timer() as upload_timer: + uploader.upload(upload_filenames, file_path) + + logger.info( + 'Finished uploading after %.3f seconds.' + % (upload_timer.elapsed) + ) + + logger.info( + 'Finished preprocessing of browse "%s" after %.3f seconds.' 
+ % (filename, preprocess_timer.elapsed) + ) + + return upload_filenames, file_path diff --git a/preprocessor/preprocessor/steps/__init__.py b/preprocessor/preprocessor/steps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..82fc04493807bdd29b6d268dd2d0296fa9111ddd --- /dev/null +++ b/preprocessor/preprocessor/steps/__init__.py @@ -0,0 +1,14 @@ +from .georeference import georeference_step +from .output import output_step +from .stack import stack_bands_step +from .subdataset import extract_subdataset_step +from .calc import calc_step + + +__all__ = [ + 'georeference_step', + 'output_step', + 'stack_bands_step', + 'extract_subdataset_step', + 'calc_step', +] diff --git a/preprocessor/preprocessor/steps/browse_report.py b/preprocessor/preprocessor/steps/browse_report.py new file mode 100644 index 0000000000000000000000000000000000000000..bd377b6cc3dde156079952c1288fd1e7da01df86 --- /dev/null +++ b/preprocessor/preprocessor/steps/browse_report.py @@ -0,0 +1,210 @@ +import os +from glob import glob +from os.path import join, basename +from textwrap import dedent + +from ..util import replace_ext, pairwise, gdal, osr + + +def browse_georeference(source_dir: os.PathLike, target_dir_data: os.PathLike, target_dir_meta: os.PathLike, browse_report: dict, browse: dict): + for filename in glob(join(source_dir, '*')): + target_filename_data = join(target_dir_data, replace_ext(basename(filename), '.tif')) + target_filename_meta = join(target_dir_meta, replace_ext(basename(filename), '.xml')) + apply_browse_report_georeference(filename, target_filename_data, browse) + + generate_gsc(filename, target_filename_meta, browse_report, browse) + + +def apply_browse_report_georeference(input_filename: os.PathLike, target_filename: os.PathLike, browse: dict): + ds = gdal.GetDriverByName('GTiff').CreateCopy(target_filename, gdal.Open(input_filename)) + type_ = browse['type'] + + if type_ == 'rectified_browse': + size_x, size_y = ds.RasterXSize, ds.RasterYSize + low, high = browse['rectified']['coord_list'] + + minx, miny = low + maxx, maxy = high + + ds.SetGeoTransform([ + minx, (maxx - minx) / size_x, 0, + maxy, 0, (miny - maxy) / size_y, + ]) + + elif type_ == 'footprint_browse': + col_rows = browse['footprint']['col_row_list'] + coords = browse['footprint']['coord_list'] + + if col_rows[0] == col_rows[-1] and coords[0] == coords[-1]: + col_rows = col_rows[:-1] + coords = coords[:-1] + + gcps = [ + gdal.GCP(coord[1], coord[0], 0, col_row[0], col_row[1]) + for col_row, coord in zip(col_rows, coords) + ] + + sr = osr.SpatialReference() + sr.ImportFromEPSG(4326) + ds.SetGCPs(gcps, sr) + + elif type_ == 'model_in_geotiff_browse': + # nothing to do in this case + pass + + elif type_ == 'regular_grid_browse': + col_node_number = browse['regular_grid']['col_node_number'] + row_node_number = browse['regular_grid']['row_node_number'] + col_step = browse['regular_grid']['col_step'] + row_step = browse['regular_grid']['row_step'] + + coord_lists = browse['regular_grid']['coord_lists'] + + range_x = frange(0.0, row_node_number * row_step, row_step) + range_y = frange(0.0, col_node_number * col_step, col_step) + pixels = [(x, y) for x in range_x for y in range_y] + + coords = [ + coord + for coord in coord_list for coord_list in coord_lists + ] + + gcps = [ + gdal.GCP(coord[1], coord[0], 0, col_row[0], col_row[1]) + for col_row, coord in zip(col_rows, coords) + ] + + sr = osr.SpatialReference() + sr.ImportFromEPSG(4326) + ds.SetGCPs(gcps, sr) + + else: + raise Exception + + del ds + + +# copied 
from: https://pynative.com/python-range-for-float-numbers/ +def frange(start, stop=None, step=None): + # if stop and step argument is None set start=0.0 and step = 1.0 + start = float(start) + if stop == None: + stop = start + 0.0 + start = 0.0 + if step == None: + step = 1.0 + + count = 0 + while True: + temp = float(start + count * step) + if step > 0 and temp >= stop: + break + elif step < 0 and temp <= stop: + break + yield temp + + count += 1 + + +def generate_gsc(input_filename: os.PathLike, target_filename: os.PathLike, browse_report: dict, browse: dict): + footprint = '' + + type_ = browse['type'] + if type_ == 'rectified_browse': + low, high = browse['rectified']['coord_list'] + footprint = ' '.join(str(v) for v in [ + low[0], low[1], + high[0], low[1], + high[0], high[1], + low[0], high[1], + low[0], low[1], + ]) + + elif type_ == 'footprint_browse': + footprint = ' '.join([ + f'{y} {x}' + for y, x in browse['footprint']['coord_list'] + ]) + + elif type_ == 'model_in_geotiff_browse': + ds = gdal.Open(input_filename) + gt = ds.GetGeoTransform() + + low = (gt[3], gt[0]) + high = (gt[3] + ds.RasterYSize * gt[5], gt[0] + ds.RasterXSize * gt[1]) + + footprint = ' '.join(str(v) for v in [ + low[0], low[1], + high[0], low[1], + high[0], high[1], + low[0], high[1], + low[0], low[1], + ]) + + elif type_ == 'regular_grid_browse': + coord_lists = browse['regular_grid']['coord_lists'] + coords = coord_lists[0] + [ + coord_list[-1] + for coord_list in coord_lists[1:-1] + ] + reversed(coord_lists[-1]) + [ + coord_list[0] + for coord_list in coord_lists[-1::-1] + ] + + footprint = ' '.join([ + f'{y} {x}' + for y, x in coords + ]) + + return dedent(f"""\ + <?xml version='1.0' encoding='UTF-8'?> + <gsc:report + xmlns:sar="http://earth.esa.int/sar" + xmlns:gml="http://www.opengis.net/gml" + xmlns:eop="http://earth.esa.int/eop" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xmlns:opt="http://earth.esa.int/opt" + xmlns:gsc="http://earth.esa.int/gsc" + xmlns:atm="http://earth.esa.int/atm" + xmlns:xlink="http://www.w3.org/1999/xlink" version="2.0"> + <gsc:responsibleOrgName>{browse_report['responsible_org_name']}</gsc:responsibleOrgName> + <gsc:reportType>BROWSE</gsc:reportType> + <gsc:dateTime>{browse_report['date_time']}</gsc:dateTime> + <gsc:orderReference></gsc:orderReference> + <gsc:opt_metadata version="1.2.1"> + <gml:metaDataProperty> + <gsc:EarthObservationMetaData> + <eop:identifier>{browse['browse_identifier']}</eop:identifier> + </gsc:EarthObservationMetaData> + </gml:metaDataProperty> + <gml:validTime> + <gml:TimePeriod> + <gml:beginPosition>{browse['start_time']}</gml:beginPosition> + <gml:endPosition>{browse['end_time']}</gml:endPosition> + </gml:TimePeriod> + </gml:validTime> + <gml:using> + <eop:EarthObservationEquipment> + </eop:EarthObservationEquipment> + </gml:using> + <gml:target> + <eop:Footprint> + <gml:multiExtentOf> + <gml:MultiSurface srsName="EPSG:4326"> + <gml:surfaceMembers> + <gml:Polygon> + <gml:exterior> + <gml:LinearRing> + <gml:posList>{footprint}</gml:posList> + </gml:LinearRing> + </gml:exterior> + </gml:Polygon> + </gml:surfaceMembers> + </gml:MultiSurface> + </gml:multiExtentOf> + </eop:Footprint> + </gml:target> + <gml:resultOf/> + </gsc:opt_metadata> + </gsc:report> + """) diff --git a/preprocessor/preprocessor/steps/calc.py b/preprocessor/preprocessor/steps/calc.py new file mode 100644 index 0000000000000000000000000000000000000000..493c6c5c9dcc8333549ad12e6dd3938e5deba371 --- /dev/null +++ b/preprocessor/preprocessor/steps/calc.py @@ -0,0 +1,56 @@ 
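The `footprint_browse` branch of `apply_browse_report_georeference()` above pairs pixel positions with geographic coordinates and attaches them as GCPs. A sketch of the browse-report structures it consumes, with hypothetical values (the field names follow the keys read by the code; directories mirror the `download`/`unpack`/`extra` layout used by `preprocess_browse`):

```python
from preprocessor.steps.browse_report import browse_georeference

browse_report = {
    "responsible_org_name": "EOX",          # embedded in the generated GSC report
    "date_time": "2020-06-15T12:00:00Z",
}
browse = {
    "type": "footprint_browse",
    "browse_identifier": "example_browse",
    "start_time": "2020-06-10T08:00:00Z",
    "end_time": "2020-06-10T08:01:00Z",
    "footprint": {
        # (col, row) pixel positions and their (lat, lon) counterparts;
        # a closed ring is detected and the duplicated last point is dropped
        "col_row_list": [(0, 0), (1023, 0), (1023, 1023), (0, 1023), (0, 0)],
        "coord_list": [(11.0, 20.0), (11.0, 21.0), (10.0, 21.0), (10.0, 20.0), (11.0, 20.0)],
    },
}
# copies the browse from 'download/' into 'unpack/' as a GeoTIFF with GCPs attached
browse_georeference("download", "unpack", "extra", browse_report, browse)
```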
+import os +from os.path import basename, dirname, join, isfile +import subprocess +from typing import List +from glob import glob +import shutil +import logging + +from ..util import replace_ext + + +logger = logging.getLogger(__name__) + + +def calc_step(source_dir: os.PathLike, target_dir: os.PathLike, formulas: List[dict]): + for i, item in enumerate(formulas): + # get first filename as a base + filename = glob(join(source_dir, list(item['inputs'].values())[0]['glob']))[0] + target_filename = join( + target_dir, + replace_ext(basename(filename), item.get('output_postfix', '_proc%d' % i) + '.tif', False) + ) + + if isfile(target_filename): + logger.warn('Calc output filename %s already exists' % target_filename) + + calc_formula(source_dir, item['inputs'], target_filename, item['formula'], item.get('data_type', 'Float32'), item.get('nodata_value', None)) + + # take all original files with from the last step + for filename in glob('%s/*' % source_dir): + target_filename = join(target_dir, basename(filename)) + if isfile(target_filename): + logger.warn('Calc output filename %s already exists' % target_filename) + shutil.copy(filename, target_filename) + + +def calc_formula(source_dir: os.PathLike, inputs: List[dict], target_filename: os.PathLike, formula: str, data_type: str="Float32", nodata_value: float=None): + cmd = [ + "gdal_calc.py", + "--calc=%s" % formula, + "--outfile=%s" % target_filename, + "--type", data_type, + ] + + for name in inputs: + # select first + filename = glob(join(source_dir, inputs[name]['glob']))[0] + cmd.extend([ + "-%s" % name, filename, + "--%s_band=%d" % (name, inputs[name].get('band', 1)), + ]) + + if nodata_value is not None: + cmd.append("--NoDataValue=%f" % nodata_value) + + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) diff --git a/preprocessor/preprocessor/steps/georeference.py b/preprocessor/preprocessor/steps/georeference.py new file mode 100644 index 0000000000000000000000000000000000000000..129074815f84b8f10720d367d09606a155677251 --- /dev/null +++ b/preprocessor/preprocessor/steps/georeference.py @@ -0,0 +1,207 @@ +import os +from os.path import join, basename, splitext +import logging +from glob import glob +import shutil +from typing import List, Tuple + +from ..util import gdal, osr, replace_ext + + +logger = logging.getLogger(__name__) + + +def georeference_step(source_dir: os.PathLike, target_dir: os.PathLike, type: str, **options: dict): + type_name = type.lower() + if type_name == 'gcp': + georef_func = gcp_georef + elif type_name == 'rpc': + georef_func = rpc_georef + elif type_name == 'world': + georef_func = world_georef + elif type_name == 'corners': + georef_func = corner_georef + else: + raise Exception('Invalid georeference type %s' % type_name) + for filename in [path for path in glob(join(source_dir, '**'), recursive=True) if not os.path.isdir(path)]: + target_filename = join(target_dir, basename(filename)) + georef_func(filename, target_filename, **options) + + +def gcp_georef(input_filename: os.PathLike, target_filename: os.PathLike, order: int=1, projection: str='EPSG:4326', + tps: bool=False): + succeded = False + + # simple case: get the geotransform from some GCPs + try: + ds = gdal.Open(input_filename, gdal.GA_Update) + except RuntimeError: + logger.warn('Can not open file by GDAL %s' % (input_filename)) + return + if ds.GetGCPCount() <= 4: + try: + gcps = ds.GetGCPs() + gt = gdal.GCPsToGeoTransform(gcps) + ds.SetGeoTransform(gt) + except Exception: + del ds + logger.warning("Failed to get geotransform 
via GCPsToGeoTransform") + else: + del ds + shutil.move(input_filename, target_filename) + succeded = True + + # otherwise warp + if not succeded: + logger.info("Applying GCP transform by warping") + + if tps: + options = { + 'tps': tps + } + else: + options = { + 'polynomialOrder': order + } + + gdal.Warp( + target_filename, + input_filename, + dstSRS=projection, + **options, + ) + +def rpc_georef(input_filename: os.PathLike, target_filename: os.PathLike, rpc_file_template: str='{fileroot}.RPC', warp_options: dict=None): + fileroot, extension = splitext(input_filename) + rpc_file_glob = rpc_file_template.format( + filename=input_filename, fileroot=fileroot, + extension=extension, + ) + rpc_filename = None + try: + rpc_filename = glob(rpc_file_glob, recursive=True)[0] + except IndexError: + logger.warn('No RPC filename found with glob %s' % rpc_file_glob) + # rename RPC filename to be compatible with GDAL + if rpc_filename: + shutil.move(rpc_filename, replace_ext(input_filename, '.rpc')) + + gdal.Warp( + target_filename, + input_filename, + rpc=True, + **(warp_options or {}) + ) + + +def corner_georef(input_filename: os.PathLike, target_filename: os.PathLike, corner_names: List[str]=None, + orbit_direction_name: str=None, force_north_up: bool=False, gcp_srid: int=4326, warp: bool=False): + corner_names = corner_names or ["bottom_left", "bottom_right", "top_left", "top_right"] + ds = gdal.Open(input_filename, gdal.GA_Update) + + orbit_direction = ds.GetMetadata()[orbit_direction_name].lower() + metadata = ds.GetRasterBand(1).GetMetadata() + + # from pprint import pprint + + # pprint (metadata) + # pprint(ds.GetMetadata()) + bl, br, tl, tr = [ + [float(num) for num in metadata[corner_name].split()] + for corner_name in corner_names + ] + + gcps = gcps_from_borders( + (ds.RasterXSize, ds.RasterYSize), + (bl, br, tl, tr), + orbit_direction, + force_north_up + ) + + sr = osr.SpatialReference() + sr.ImportFromEPSG(gcp_srid) + + ds.SetGCPs(gcps, sr.ExportToWkt()) + + if warp: + gdal.Warp( + target_filename, + ds, + ) + del ds + else: + ds.SetGeoTransform(gdal.GCPsToGeoTransform(ds.GetGCPs())) + driver = ds.GetDriver() + del ds + driver.Rename(target_filename, input_filename) + + +def world_georef(): + # TODO: implement + pass + + + + +def gcps_from_borders(size: Tuple[float, float], coords: List[Tuple[float, float]], orbit_direction: str, force_north_up: bool=False): + x_size, y_size = size + # expects coordinates in dict(.*border_left.*:[lat,lon],...) 
+ gcps = [] + if force_north_up and len(coords) == 4: + # compute gcps assuming north-up, east-right image no matter, what is claimed by metadata + sorted_by_lats = sorted(coords, key=lambda x: x[0], reverse=True) + # compare longitudes + if sorted_by_lats[0][1] > sorted_by_lats[1][1]: + # /\ + # 1 \ \ + # top - left corner has lower latitude from two northernmost \/ + if orbit_direction != "descending": + top_left = sorted_by_lats[1] + top_right = sorted_by_lats[0] + bottom_left = sorted_by_lats[3] + bottom_right = sorted_by_lats[2] + else: + top_left = sorted_by_lats[3] + top_right = sorted_by_lats[2] + bottom_left = sorted_by_lats[1] + bottom_right = sorted_by_lats[0] + else: + # /\ + # 2 / / + # top - left corner has higher latitude from two northernmost \/ + if orbit_direction != "descending": + top_left = sorted_by_lats[0] + top_right = sorted_by_lats[1] + bottom_left = sorted_by_lats[2] + bottom_right = sorted_by_lats[3] + else: + top_left = sorted_by_lats[2] + top_right = sorted_by_lats[3] + bottom_left = sorted_by_lats[0] + bottom_right = sorted_by_lats[1] + gcps.append(gdal.GCP(bottom_left[1], bottom_left[0], 0, 0.5, 0.5)) + gcps.append(gdal.GCP(bottom_right[1], bottom_right[0], 0, x_size - 0.5, 0.5)) + gcps.append(gdal.GCP(top_left[1], top_left[0], 0, 0.5, y_size - 0.5)) + gcps.append(gdal.GCP(top_right[1], top_right[0], 0, x_size - 0.5, y_size - 0.5)) + + else: + bl, br, tl, tr = coords + + x_left = x_size - 0.5 + x_right = 0.5 + + y_bottom = 0.5 + y_top = y_size - 0.5 + + if orbit_direction == 'descending': + x_left, x_right = x_right, x_left + y_bottom, y_top = y_top, y_bottom + + gcps.extend([ + gdal.GCP(bl[1], bl[0], 0, x_left, y_bottom), + gdal.GCP(br[1], br[0], 0, x_right, y_bottom), + gdal.GCP(tl[1], tl[0], 0, x_left, y_top), + gdal.GCP(tr[1], tr[0], 0, x_right, y_top), + ]) + + return gcps diff --git a/preprocessor/preprocessor/steps/output.py b/preprocessor/preprocessor/steps/output.py new file mode 100644 index 0000000000000000000000000000000000000000..d90c53435419f999964899080a7fa7fb6f277ef3 --- /dev/null +++ b/preprocessor/preprocessor/steps/output.py @@ -0,0 +1,40 @@ +import os +from os.path import join, basename +from uuid import uuid4 +from glob import glob + +from ..util import replace_ext, gdal +import logging + +logger = logging.getLogger(__name__) + + +def output_step(source_dir: os.PathLike, target_dir: os.PathLike, options: dict=None): + # find out the driver to get the extension + options = options if options is not None else {} + frmt = options.get('format', 'GTiff') + driver = gdal.GetDriverByName(frmt) + if not driver: + raise ValueError('Unsupported driver %s' % frmt) + extension = driver.GetMetadata().get('DMD_EXTENSIONS', 'tif').split(' ')[0] + # warp each individual file + warped_files = [] + for filename in [path for path in glob(join(source_dir, '**'), recursive=True) if not os.path.isdir(path)]: + target_filename = join(target_dir, replace_ext(basename(filename), extension)) + logger.debug('Warping file %s' % filename) + gdal.Warp(target_filename, filename, options=gdal.WarpOptions( + **options + )) + warped_files.append(target_filename) + + if len(warped_files) > 1: + tmp_filename = join(target_dir, '%s.%s' % (uuid4().hex, extension)) + logger.debug('Warping files %s' % warped_files) + gdal.Warp(tmp_filename, warped_files, options=gdal.WarpOptions( + **options + )) + + # delete old files and rename the combined file to the first filename + for filename in warped_files: + os.unlink(filename) + os.rename(tmp_filename, warped_files[0]) diff --git 
a/preprocessor/preprocessor/steps/stack.py b/preprocessor/preprocessor/steps/stack.py new file mode 100644 index 0000000000000000000000000000000000000000..809f3ce7e574e31e3788cb3e15cab5fd1c489186 --- /dev/null +++ b/preprocessor/preprocessor/steps/stack.py @@ -0,0 +1,52 @@ +import os +from os.path import basename, join, splitext +from itertools import groupby +import re +from glob import glob +from typing import List + +from ..util import replace_ext, gdal + + +def stack_bands_step(source_dir: os.PathLike, target_dir: os.PathLike, group_by: str=None, sort_by: str=None, order: List[str]=None): + """ Stack bands of the individual images + """ + filenames = [path for path in glob(join(source_dir, '**'), recursive=True) if not os.path.isdir(path)] + # check if we have a group_by regex. If yes, use the first + # re-group to group by. + # Fallback is basename of file as groupname + if group_by: + re_group_by = re.compile(group_by) + groups = { + k: list(v) + for k, v in groupby(filenames, key=lambda v: re_group_by.match(v).group(1)) + } + else: + groups = {basename(filenames[0]): filenames} + + for groupname, group in groups.items(): + # check if a sort_by is specified. if yes, use the sort_by regex group + # and optionally a ordered list to order the filenames + if sort_by: + re_sort_by = re.compile(sort_by) + + if order: + group = [ + v for v in group + if re_sort_by.match(v) + and re_sort_by.match(v).group(1) in order + ] + + group = sorted( + group, + key=lambda v: order.index(re_sort_by.match(v).group(1)) + ) + else: + group = sorted( + group, + key=lambda v: re_sort_by.match(v).group(1) + ) + + # build a VRT to stack bands for each group + vrt_filename = replace_ext(join(target_dir, groupname), '.vrt') + gdal.BuildVRT(vrt_filename, group, separate=True) diff --git a/preprocessor/preprocessor/steps/subdataset.py b/preprocessor/preprocessor/steps/subdataset.py new file mode 100644 index 0000000000000000000000000000000000000000..a284de94bae96d40a0156b766f512d2c3946834d --- /dev/null +++ b/preprocessor/preprocessor/steps/subdataset.py @@ -0,0 +1,36 @@ +import os +from os.path import join, splitext, basename, dirname +from glob import glob +from typing import Dict + +from ..util import replace_ext, gdal + + +def extract_subdataset_step(source_dir: os.PathLike, target_dir: os.PathLike, data_file_glob: str, subdataset_types: Dict[str, str]=None): + datafiles = glob(join(source_dir, data_file_glob)) + if not datafiles: + raise Exception('No datafiles were matched by the provided glob') + + for filename in datafiles: + extract_subdatasets( + filename, + target_dir, + subdataset_types + ) + + +def extract_subdatasets(source_filename: os.PathLike, target_dir: os.PathLike, subdataset_types: Dict[str, str]=None): + ds = gdal.Open(source_filename) + + sub_datasets = [] + for locator, _ in ds.GetSubDatasets(): + _, _, sd_type = locator.split(':') + if subdataset_types is None or sd_type in subdataset_types: + sub_datasets.append((locator, subdataset_types[sd_type])) + + if not sub_datasets: + raise Exception('No subdatasets were matched by the provided types') + + for locator, suffix in sub_datasets: + target_filename = join(target_dir, basename(replace_ext(source_filename, '%s.tif' % suffix))) + gdal.Translate(target_filename, locator, format='GTiff') diff --git a/preprocessor/preprocessor/transfer/__init__.py b/preprocessor/preprocessor/transfer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..824127913807029b545af0f77c69bc4109c9fbd8 --- /dev/null +++ 
b/preprocessor/preprocessor/transfer/__init__.py @@ -0,0 +1 @@ +from .util import get_downloader, get_uploader \ No newline at end of file diff --git a/preprocessor/preprocessor/transfer/abc.py b/preprocessor/preprocessor/transfer/abc.py new file mode 100644 index 0000000000000000000000000000000000000000..9cc9818dcd1393b73f2c60a1fe78ec0cce98aad5 --- /dev/null +++ b/preprocessor/preprocessor/transfer/abc.py @@ -0,0 +1,36 @@ +from os import PathLike +from abc import ABC, abstractmethod +from typing import List, Optional, Union + +# copied from: https://github.com/python/cpython/blob/3.8/Lib/_collections_abc.py +def _check_methods(C, *methods): + mro = C.__mro__ + for method in methods: + for B in mro: + if method in B.__dict__: + if B.__dict__[method] is None: + return NotImplemented + break + else: + return NotImplemented + return True + + +class Downloader(ABC): + """ ABC for file downloaders. Implementing classes shall download files + from a given storage location to be preprocessed. + """ + + @abstractmethod + def download(self, remote_path: PathLike, local_path: PathLike) -> PathLike: + pass + + +class Uploader(ABC): + @abstractmethod + def upload(self, local_path: Union[PathLike, List[PathLike]], remote_dir: PathLike) -> List[PathLike]: + pass + + @abstractmethod + def product_exists(self, remote_dir: PathLike) -> bool: + pass diff --git a/preprocessor/preprocessor/transfer/local.py b/preprocessor/preprocessor/transfer/local.py new file mode 100644 index 0000000000000000000000000000000000000000..f1450f7d6b18d2e0a98232023d5bda5cb6db667a --- /dev/null +++ b/preprocessor/preprocessor/transfer/local.py @@ -0,0 +1,48 @@ +import os +import os.path +import shutil +from typing import List, Union +import logging + +logger = logging.getLogger(__name__) + +class Base: + def __init__(self, storage_path: os.PathLike): + self.storage_path = storage_path + + +class Downloader(Base): + """ Downloader for OpenStack swift object storages + """ + def download(self, remote_path: os.PathLike, local_dir: os.PathLike) -> os.PathLike: + os.path.join(self.storage_path, remote_path) + + return shutil.copy2(os.path.join(self.storage_path, remote_path), local_dir) + + +class Uploader(Base): + """ Uploader for OpenStack swift object storages + """ + def upload(self, local_path: Union[os.PathLike, List[os.PathLike]], remote_dir: os.PathLike) -> List[os.PathLike]: + paths = local_path if isinstance(local_path, List) else [local_path] + remote_paths = [ + os.path.join( + self.storage_path, + remote_dir, + os.path.basename(path) + ) + for path in paths + ] + + for local_path, remote_path in zip(paths, remote_paths): + os.makedirs(os.path.dirname(remote_path), exist_ok=True) + shutil.copy2(local_path, remote_path) + + return remote_paths + + def product_exists(self, remote_dir: os.PathLike) -> bool: + remote_path = os.path.join(self.storage_path, remote_dir) + for r, d, f in os.walk(remote_path): + if len(f) >= 2: + return True + return False diff --git a/preprocessor/preprocessor/transfer/swift.py b/preprocessor/preprocessor/transfer/swift.py new file mode 100644 index 0000000000000000000000000000000000000000..a58f7fb146530860a1b7af61961b38ea6b6849dc --- /dev/null +++ b/preprocessor/preprocessor/transfer/swift.py @@ -0,0 +1,135 @@ +import os +import os.path +from typing import List, Union +import logging + +from swiftclient.multithreading import OutputManager +from swiftclient.service import SwiftError, SwiftService, SwiftUploadObject + +logging.getLogger("requests").setLevel(logging.WARNING) 
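+# keep third-party client logging at WARNING so the preprocessor's own messages stand out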
+logging.getLogger("swiftclient").setLevel(logging.WARNING) +logger = logging.getLogger(__name__) + +class Base: + def __init__(self, username=None, password=None, tenant_name=None, + tenant_id=None, region_name=None, user_domain_id=None, + user_domain_name=None, auth_url=None, auth_version=None, + container=None): + self.username = username + self.password = password + self.tenant_name = tenant_name + self.tenant_id = tenant_id + self.region_name = region_name + self.user_domain_id = user_domain_id + self.user_domain_name = user_domain_name + self.auth_url = auth_url + self.auth_version = auth_version # TODO: assume 3 + self.container = container + + def get_service(self): + return SwiftService(options={ + "os_username": self.username, + "os_password": self.password, + "os_tenant_name": self.tenant_name, + "os_tenant_id": self.tenant_id, + "os_region_name": self.region_name, + "os_auth_url": self.auth_url, + "auth_version": self.auth_version, + "os_user_domain_id": self.user_domain_id, + "os_user_domain_name": self.user_domain_name, + }) + + def validate_container(self, remote_dir): + if self.container: + # container was specified, use it + return self.container, remote_dir + # container needs to be extracted from path + # paths needs to be updated + return remote_dir.partition('/')[0], remote_dir.partition('/')[2] + + +class Downloader(Base): + """ Downloader for OpenStack swift object storages + """ + def download(self, remote_path: os.PathLike, local_path: os.PathLike) -> os.PathLike: + container, remote_path = self.validate_container(remote_path) + target_filename = os.path.join(local_path, os.path.basename(remote_path)) + with self.get_service() as swift: + results = swift.download( + container, + [remote_path], + options={ + 'out_file': target_filename + } + ) + + for result in results: + if result["success"]: + return target_filename + else: + raise Exception('Failed to download %s' % remote_path) + + +class Uploader(Base): + """ Uploader for OpenStack swift object storages + """ + def upload(self, local_path: Union[os.PathLike, List[os.PathLike]], remote_dir: os.PathLike) -> List[os.PathLike]: + paths = local_path if isinstance(local_path, List) else [local_path] + container, remote_dir = self.validate_container(remote_dir) + remote_paths = [ + os.path.join( + remote_dir, + os.path.basename(path) + ) + for path in paths + ] + objects = [ + SwiftUploadObject( + path, + object_name=remote_path + ) + for path, remote_path in zip(paths, remote_paths) + ] + + max_size = max([os.stat(path).st_size for path in paths]) + options = {} + + # use segment options if file is bigger than 5GB + if (max_size > 1024 * 1024 * 1024 * 5): + options['segment_size'] = 2 * 1024 * 1024 * 1024 + options['use_slo'] = True + + with self.get_service() as swift: + # use container or first part of path + results = swift.upload(container=container, objects=objects, options=options) + + for result in results: + if result["success"]: + if "object" in result: + logger.info( + "'%s' successfully uploaded." % result["object"] + ) + elif "for_object" in result: + logger.debug( + "Successfully uploaded '%s' segment '%s'." 
+ % (result["for_object"], result["segment_index"]) + ) + else: + logger.error( + "'%s' upload failed" % result["error"] + ) + raise Exception('Failed to upload %s' % result["error"]) + + return remote_paths + + def product_exists(self, remote_dir: os.PathLike) -> bool: + with self.get_service() as swift: + container, remote_dir = self.validate_container(remote_dir) + list_parts_gen = swift.list( + container=container, options={"prefix": remote_dir}, + ) + for page in list_parts_gen: + if page["success"] and len(page["listing"]) >= 2: + # at least two files present -> pass validation + return True + return False diff --git a/preprocessor/preprocessor/transfer/util.py b/preprocessor/preprocessor/transfer/util.py new file mode 100644 index 0000000000000000000000000000000000000000..bc14bbf5695e7a06d49a9a1bc5ffddc7090d51d6 --- /dev/null +++ b/preprocessor/preprocessor/transfer/util.py @@ -0,0 +1,22 @@ +from . import swift +from . import local + +from .abc import Downloader, Uploader + + +def get_downloader(type_name, args, kwargs) -> Downloader: + if type_name == 'swift': + return swift.Downloader(*args or [], **kwargs or {}) + elif type_name == 'local': + return local.Downloader(*args or [], **kwargs or {}) + + raise Exception('Downloader type %s is not supported' % type_name) + + +def get_uploader(type_name, args, kwargs) -> Uploader: + if type_name == 'swift': + return swift.Uploader(*args or [], **kwargs or {}) + elif type_name == 'local': + return local.Uploader(*args or [], **kwargs or {}) + + raise Exception('Uploader type %s is not supported' % type_name) diff --git a/preprocessor/preprocessor/util.py b/preprocessor/preprocessor/util.py new file mode 100644 index 0000000000000000000000000000000000000000..249176c5643c3f58d5fedbfed94a478992679dee --- /dev/null +++ b/preprocessor/preprocessor/util.py @@ -0,0 +1,68 @@ +import os +from os.path import splitext +from contextlib import contextmanager +from tempfile import TemporaryDirectory, mkdtemp +from time import time + +try: + from osgeo import gdal +except ImportError: + import gdal + +gdal.UseExceptions() + +try: + from osgeo import osr +except ImportError: + import osr + +osr.UseExceptions() + + +def replace_ext(filename: os.PathLike, new_ext: str, force_dot: bool=True) -> os.PathLike: + return splitext(filename)[0] + ('' if new_ext.startswith('.') or not force_dot else '.') + new_ext + + +@contextmanager +def workdir(config: dict, use_dir: os.PathLike=None): + prefix = config.get('prefix', 'preprocess_') + workdir = config.get('workdir') + if use_dir: + os.chdir(use_dir) + yield use_dir + elif config.get('keep_temp'): + dirname = mkdtemp(prefix=prefix, dir=workdir) + os.chdir(dirname) + yield dirname + else: + with TemporaryDirectory(prefix=prefix, dir=workdir) as dirname: + os.chdir(dirname) + yield dirname + + +def pairwise(col): + iterator = iter(col) + while True: + try: + yield (next(iterator), next(iterator)) + except StopIteration: + break + + +class Timer: + """ Helper timer class to allow logging of timing values + """ + def __init__(self): + self.start = None + self.end = None + + def __enter__(self): + self.start = time() + return self + + def __exit__(self, *args, **kwargs): + self.end = time() + + @property + def elapsed(self): + return (self.end if self.end is not None else time()) - self.start diff --git a/preprocessor/run-preprocessor.sh b/preprocessor/run-preprocessor.sh index 7ae857f3b9a6beb9a8c5197115df75da76380ab3..70a8aee6572806b30e158b706fc65fbd95ad427c 100644 --- a/preprocessor/run-preprocessor.sh +++ 
b/preprocessor/run-preprocessor.sh @@ -1,5 +1,16 @@ #!/bin/sh echo "Running preprocessor" +debug="--no-debug" +if test "$PREPROCESSOR_DEBUG" = true; then + debug="--debug" +fi -python3 /preprocessor.py --mode redis --redis-host ${REDIS_HOST} --redis-port ${REDIS_PORT} --redis-preprocess-queue-key ${REDIS_PREPROCESS_QUEUE_KEY} --redis-register-queue-key ${REDIS_REGISTER_QUEUE_KEY} +preprocessor daemon \ + --config-file /config.yaml \ + --host ${REDIS_HOST} \ + --port ${REDIS_PORT} \ + --listen-queue ${REDIS_PREPROCESS_QUEUE_KEY} \ + --listen-md-queue ${REDIS_PREPROCESS_MD_QUEUE_KEY} \ + --write-queue ${REDIS_REGISTER_QUEUE_KEY} \ + ${debug} \ No newline at end of file diff --git a/preprocessor/setup.py b/preprocessor/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..3b90e5064267302085796c218846f79fe551945c --- /dev/null +++ b/preprocessor/setup.py @@ -0,0 +1,28 @@ +from setuptools import setup, find_packages + +# with open("README.md", "r") as fh: +# long_description = fh.read() +long_description = "" + +setup( + name="preprocessor", # Replace with your own username + version="0.0.1", + author="", + author_email="", + description="preprocessor for PVS", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://gitlab.eox.at/esa/prism/vs/-/tree/master/preprocessor", + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', + entry_points={ + "console_scripts": [ + "preprocessor = preprocessor.cli:cli", + ], + } +) diff --git a/preprocessor/transform_chain.py b/preprocessor/transform_chain.py deleted file mode 100644 index 9c3977b1993c434c0362978c56f13c91014342fe..0000000000000000000000000000000000000000 --- a/preprocessor/transform_chain.py +++ /dev/null @@ -1,184 +0,0 @@ -from osgeo import gdal, osr -from math import ceil, floor -import logging - -gdal.SetConfigOption("GDAL_CACHEMAX", "1000") -logger = logging.getLogger("preprocessor") - -GDT_SIZES = { - # geotiff data types and their respective sizes in bits - "Byte": 8, - "Int16": 16, - "UInt16": 16, - "CInt16": 16, - "Int32": 32, - "UInt32": 32, - "CInt32": 32, - "Float32": 32, - "CFloat32": 32, - "Float64": 64, - "CFloat64": 64 -} - - -def open_gdal_dataset(input_file): - logger.debug("Opening file using GDAL: %s" % input_file) - return gdal.Open(input_file) - - -def apply_rpc(src_ds): - # if RPC metadata on source is present, apply it as geotransform using warp through temporary vrt - rpc = src_ds.GetMetadata("RPC") - if rpc: - logger.debug("Applying RPC metadata.") - dst_ds = gdal.Warp("", src_ds, dstSRS="EPSG:4326", format="VRT", rpc=True, multithread=True, warpMemoryLimit=1024*1024*1024, resampleAlg=gdal.GRA_NearestNeighbour) - return dst_ds - return src_ds - - -def set_rpc_metadata(src_ds, dst_ds): - # sets RPC metadata from one source dataset to destination dataset if present - rpc = src_ds.GetMetadata('RPC') - if rpc: - dst_ds.SetMetadata(rpc, 'RPC') - return dst_ds - - -def write_gdal_dataset(src_ds, driver_name, output_file_name, creation_options=[]): - # writes dataset to an output file using a given driver and array of creation options (-CO) - # returns filelist as an array (for later reference) - driver_instance = gdal.GetDriverByName(driver_name) - dst_ds = driver_instance.CreateCopy(output_file_name, src_ds, strict=0, options=creation_options) - dst_ds = None # write to disk - return [output_file_name] - - -def 
write_gdal_dataset_split(src_ds, driver_name, output_file_name, creation_options=[], split_parts=1): - # writes dataset to an output file using a given driver and array of creation options (-CO) - # returns a list of created files - if driver_name == "COG": - # COG driver does not have Create method, need to create in memory raster first and then CreateCopy - driver_instance = gdal.GetDriverByName("MEM") - creation_options_valid = [] # creation options applied only to result driver - else: - driver_instance = gdal.GetDriverByName(driver_name) - creation_options_valid = creation_options - logger.info("Writing file to disk. %s" % ("Splitting into %s stripes." % split_parts if split_parts > 1 else "")) - if split_parts > 1: - # get image properties - dst_filenames = [] - y_coord = 0 - cols = src_ds.RasterXSize - rows = src_ds.RasterYSize - bands = src_ds.RasterCount - data_type = src_ds.GetRasterBand(1).DataType - geo_transform = src_ds.GetGeoTransform() - projection = osr.SpatialReference() - projection.ImportFromWkt(src_ds.GetProjectionRef()) - if not geo_transform: - # provide some default when not set, which will not make sense - # but at least split images will not be stacked on top of each other - geo_transform = [0.0, 1.0, 0.0, 0.0, 0.0, -1.0] - y_origin = geo_transform[3] # uly - pixelHeight = geo_transform[5] # yres - size = floor(rows / split_parts) # height of one strip - for i in range(split_parts): - # add underscore and padded index to between filename and extension if there is one - output_file_name_split = output_file_name.rsplit(".", 1) - output_file_name_crop = "%s_%04d.%s" % (output_file_name_split[0], i, output_file_name_split[1] if len(output_file_name_split) > 1 else "") - logger.debug("Creating in memory dataset stripe %04d." % i) - if i < split_parts - 1: - # all stripes except the last one - # create a new dataset - dst_ds = driver_instance.Create(output_file_name_crop, xsize=cols, ysize=size, bands=bands, eType=data_type, options=creation_options_valid) - # fill bands with data - for band in range(1, bands + 1): - data = src_ds.GetRasterBand(band).ReadAsArray(0, y_coord, cols, size) # xoff, yoff, xsize, ysize - dst_ds.GetRasterBand(band).WriteArray(data) - no_data_value = src_ds.GetRasterBand(band).GetNoDataValue() - if no_data_value: - dst_ds.GetRasterBand(band).SetNoDataValue(no_data_value) - # move y_start for next iteration - y_coord += size - else: - # for last stripe ysize will be larger by remainder after division by split_parts - dst_ds = driver_instance.Create(output_file_name_crop, xsize=cols, ysize=size + (rows % split_parts), bands=bands, eType=data_type, options=creation_options_valid) - for band in range(1, bands + 1): - data = src_ds.GetRasterBand(band).ReadAsArray(0, y_coord, cols, size + rows % split_parts) # xoff, yoff, xsize, ysize - dst_ds.GetRasterBand(band).WriteArray(data) - no_data_value = src_ds.GetRasterBand(band).GetNoDataValue() - if no_data_value: - dst_ds.GetRasterBand(band).SetNoDataValue(no_data_value) - # set new geotransform - new_y = y_origin + (i * size * pixelHeight) - dst_ds.SetGeoTransform((geo_transform[0], geo_transform[1], geo_transform[2], new_y, geo_transform[4], geo_transform[5])) - dst_ds.SetProjection(projection.ExportToWkt()) - logger.debug("Writing stripe file %04d." 
% i) - if driver_name == "COG": - cog_driver = gdal.GetDriverByName("COG") - if cog_driver: - output_ds = cog_driver.CreateCopy(output_file_name_crop, dst_ds, options=creation_options) - output_ds = None # write to disk - else: - raise Exception("COG driver was not found. Please upgrade GDAL to version >= 3.1") - else: - dst_ds = None # write to disk - dst_filenames.append(output_file_name_crop) - else: - # do not split, just create output - dst_filenames = write_gdal_dataset(src_ds, driver_name, output_file_name, creation_options) - logger.info("Finished writing %s files to disk." % split_parts) - return dst_filenames - - -def correct_geo_transform(src_dst): - # input - gdal dataset - # sets new geotransform if necessary by creating control points of a raster with switched height and width - by Petr - # returns - gdal dataset - ulx, xres, xskew, uly, yskew, yres = src_dst.GetGeoTransform() - # test geotransform if necessary to shift - if xres == 0.0 and yres == 0.0: - logger.debug("Malformed geotransform xres,yres=0 detected, correcting.") - # malformed image, compute xres and yres switched in geotransform - lrx = ulx + (src_dst.RasterXSize * xskew) - lry = uly + (src_dst.RasterYSize * yskew) - # [ulx, lrx, lry, uly] - bounds = lon_min, lon_max, lat_min, lat_max - fp = [[0, src_dst.RasterXSize, src_dst.RasterXSize, 0], [0, 0, src_dst.RasterYSize, src_dst.RasterYSize]] - tp = [[ulx, lrx, lrx, ulx], [lry, lry, uly, uly]] - pix = list(zip(fp[0], fp[1])) - coor = list(zip(tp[0], tp[1])) - # compute the gdal.GCP parameters - gcps = [] - for index, txt in enumerate(pix): - gcps.append(gdal.GCP()) - gcps[index].GCPPixel = pix[index][0] - gcps[index].GCPLine = src_dst.RasterYSize - int(pix[index][1]) - gcps[index].GCPX = coor[index][0] - gcps[index].GCPY = coor[index][1] - # get correct geotransform from gcps - geotransform_new = gdal.GCPsToGeoTransform(gcps) - # overwrite geotransform with new - src_dst.SetGeoTransform(geotransform_new) - return src_dst - - -def split_check(dst, limit=1024 ** 3): - # returns number of parts to which split the resulting image to get each part within limit - # assuming non-compressed resulting image - parts = 1 - size = 0 - if dst is not None: - for i in range(1, dst.RasterCount + 1): - size += dst.RasterXSize * dst.RasterYSize * GDT_SIZES[gdal.GetDataTypeName(dst.GetRasterBand(i).DataType)] / 8 - parts = ceil(size / limit) - return parts - - -def create_vrt_dataset(src_dst_array, dst_ds_name): - return gdal.BuildVRT(dst_ds_name, src_dst_array) - - -def validate_band_count(src_ds, count=4): - if src_ds.RasterCount == count: - return src_ds - return None diff --git a/registrar_test.py b/registrar_test.py deleted file mode 100644 index 7d2df6332b55589a3d6ad7f80b91937599ff35dd..0000000000000000000000000000000000000000 --- a/registrar_test.py +++ /dev/null @@ -1,48 +0,0 @@ -import psycopg2 -import os -import csv - - -with open('./env/emg_db.env', 'r') as f: - env = dict( - line.split('=', 1) - for line in f - ) -database= env['DB_NAME'].replace('\n','') -port = env['DB_PORT'].replace('\n','') -host = env['DB_HOST'].replace('\n','') -database_password= env['DB_PW'].replace('\n','') -database_user = env['DB_USER'].replace('\n','') - - -def connect_to_db(eo_id): - global db_name, coverage_id - connection= None - try: - connection = psycopg2.connect(dbname=database, user=database_user, password=database_password, host='docker', port=port) - cursor = connection.cursor() - db_name = connection.get_dsn_parameters()["dbname"] - postgreSQL_select_Query = "SELECT identifier 
FROM coverages_eoobject WHERE identifier = '%s';" % eo_id - cursor.execute(postgreSQL_select_Query) - coverage_id = cursor.fetchone()[0] - - except (Exception, psycopg2.Error) as error : - print ("Error while connecting to PostgreSQL", error) - finally: - #closing database connection. - if connection: - cursor.close() - connection.close() - print("PostgreSQL connection is closed") - - - -def test_db_name(name): - with open(name, newline='') as csvfile: - spamreader = csv.reader(csvfile) - for row in spamreader: - identifier = row[0].split('/')[4] - connect_to_db(identifier) - assert coverage_id == identifier - assert db_name == database - diff --git a/registrar_test.sh b/registrar_test.sh deleted file mode 100755 index d394a2d1de2fe73b2157adab0136e08c2d2c7708..0000000000000000000000000000000000000000 --- a/registrar_test.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -product_list_file=$1 -docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /var/www/pvs/dev/pvs_instance/manage.py storage create pvs_testing pvs_testing --type swift --storage-auth auth-cloud-ovh -IFS="," -while read product; do - docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /registrar.py --objects-prefix $product <<<$product - -done < $product_list_file - -pytest -s registrar_test.py --name $product_list_file diff --git a/testing/docker-stack-wait.sh b/testing/docker-stack-wait.sh new file mode 100755 index 0000000000000000000000000000000000000000..c7f00a3199b8562cd7759e4f481709a306a9768a --- /dev/null +++ b/testing/docker-stack-wait.sh @@ -0,0 +1,148 @@ +#!/bin/sh + +# By: Brandon Mitchell <public@bmitch.net> +# License: MIT +# Source repo: https://github.com/sudo-bmitch/docker-stack-wait + +set -e +trap "{ exit 1; }" TERM INT +opt_h=0 +opt_r=0 +opt_s=5 +opt_t=3600 +start_epoc=$(date +%s) + +usage() { + echo "$(basename $0) [opts] stack_name" + echo " -f filter: only wait for services matching filter, may be passed multiple" + echo " times, see docker stack services for the filter syntax" + echo " -h: this help message" + echo " -n name: only wait for specific service names, overrides any filters," + echo " may be passed multiple times, do not include the stack name prefix" + echo " -r: treat a rollback as successful" + echo " -s sec: frequency to poll service state (default $opt_s sec)" + echo " -t sec: timeout to stop waiting" + [ "$opt_h" = "1" ] && exit 0 || exit 1 +} +check_timeout() { + # timeout when a timeout is defined and we will exceed the timeout after the + # next sleep completes + if [ "$opt_t" -gt 0 ]; then + cur_epoc=$(date +%s) + cutoff_epoc=$(expr ${start_epoc} + $opt_t - $opt_s) + if [ "$cur_epoc" -gt "$cutoff_epoc" ]; then + echo "Error: Timeout exceeded" + exit 1 + fi + fi +} +get_service_ids() { + if [ -n "$opt_n" ]; then + service_list="" + for name in $opt_n; do + service_list="${service_list:+${service_list} }${stack_name}_${name}" + done + docker service inspect --format '{{.ID}}' ${service_list} + else + docker stack services ${opt_f} -q "${stack_name}" + fi +} +service_state() { + # output the state when it changes from the last state for the service + service=$1 + # strip any invalid chars from service name for caching state + service_safe=$(echo "$service" | sed 's/[^A-Za-z0-9_]/_/g') + state=$2 + if eval [ \"\$cache_${service_safe}\" != \"\$state\" ]; then + echo "Service $service state: $state" + eval cache_${service_safe}=\"\$state\" + fi +} + +while getopts 'f:hn:rs:t:' opt; do + case $opt in + f) opt_f="${opt_f:+${opt_f} }-f $OPTARG";; + h) opt_h=1;; + n) 
opt_n="${opt_n:+${opt_n} } $OPTARG";; + r) opt_r=1;; + s) opt_s="$OPTARG";; + t) opt_t="$OPTARG";; + esac +done +shift $(expr $OPTIND - 1) + +if [ $# -ne 1 -o "$opt_h" = "1" -o "$opt_s" -le "0" ]; then + usage +fi + +stack_name=$1 + +# 0 = running, 1 = success, 2 = error +stack_done=0 +while [ "$stack_done" != "1" ]; do + stack_done=1 + # run get_service_ids outside of the for loop to catch errors + service_ids=$(get_service_ids) + for service_id in ${service_ids}; do + service_done=1 + service=$(docker service inspect --format '{{.Spec.Name}}' "$service_id") + + # hardcode a "new" state when UpdateStatus is not defined + state=$(docker service inspect -f '{{if .UpdateStatus}}{{.UpdateStatus.State}}{{else}}new{{end}}' "$service_id") + + # check for failed update states + case "$state" in + paused|rollback_paused) + service_done=2 + ;; + rollback_*) + if [ "$opt_r" = "0" ]; then + service_done=2 + fi + ;; + esac + + # identify/report current state + if [ "$service_done" != "2" ]; then + replicas=$(docker service ls --format '{{.Replicas}}' --filter "id=$service_id" | cut -d' ' -f1) + current=$(echo "$replicas" | cut -d/ -f1) + target=$(echo "$replicas" | cut -d/ -f2) + if [ "$current" != "$target" ]; then + # actively replicating service + service_done=0 + state="replicating $replicas" + fi + fi + service_state "$service" "$state" + + # check for states that indicate an update is done + if [ "$service_done" = "1" ]; then + case "$state" in + new|completed|rollback_completed) + service_done=1 + ;; + *) + # any other state is unknown, not necessarily finished + service_done=0 + ;; + esac + fi + + # update stack done state + if [ "$service_done" = "2" ]; then + # error condition + stack_done=2 + elif [ "$service_done" = "0" -a "$stack_done" = "1" ]; then + # only go to an updating state if not in an error state + stack_done=0 + fi + done + if [ "$stack_done" = "2" ]; then + echo "Error: This deployment will not complete" + exit 1 + fi + if [ "$stack_done" != "1" ]; then + check_timeout + sleep "${opt_s}" + fi +done diff --git a/gitlab_test.sh b/testing/gitlab_test.sh old mode 100644 new mode 100755 similarity index 50% rename from gitlab_test.sh rename to testing/gitlab_test.sh index 1b0972c92f180d9cf6090c5d457169b5408899ab..51a14fa137fa6b6a4160a91e1444fc8cef52a898 --- a/gitlab_test.sh +++ b/testing/gitlab_test.sh @@ -1,15 +1,41 @@ #!/bin/sh -chmod +x env_setup.sh wait_for_container.sh -./env_setup.sh + +# fetch secrets and write them to their according files +cat $vhr18_db > ../env/vhr18_db.env +cat $vhr18_django > ../env/vhr18_django.env +cat $vhr18_obs > ../env/vhr18_obs.env + +cat $emg_db > ../env/emg_db.env +cat $emg_django > ../env/emg_django.env +cat $emg_obs > ../env/emg_obs.env + + +# use `pvs_testing` bucket instead + +sed -i -e 's/emg-data/pvs_testing/g' ../env/emg.env +sed -i -e 's/vhr18-data/pvs_testing/g' ../env/vhr18.env + +sed -i -e 's/emg-cache/pvs_testing/g' ../env/emg_obs.env +sed -i -e 's/vhr18-cache/pvs_testing/g' ../env/vhr18_obs.env + +# source the env file +set -o allexport +source ../env/emg.env +set +o allexport + mkdir data docker swarm init docker network create -d overlay emg-extnet -docker stack deploy -c docker-compose.emg.yml -c docker-compose.emg.dev.yml -c docker-compose.logging.yml emg-pvs -apk update && apk add bash postgresql-dev gcc python3-dev musl-dev py-pip gdal +docker stack deploy -c ../docker-compose.emg.yml -c ../docker-compose.emg.dev.yml emg-pvs +apk update && apk add bash postgresql-dev gcc python3-dev musl-dev py-pip gdal libffi-dev openssl-dev 
make pip3 install -r requirements.txt -./wait_for_container.sh + +./docker-stack-wait.sh -n renderer -n registrar -n preprocessor emg-pvs + +docker service ls + bash ./registrar_test.sh product_list.csv # docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /var/www/pvs/dev/pvs_instance/manage.py storage create pvs_testing pvs_testing --type swift --storage-auth auth-cloud-ovh # docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /core/registrar.py --objects-prefix "OA/PH1B/0.1/b9/urn:eop:PHR:MULTISPECTRAL_0.5m:DS_PHR1B_201608070959189_FR1_PX_E012N32_0719_00974_4148/0000/PH1B_PHR_FUS_1A_20160807T095918_20160807T095920_TOU_1234_4148.DIMA.tar" -# pytest -s registrar_test.py --name OA/PH1B/0.1/b9/urn:eop:PHR:MULTISPECTRAL_0.5m:DS_PHR1B_201608070959189_FR1_PX_E012N32_0719_00974_4148/0000/PH1B_PHR_FUS_1A_20160807T095918_20160807T095920_TOU_1234_4148.DIMA.tar \ No newline at end of file +# pytest -s registrar_test.py --name OA/PH1B/0.1/b9/urn:eop:PHR:MULTISPECTRAL_0.5m:DS_PHR1B_201608070959189_FR1_PX_E012N32_0719_00974_4148/0000/PH1B_PHR_FUS_1A_20160807T095918_20160807T095920_TOU_1234_4148.DIMA.tar diff --git a/product_list.csv b/testing/product_list.csv similarity index 100% rename from product_list.csv rename to testing/product_list.csv diff --git a/testing/registrar_test.py b/testing/registrar_test.py new file mode 100644 index 0000000000000000000000000000000000000000..54bc3108fc0164c0e62b700c2e62b887650f7435 --- /dev/null +++ b/testing/registrar_test.py @@ -0,0 +1,82 @@ +import os +import csv + +import pytest +import psycopg2 +import paramiko +from dotenv import load_dotenv +from xml.etree import ElementTree + + +@pytest.fixture(scope="session") +def connection(): + load_dotenv(dotenv_path='../env/emg_db.env') + + connect_args = dict( + dbname=os.environ['DB_NAME'], + user=os.environ['DB_USER'], + password=f"\"{os.environ['DB_PW']}\"", + host='docker', + port=os.environ['DB_PORT'], + ) + + with psycopg2.connect(**connect_args) as connection: + yield connection + + +@pytest.fixture +def identifiers(): + with open('./product_list.csv') as f: + yield csv.reader(f) + + +@pytest.fixture +def sftp_connection(): + + transport = paramiko.Transport(('docker',2222)) + transport.connect(username='eox', password='password') + with paramiko.SFTPClient.from_transport(transport) as sftp: + yield sftp + +def query_eo_object(connection, eo_id): + query = f"SELECT identifier FROM coverages_eoobject WHERE identifier = '{eo_id}';" + with connection.cursor() as cursor: + cursor.execute(query) + return cursor.fetchone()[0] + + +def test_db_name(connection, identifiers): + for row in identifiers: + identifier = row[0].split('/')[4] + query_eo_object(connection, identifier) + + +def compare_links(sftp, product_xml, product): + + report= sftp.file("data/to/panda/%s" % product_xml) + + + xml_file= report.read() + root = ElementTree.fromstring(xml_file.decode('utf-8').strip()) + + urls = root.findall('{http://www.telespazio.com/CSCDA/CDD/PDAS}URL') + wms_link = urls[0].find('{http://www.telespazio.com/CSCDA/CDD/PDAS}URL').text + wcs_link = urls[1].find('{http://www.telespazio.com/CSCDA/CDD/PDAS}URL').text + + wms_capabilities = 'emg.pass.copernicus.eu/ows?service=wms&request=GetCapabilities&&cql=identifier=' + wcs_capabilities = 'emg.pass.copernicus.eu/ows?service=wcs&request=GetCapabilities&&cql=identifier=' + expected_wms_link = '%s"%s"' % (wms_capabilities, product) + expected_wcs_link = '%s"%s"' % (wcs_capabilities, product) + assert expected_wms_link.replace('&&', '&') == wms_link + + 
assert expected_wcs_link.replace('&&', '&') == wcs_link + +def test_reporting(sftp_connection, identifiers): + report_list = sftp_connection.listdir('data/to/panda/') + assert len(report_list) > 0 + for item in report_list: + for row in identifiers: + identifier = row[0].split('/')[4] + + if identifier in item: + compare_links(sftp_connection, item, identifier) \ No newline at end of file diff --git a/testing/registrar_test.sh b/testing/registrar_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..3c26bc87dea352ee81cea9b046f0ff2602de11f6 --- /dev/null +++ b/testing/registrar_test.sh @@ -0,0 +1,16 @@ +#!/bin/bash +product_list_file=$1 +docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /var/www/pvs/dev/pvs_instance/manage.py storage create pvs_testing pvs_testing --type swift --storage-auth auth-cloud-ovh +IFS="," + +while read product; do + docker exec -i $(docker ps -qf "name=emg-pvs_registrar") \ + python3 /registrar.py \ + --objects-prefix $product \ + --service-url $SERVICE_URL \ + --reporting-dir "/mnt/reports" \ + <<<$product + +done < "$product_list_file" + +pytest #-s registrar_test.py --name $product_list_file diff --git a/requirements.txt b/testing/requirements.txt similarity index 72% rename from requirements.txt rename to testing/requirements.txt index 56ba698664ddb78df20e57a7c6159d1da82feba9..b6db4b77a42837295c7f2c82dfad95859ea34b56 100644 --- a/requirements.txt +++ b/testing/requirements.txt @@ -1,4 +1,6 @@ pytest psycopg2 +python-dotenv +paramiko # python-swiftclient # python-keystoneclient \ No newline at end of file diff --git a/wait_for_container.sh b/wait_for_container.sh deleted file mode 100755 index 8a6b5f97f19a839f79fc748e98fdc6c48c0bef61..0000000000000000000000000000000000000000 --- a/wait_for_container.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -while [ -z $(docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /var/www/pvs/dev/pvs_instance/manage.py id list) ]; do - >&2 echo "Collection is not created yet - sleeping" - sleep 20 - done - -while [ -z $(docker exec -i $(docker ps -qf "name=emg-pvs_registrar") python3 /var/www/pvs/dev/pvs_instance/manage.py id list) ]; do - >&2 echo "Collection is not created yet - sleeping" - sleep 20 - done
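
For reviewers who want to exercise the new `preprocessor.transfer` package outside the daemon, below is a minimal usage sketch of the `get_downloader`/`get_uploader` factories with the `local` backend. It assumes the package from this merge request is installed (see the new `setup.py`); all directory and file names are illustrative scratch locations, not paths used by the stack itself.

```python
import os
from tempfile import mkdtemp

from preprocessor.transfer import get_downloader, get_uploader

# illustrative scratch locations standing in for object storage and a work dir
storage = mkdtemp(prefix='fake_object_storage_')
work = mkdtemp(prefix='work_')

# stand-in for a preprocessed raster
src = os.path.join(work, 'result.tif')
open(src, 'wb').close()

uploader = get_uploader('local', None, {'storage_path': storage})
downloader = get_downloader('local', None, {'storage_path': storage})

# copies the file to <storage>/collection/product_A/result.tif
uploaded = uploader.upload(src, 'collection/product_A')

# the local backend reports a product as existing only once at least two files are present
print(uploader.product_exists('collection/product_A'))  # False for the single file above

# fetch the object back into the working directory
local_copy = downloader.download('collection/product_A/result.tif', work)
print(uploaded, local_copy)
```

The swift backend exposes the same `download`/`upload`/`product_exists` interface (see `transfer/abc.py`), so the daemon can switch between the two purely via its configuration.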