From 6b1b7cc431165e78c0a91a9c41ddf398ff7da75d Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 29 Jun 2023 13:51:38 -0700 Subject: [PATCH 01/70] Separated NTS backends --- data-access/nexustiles/AbstractTileService.py | 329 +++++ data-access/nexustiles/backends/__init__.py | 0 .../backends/nexusproto/__init__.py | 0 .../nexustiles/backends/nexusproto/backend.py | 566 ++++++++ .../backends/nexusproto/config/datastores.ini | 36 + .../nexusproto/config/datastores.ini.default | 39 + .../backends/nexusproto/dao/CassandraProxy.py | 317 +++++ .../backends/nexusproto/dao/DynamoProxy.py | 146 ++ .../nexusproto/dao/ElasticsearchProxy.py | 1235 +++++++++++++++++ .../backends/nexusproto/dao/S3Proxy.py | 141 ++ .../backends/nexusproto/dao/SolrProxy.py | 731 ++++++++++ .../backends/nexusproto/dao/__init__.py | 14 + .../nexustiles/backends/zarr/__init__.py | 0 data-access/nexustiles/nexustiles.py | 97 +- 14 files changed, 3564 insertions(+), 87 deletions(-) create mode 100644 data-access/nexustiles/AbstractTileService.py create mode 100644 data-access/nexustiles/backends/__init__.py create mode 100644 data-access/nexustiles/backends/nexusproto/__init__.py create mode 100644 data-access/nexustiles/backends/nexusproto/backend.py create mode 100644 data-access/nexustiles/backends/nexusproto/config/datastores.ini create mode 100644 data-access/nexustiles/backends/nexusproto/config/datastores.ini.default create mode 100644 data-access/nexustiles/backends/nexusproto/dao/CassandraProxy.py create mode 100644 data-access/nexustiles/backends/nexusproto/dao/DynamoProxy.py create mode 100644 data-access/nexustiles/backends/nexusproto/dao/ElasticsearchProxy.py create mode 100644 data-access/nexustiles/backends/nexusproto/dao/S3Proxy.py create mode 100644 data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py create mode 100644 data-access/nexustiles/backends/nexusproto/dao/__init__.py create mode 100644 data-access/nexustiles/backends/zarr/__init__.py diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py new file mode 100644 index 00000000..f4f4449c --- /dev/null +++ b/data-access/nexustiles/AbstractTileService.py @@ -0,0 +1,329 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import configparser +import logging +import sys +import json +from abc import ABC, abstractmethod +from datetime import datetime +from functools import reduce + +import numpy as np +import numpy.ma as ma +import pkg_resources +from pytz import timezone, UTC +from shapely.geometry import MultiPolygon, box + +from .dao import CassandraProxy +from .dao import DynamoProxy +from .dao import S3Proxy +from .dao import SolrProxy +from .dao import ElasticsearchProxy + +from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable +from nexustiles.nexustiles import NexusTileServiceException + +class AbstractTileService(ABC): + @abstractmethod + def get_dataseries_list(self, simple=False): + raise NotImplementedError() + + @abstractmethod + def find_tile_by_id(self, tile_id, **kwargs): + raise NotImplementedError() + + @abstractmethod + def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + raise NotImplementedError() + + @abstractmethod + def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, + metrics_callback=None, **kwargs): + raise NotImplementedError() + + @abstractmethod + def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): + """ + Given a bounding polygon, dataset, and day of year, find tiles in that dataset with the same bounding + polygon and the closest day of year. + + For example: + given a polygon minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; and day of year=32 + search for first tile in MY_DS with identical bbox and day_of_year <= 32 (sorted by day_of_year desc) + + Valid matches: + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 + + Invalid matches: + minx=1, miny=0, maxx=2, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_OTHER_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 if minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 also exists + + :param bounding_polygon: The exact bounding polygon of tiles to search for + :param ds: The dataset name being searched + :param day_of_year: Tile day of year to search for, tile nearest to this day (without going over) will be returned + :return: List of one tile from ds with bounding_polygon on or before day_of_year or raise NexusTileServiceException if no tile found + """ + raise NotImplementedError() + + @abstractmethod + def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall in the given box in the Solr index + raise NotImplementedError() + + @abstractmethod + def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall within the polygon in the Solr index + raise NotImplementedError() + + @abstractmethod + def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): + """ + Return list of tiles whose metadata matches the specified metadata, start_time, end_time. 
+ :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tiles + """ + raise NotImplementedError() + + @abstractmethod + def get_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): + """ + Return list of tiles that matches the specified metadata, start_time, end_time with tile data outside of time + range properly masked out. + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tiles + """ + raise NotImplementedError() + + @abstractmethod + def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs): + """ + The method will return tiles with the exact given bounds within the time range. It differs from + find_tiles_in_polygon in that only tiles with exactly the given bounds will be returned as opposed to + doing a polygon intersection with the given bounds. + + :param bounds: (minx, miny, maxx, maxy) bounds to search for + :param ds: Dataset name to search + :param start_time: Start time to search (seconds since epoch) + :param end_time: End time to search (seconds since epoch) + :param kwargs: fetch_data: True/False = whether or not to retrieve tile data + :return: + """ + raise NotImplementedError() + + @abstractmethod + def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_tiles_bounded_by_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, + **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time=-1, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_min_max_time_by_granule(self, ds, granule_name): + raise NotImplementedError() + + @abstractmethod + def get_dataset_overall_stats(self, ds): + raise NotImplementedError() + + @abstractmethod + def get_tiles_bounded_by_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_tiles_bounded_by_polygon_at_time(self, polygon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_stats_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + @abstractmethod + def get_bounding_box(self, tile_ids): + """ + Retrieve a bounding box that encompasses all of the tiles represented by the given tile ids. + :param tile_ids: List of tile ids + :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles + """ + raise NotImplementedError() + + @abstractmethod + def get_min_time(self, tile_ids, ds=None): + """ + Get the minimum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. 
Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + raise NotImplementedError() + + @abstractmethod + def get_max_time(self, tile_ids, ds=None): + """ + Get the maximum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + raise NotImplementedError() + + @abstractmethod + def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): + """ + Get a list of distinct tile bounding boxes from all tiles within the given polygon and time range. + :param bounding_polygon: The bounding polygon of tiles to search for + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of distinct bounding boxes (as shapely polygons) for tiles in the search polygon + """ + raise NotImplementedError() + + def mask_tiles_to_bbox(self, min_lat, max_lat, min_lon, max_lon, tiles): + for tile in tiles: + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. + if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def mask_tiles_to_bbox_and_time(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles): + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def mask_tiles_to_polygon(self, bounding_polygon, tiles): + + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) + + def mask_tiles_to_polygon_and_time(self, bounding_polygon, start_time, end_time, tiles): + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles) + + def mask_tiles_to_time_range(self, start_time, end_time, tiles): + """ + Masks data in tiles to specified time range. 
+ :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param tiles: List of tiles + :return: A list tiles with data masked to specified time range + """ + if 0 <= start_time <= end_time: + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. + if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + @abstractmethod + def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): + """ + Return number of tiles that match search criteria. + :param ds: The dataset name to search + :param bounding_polygon: The polygon to search for tiles + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :return: number of tiles that match search criteria + """ + raise NotImplementedError() + + @abstractmethod + def fetch_data_for_tiles(self, *tiles): + raise NotImplementedError() + + @abstractmethod + def open_dataset(self, dataset): + raise NotImplementedError() + + @abstractmethod + def _metadata_store_docs_to_tiles(self, *store_docs): + raise NotImplementedError() + diff --git a/data-access/nexustiles/backends/__init__.py b/data-access/nexustiles/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data-access/nexustiles/backends/nexusproto/__init__.py b/data-access/nexustiles/backends/nexusproto/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py new file mode 100644 index 00000000..86d5ca6a --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -0,0 +1,566 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import configparser +import logging +import sys +import json +from datetime import datetime +from functools import reduce + +import numpy as np +import numpy.ma as ma +import pkg_resources +from pytz import timezone, UTC +from shapely.geometry import MultiPolygon, box + +from .dao import CassandraProxy +from .dao import DynamoProxy +from .dao import S3Proxy +from .dao import SolrProxy +from .dao import ElasticsearchProxy + +from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable +from nexustiles.nexustiles import NexusTileServiceException + +EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) +logger = logging.getLogger("testing") + + +class NexusprotoTileService(object): + def __init__(self, skipDatastore=False, skipMetadatastore=False, config=None): + self._datastore = None + self._metadatastore = None + + self._config = configparser.RawConfigParser() + self._config.read(NexusprotoTileService._get_config_files('config/datastores.ini')) + + if config: + self.override_config(config) + + if not skipDatastore: + datastore = self._config.get("datastore", "store") + if datastore == "cassandra": + self._datastore = CassandraProxy.CassandraProxy(self._config) + elif datastore == "s3": + self._datastore = S3Proxy.S3Proxy(self._config) + elif datastore == "dynamo": + self._datastore = DynamoProxy.DynamoProxy(self._config) + else: + raise ValueError("Error reading datastore from config file") + + if not skipMetadatastore: + metadatastore = self._config.get("metadatastore", "store", fallback='solr') + if metadatastore == "solr": + self._metadatastore = SolrProxy.SolrProxy(self._config) + elif metadatastore == "elasticsearch": + self._metadatastore = ElasticsearchProxy.ElasticsearchProxy(self._config) + + def override_config(self, config): + for section in config.sections(): + if self._config.has_section(section): # only override preexisting section, ignores the other + for option in config.options(section): + if config.get(section, option) is not None: + self._config.set(section, option, config.get(section, option)) + + def get_dataseries_list(self, simple=False): + if simple: + return self._metadatastore.get_data_series_list_simple() + else: + return self._metadatastore.get_data_series_list() + + def find_tile_by_id(self, tile_id, **kwargs): + return self._metadatastore.find_tile_by_id(tile_id) + + def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + return self._metadatastore.find_tiles_by_id(tile_ids, ds=ds, **kwargs) + + def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, + metrics_callback=None, **kwargs): + start = datetime.now() + result = self._metadatastore.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, dataset, start_time, + end_time, + **kwargs) + duration = (datetime.now() - start).total_seconds() + if metrics_callback: + metrics_callback(solr=duration) + return result + + def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): + """ + Given a bounding polygon, dataset, and day of year, find tiles in that dataset with the same bounding + polygon and the closest day of year. 
+ + For example: + given a polygon minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; and day of year=32 + search for first tile in MY_DS with identical bbox and day_of_year <= 32 (sorted by day_of_year desc) + + Valid matches: + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 + + Invalid matches: + minx=1, miny=0, maxx=2, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_OTHER_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 if minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 also exists + + :param bounding_polygon: The exact bounding polygon of tiles to search for + :param ds: The dataset name being searched + :param day_of_year: Tile day of year to search for, tile nearest to this day (without going over) will be returned + :return: List of one tile from ds with bounding_polygon on or before day_of_year or raise NexusTileServiceException if no tile found + """ + try: + tile = self._metadatastore.find_tile_by_polygon_and_most_recent_day_of_year(bounding_polygon, ds, + day_of_year) + except IndexError: + raise NexusTileServiceException("No tile found.").with_traceback(sys.exc_info()[2]) + + return tile + + def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + return self._metadatastore.find_all_tiles_in_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, + rows=5000, + **kwargs) + + def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): + return self._metadatastore.find_all_tiles_in_polygon_at_time(bounding_polygon, dataset, time, rows=5000, + **kwargs) + + def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall in the given box in the Solr index + if type(start_time) is datetime: + start_time = (start_time - EPOCH).total_seconds() + if type(end_time) is datetime: + end_time = (end_time - EPOCH).total_seconds() + return self._metadatastore.find_all_tiles_in_box_sorttimeasc(min_lat, max_lat, min_lon, max_lon, ds, start_time, + end_time, **kwargs) + + def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall within the polygon in the Solr index + if 'sort' in list(kwargs.keys()): + tiles = self._metadatastore.find_all_tiles_in_polygon(bounding_polygon, ds, start_time, end_time, **kwargs) + else: + tiles = self._metadatastore.find_all_tiles_in_polygon_sorttimeasc(bounding_polygon, ds, start_time, + end_time, + **kwargs) + return tiles + + def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): + """ + Return list of tiles whose metadata matches the specified metadata, start_time, end_time. + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tiles + """ + tiles = self._metadatastore.find_all_tiles_by_metadata(metadata, ds, start_time, end_time, **kwargs) + + return tiles + + def get_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): + """ + Return list of tiles that matches the specified metadata, start_time, end_time with tile data outside of time + range properly masked out. 
+ :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tiles + """ + tiles = self.find_tiles_by_metadata(metadata, ds, start_time, end_time, **kwargs) + tiles = self.mask_tiles_to_time_range(start_time, end_time, tiles) + + return tiles + + def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs): + """ + The method will return tiles with the exact given bounds within the time range. It differs from + find_tiles_in_polygon in that only tiles with exactly the given bounds will be returned as opposed to + doing a polygon intersection with the given bounds. + + :param bounds: (minx, miny, maxx, maxy) bounds to search for + :param ds: Dataset name to search + :param start_time: Start time to search (seconds since epoch) + :param end_time: End time to search (seconds since epoch) + :param kwargs: fetch_data: True/False = whether or not to retrieve tile data + :return: + """ + tiles = self._metadatastore.find_tiles_by_exact_bounds(bounds[0], bounds[1], bounds[2], bounds[3], ds, + start_time, + end_time) + return tiles + + def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + return self._metadatastore.find_all_boundary_tiles_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, + rows=5000, + **kwargs) + + def get_tiles_bounded_by_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, + **kwargs): + tiles = self.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs) + tiles = self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) + if 0 <= start_time <= end_time: + tiles = self.mask_tiles_to_time_range(start_time, end_time, tiles) + + return tiles + + def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time=-1, **kwargs): + tiles = self.find_tiles_in_polygon(polygon, ds, start_time, end_time, + **kwargs) + tiles = self.mask_tiles_to_polygon(polygon, tiles) + if 0 <= start_time <= end_time: + tiles = self.mask_tiles_to_time_range(start_time, end_time, tiles) + + return tiles + + def get_min_max_time_by_granule(self, ds, granule_name): + start_time, end_time = self._metadatastore.find_min_max_date_from_granule(ds, granule_name) + + return start_time, end_time + + def get_dataset_overall_stats(self, ds): + return self._metadatastore.get_data_series_stats(ds) + + def get_tiles_bounded_by_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + tiles = self.find_all_tiles_in_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs) + tiles = self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, time, time, tiles) + + return tiles + + def get_tiles_bounded_by_polygon_at_time(self, polygon, dataset, time, **kwargs): + tiles = self.find_all_tiles_in_polygon_at_time(polygon, dataset, time, **kwargs) + tiles = self.mask_tiles_to_polygon_and_time(polygon, time, time, tiles) + + return tiles + + def get_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + tiles = self.find_all_boundary_tiles_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs) + tiles = self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, time, time, tiles) + + return tiles + + def get_stats_within_box_at_time(self, 
min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + tiles = self._metadatastore.find_all_tiles_within_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, + **kwargs) + + return tiles + + def get_bounding_box(self, tile_ids): + """ + Retrieve a bounding box that encompasses all of the tiles represented by the given tile ids. + :param tile_ids: List of tile ids + :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles + """ + tiles = self.find_tiles_by_id(tile_ids, fl=['tile_min_lat', 'tile_max_lat', 'tile_min_lon', 'tile_max_lon'], + fetch_data=False, rows=len(tile_ids)) + polys = [] + for tile in tiles: + polys.append(box(tile.bbox.min_lon, tile.bbox.min_lat, tile.bbox.max_lon, tile.bbox.max_lat)) + return box(*MultiPolygon(polys).bounds) + + def get_min_time(self, tile_ids, ds=None): + """ + Get the minimum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + min_time = self._metadatastore.find_min_date_from_tiles(tile_ids, ds=ds) + return int((min_time - EPOCH).total_seconds()) + + def get_max_time(self, tile_ids, ds=None): + """ + Get the maximum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + max_time = self._metadatastore.find_max_date_from_tiles(tile_ids, ds=ds) + return int((max_time - EPOCH).total_seconds()) + + def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): + """ + Get a list of distinct tile bounding boxes from all tiles within the given polygon and time range. + :param bounding_polygon: The bounding polygon of tiles to search for + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of distinct bounding boxes (as shapely polygons) for tiles in the search polygon + """ + bounds = self._metadatastore.find_distinct_bounding_boxes_in_polygon(bounding_polygon, ds, start_time, end_time) + return [box(*b) for b in bounds] + + def mask_tiles_to_bbox(self, min_lat, max_lat, min_lon, max_lon, tiles): + + for tile in tiles: + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. 
+ if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def mask_tiles_to_bbox_and_time(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles): + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def mask_tiles_to_polygon(self, bounding_polygon, tiles): + + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) + + def mask_tiles_to_polygon_and_time(self, bounding_polygon, start_time, end_time, tiles): + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles) + + def mask_tiles_to_time_range(self, start_time, end_time, tiles): + """ + Masks data in tiles to specified time range. + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param tiles: List of tiles + :return: A list tiles with data masked to specified time range + """ + if 0 <= start_time <= end_time: + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. + if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): + """ + Return number of tiles that match search criteria. 
+ :param ds: The dataset name to search + :param bounding_polygon: The polygon to search for tiles + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :return: number of tiles that match search criteria + """ + return self._metadatastore.get_tile_count(ds, bounding_polygon, start_time, end_time, metadata, **kwargs) + + def fetch_data_for_tiles(self, *tiles): + + nexus_tile_ids = set([tile.tile_id for tile in tiles]) + matched_tile_data = self._datastore.fetch_nexus_tiles(*nexus_tile_ids) + + tile_data_by_id = {str(a_tile_data.tile_id): a_tile_data for a_tile_data in matched_tile_data} + + missing_data = nexus_tile_ids.difference(list(tile_data_by_id.keys())) + if len(missing_data) > 0: + raise Exception("Missing data for tile_id(s) %s." % missing_data) + + for a_tile in tiles: + lats, lons, times, data, meta, is_multi_var = tile_data_by_id[a_tile.tile_id].get_lat_lon_time_data_meta() + + a_tile.latitudes = lats + a_tile.longitudes = lons + a_tile.times = times + a_tile.data = data + a_tile.meta_data = meta + a_tile.is_multi = is_multi_var + + del (tile_data_by_id[a_tile.tile_id]) + + return tiles + + def _metadata_store_docs_to_tiles(self, *store_docs): + + tiles = [] + for store_doc in store_docs: + tile = Tile() + try: + tile.tile_id = store_doc['id'] + except KeyError: + pass + + try: + min_lat = store_doc['tile_min_lat'] + min_lon = store_doc['tile_min_lon'] + max_lat = store_doc['tile_max_lat'] + max_lon = store_doc['tile_max_lon'] + + if isinstance(min_lat, list): + min_lat = min_lat[0] + if isinstance(min_lon, list): + min_lon = min_lon[0] + if isinstance(max_lat, list): + max_lat = max_lat[0] + if isinstance(max_lon, list): + max_lon = max_lon[0] + + tile.bbox = BBox(min_lat, max_lat, min_lon, max_lon) + except KeyError: + pass + + try: + tile.dataset = store_doc['dataset_s'] + except KeyError: + pass + + try: + tile.dataset_id = store_doc['dataset_id_s'] + except KeyError: + pass + + try: + tile.granule = store_doc['granule_s'] + except KeyError: + pass + + try: + tile.min_time = datetime.strptime(store_doc['tile_min_time_dt'], "%Y-%m-%dT%H:%M:%SZ").replace( + tzinfo=UTC) + except KeyError: + pass + + try: + tile.max_time = datetime.strptime(store_doc['tile_max_time_dt'], "%Y-%m-%dT%H:%M:%SZ").replace( + tzinfo=UTC) + except KeyError: + pass + + try: + tile.section_spec = store_doc['sectionSpec_s'] + except KeyError: + pass + + try: + tile.tile_stats = TileStats( + store_doc['tile_min_val_d'], store_doc['tile_max_val_d'], + store_doc['tile_avg_val_d'], store_doc['tile_count_i'] + ) + except KeyError: + pass + + try: + # Ensure backwards compatibility by working with old + # tile_var_name_s and tile_standard_name_s fields to + + # will be overwritten if tile_var_name_ss is present + # as well. 
+ if '[' in store_doc['tile_var_name_s']: + var_names = json.loads(store_doc['tile_var_name_s']) + else: + var_names = [store_doc['tile_var_name_s']] + + standard_name = store_doc.get( + 'tile_standard_name_s', + json.dumps([None] * len(var_names)) + ) + if '[' in standard_name: + standard_names = json.loads(standard_name) + else: + standard_names = [standard_name] + + tile.variables = [] + for var_name, standard_name in zip(var_names, standard_names): + tile.variables.append(TileVariable( + variable_name=var_name, + standard_name=standard_name + )) + except KeyError: + pass + + if 'tile_var_name_ss' in store_doc: + tile.variables = [] + for var_name in store_doc['tile_var_name_ss']: + standard_name_key = f'{var_name}.tile_standard_name_s' + standard_name = store_doc.get(standard_name_key) + tile.variables.append(TileVariable( + variable_name=var_name, + standard_name=standard_name + )) + + tiles.append(tile) + + return tiles + + def pingSolr(self): + status = self._metadatastore.ping() + if status and status["status"] == "OK": + return True + else: + return False + + @staticmethod + def _get_config_files(filename): + log = logging.getLogger(__name__) + candidates = [] + extensions = ['.default', ''] + for extension in extensions: + try: + candidate = pkg_resources.resource_filename(__name__, filename + extension) + log.info('use config file {}'.format(filename + extension)) + candidates.append(candidate) + except KeyError as ke: + log.warning('configuration file {} not found'.format(filename + extension)) + + return candidates diff --git a/data-access/nexustiles/backends/nexusproto/config/datastores.ini b/data-access/nexustiles/backends/nexusproto/config/datastores.ini new file mode 100644 index 00000000..f3facb95 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/config/datastores.ini @@ -0,0 +1,36 @@ +[cassandra] +host=localhost +port=9042 +keyspace=nexustiles +local_datacenter=datacenter1 +protocol_version=3 +dc_policy=WhiteListRoundRobinPolicy +username=cassandra +password=cassandra + +[dynamo] +table=nexus-jpl-table +region=us-west-2 + +[solr] +host=http://localhost:8983 +core=nexustiles + +[s3] +bucket=cdms-dev-zarr +#key=MUR_aggregate/ +#key=MUR_1wk_7_100_100/ +#key=MUR_1wk_7_1500_2500/ +#key=MUR_2017_9dy_7_1500_2500/ +#key=MUR_2017_9dy_7_120_240/ +key=MUR_2017_2yr_30_120_240/ +#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_7_120_240.zarr/ +#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_1_240_240.zarr/ +#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_90_120_240.zarr/ +public=false +region=us-west-2 +profile=saml-pub + +[datastore] +store=cassandra +#store=zarrS3 diff --git a/data-access/nexustiles/backends/nexusproto/config/datastores.ini.default b/data-access/nexustiles/backends/nexusproto/config/datastores.ini.default new file mode 100644 index 00000000..d8db1902 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/config/datastores.ini.default @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[cassandra] +host=localhost +port=9042 +keyspace=nexustiles +local_datacenter=datacenter1 +protocol_version=3 +dc_policy=DCAwareRoundRobinPolicy +username= +password= + +[s3] +bucket=nexus-jpl +region=us-west-2 + +[dynamo] +table=nexus-jpl-table +region=us-west-2 + +[solr] +host=http://localhost:8983 +core=nexustiles + +[datastore] +store=cassandra diff --git a/data-access/nexustiles/backends/nexusproto/dao/CassandraProxy.py b/data-access/nexustiles/backends/nexusproto/dao/CassandraProxy.py new file mode 100644 index 00000000..96f7c4c6 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/CassandraProxy.py @@ -0,0 +1,317 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import uuid +from configparser import NoOptionError + +import nexusproto.DataTile_pb2 as nexusproto +import numpy as np +from cassandra.auth import PlainTextAuthProvider +from cassandra.cqlengine import columns, connection, CQLEngineException +from cassandra.cluster import NoHostAvailable +from cassandra.cqlengine.models import Model +from cassandra.policies import TokenAwarePolicy, DCAwareRoundRobinPolicy, WhiteListRoundRobinPolicy +from multiprocessing.synchronize import Lock +from nexusproto.serialization import from_shaped_array + +INIT_LOCK = Lock(ctx=None) + +logger = logging.getLogger(__name__) + +class NexusTileData(Model): + __table_name__ = 'sea_surface_temp' + tile_id = columns.UUID(primary_key=True) + tile_blob = columns.Blob() + + __nexus_tile = None + + def _get_nexus_tile(self): + if self.__nexus_tile is None: + self.__nexus_tile = nexusproto.TileData.FromString(self.tile_blob) + + return self.__nexus_tile + + def get_raw_data_array(self): + + nexus_tile = self._get_nexus_tile() + the_tile_type = nexus_tile.tile.WhichOneof("tile_type") + + the_tile_data = getattr(nexus_tile.tile, the_tile_type) + + return from_shaped_array(the_tile_data.variable_data) + + def get_lat_lon_time_data_meta(self): + """ + Retrieve data from data store and metadata from metadata store + for this tile. For gridded tiles, the tile shape of the data + will match the input shape. For example, if the input was a + 30x30 tile, all variables will also be 30x30. However, if the + tile is a swath tile, the data will be transformed along the + diagonal of the data matrix. For example, a 30x30 tile would + become 900x900 where the 900 points are along the diagonal. 
+ + Multi-variable tile will also include an extra dimension in the + data array. For example, a 30 x 30 x 30 array would be + transformed to N x 30 x 30 x 30 where N is the number of + variables in this tile. + + latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data, is_multi_var + + :return: latitude data + :return: longitude data + :return: time data + :return: data + :return: meta data dictionary + :return: boolean flag, True if this tile has more than one variable + """ + is_multi_var = False + + if self._get_nexus_tile().HasField('grid_tile'): + grid_tile = self._get_nexus_tile().grid_tile + + grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) + latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) + longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) + + if len(grid_tile_data.shape) == 2: + grid_tile_data = grid_tile_data[np.newaxis, :] + + # Extract the meta data + meta_data = {} + for meta_data_obj in grid_tile.meta_data: + name = meta_data_obj.name + meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + if len(meta_array.shape) == 2: + meta_array = meta_array[np.newaxis, :] + meta_data[name] = meta_array + + return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data, is_multi_var + elif self._get_nexus_tile().HasField('swath_tile'): + swath_tile = self._get_nexus_tile().swath_tile + + latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) + longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) + time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) + + # Simplify the tile if the time dimension is the same value repeated + if np.all(time_data == np.min(time_data)): + time_data = np.array([np.min(time_data)]) + + swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) + + tile_data = self._to_standard_index(swath_tile_data, + (len(time_data), len(latitude_data), len(longitude_data))) + + # Extract the meta data + meta_data = {} + for meta_data_obj in swath_tile.meta_data: + name = meta_data_obj.name + actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) + meta_data[name] = reshaped_meta_array + + return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var + elif self._get_nexus_tile().HasField('time_series_tile'): + time_series_tile = self._get_nexus_tile().time_series_tile + + time_series_tile_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.variable_data)) + time_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.time)).reshape(-1) + latitude_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.latitude)) + longitude_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.longitude)) + + reshaped_array = np.ma.masked_all((len(time_data), len(latitude_data), len(longitude_data))) + idx = np.arange(len(latitude_data)) + reshaped_array[:, idx, idx] = time_series_tile_data + tile_data = reshaped_array + # Extract the meta data + meta_data = {} + for meta_data_obj in time_series_tile.meta_data: + name = meta_data_obj.name + meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + + reshaped_meta_array = np.ma.masked_all((len(time_data), len(latitude_data), len(longitude_data))) + idx = 
np.arange(len(latitude_data)) + reshaped_meta_array[:, idx, idx] = meta_array + + meta_data[name] = reshaped_meta_array + + return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var + elif self._get_nexus_tile().HasField('swath_multi_variable_tile'): + swath_tile = self._get_nexus_tile().swath_multi_variable_tile + is_multi_var = True + + latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) + longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) + time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) + + # Simplify the tile if the time dimension is the same value repeated + if np.all(time_data == np.min(time_data)): + time_data = np.array([np.min(time_data)]) + + swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) + + desired_shape = ( + len(time_data), + len(latitude_data), + len(longitude_data), + ) + tile_data = self._to_standard_index(swath_tile_data, desired_shape, is_multi_var=True) + + # Extract the meta data + meta_data = {} + for meta_data_obj in swath_tile.meta_data: + name = meta_data_obj.name + actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) + meta_data[name] = reshaped_meta_array + + return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var + elif self._get_nexus_tile().HasField('grid_multi_variable_tile'): + grid_multi_variable_tile = self._get_nexus_tile().grid_multi_variable_tile + is_multi_var = True + + grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.variable_data)) + latitude_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.latitude)) + longitude_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.longitude)) + + # If there are 3 dimensions, that means the time dimension + # was squeezed. Add back in + if len(grid_tile_data.shape) == 3: + grid_tile_data = np.expand_dims(grid_tile_data, axis=1) + # If there are 4 dimensions, that means the time dimension + # is present. Move the multivar dimension. 
+ if len(grid_tile_data.shape) == 4: + grid_tile_data = np.moveaxis(grid_tile_data, -1, 0) + + # Extract the meta data + meta_data = {} + for meta_data_obj in grid_multi_variable_tile.meta_data: + name = meta_data_obj.name + meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + if len(meta_array.shape) == 2: + meta_array = meta_array[np.newaxis, :] + meta_data[name] = meta_array + + return latitude_data, longitude_data, np.array([grid_multi_variable_tile.time]), grid_tile_data, meta_data, is_multi_var + else: + raise NotImplementedError("Only supports grid_tile, swath_tile, swath_multi_variable_tile, and time_series_tile") + + @staticmethod + def _to_standard_index(data_array, desired_shape, is_multi_var=False): + """ + Transform swath data to a standard format where data runs along + diagonal of ND matrix and the non-diagonal data points are + masked + + :param data_array: The data array to be transformed + :param desired_shape: The desired shape of the resulting array + :param is_multi_var: True if this is a multi-variable tile + :type data_array: np.array + :type desired_shape: tuple + :type is_multi_var: bool + :return: Reshaped array + :rtype: np.array + """ + + reshaped_array = [] + if is_multi_var: + reshaped_data_array = np.moveaxis(data_array, -1, 0) + else: + reshaped_data_array = [data_array] + + for variable_data_array in reshaped_data_array: + if desired_shape[0] == 1: + variable_reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) + else: + variable_reshaped_array = np.ma.masked_all(desired_shape) + + row, col = np.indices(variable_data_array.shape) + + variable_reshaped_array[ + np.diag_indices(desired_shape[1], len(variable_reshaped_array.shape))] = \ + variable_data_array[ + row.flat, col.flat] + variable_reshaped_array.mask[ + np.diag_indices(desired_shape[1], len(variable_reshaped_array.shape))] = \ + variable_data_array.mask[ + row.flat, col.flat] + + if desired_shape[0] == 1: + reshaped_array.append(variable_reshaped_array[np.newaxis, :]) + else: + reshaped_array.append(variable_reshaped_array) + + if not is_multi_var: + # If single var, squeeze extra dim out of array + reshaped_array = reshaped_array[0] + + return reshaped_array + + +class CassandraProxy(object): + def __init__(self, config): + self.config = config + self.__cass_url = config.get("cassandra", "host") + self.__cass_username = config.get("cassandra", "username") + self.__cass_password = config.get("cassandra", "password") + self.__cass_keyspace = config.get("cassandra", "keyspace") + self.__cass_local_DC = config.get("cassandra", "local_datacenter") + self.__cass_protocol_version = config.getint("cassandra", "protocol_version") + self.__cass_dc_policy = config.get("cassandra", "dc_policy") + + try: + self.__cass_port = config.getint("cassandra", "port") + except NoOptionError: + self.__cass_port = 9042 + + with INIT_LOCK: + try: + connection.get_cluster() + except CQLEngineException: + self.__open() + + def __open(self): + if self.__cass_dc_policy == 'DCAwareRoundRobinPolicy': + dc_policy = DCAwareRoundRobinPolicy(self.__cass_local_DC) + token_policy = TokenAwarePolicy(dc_policy) + elif self.__cass_dc_policy == 'WhiteListRoundRobinPolicy': + token_policy = WhiteListRoundRobinPolicy([self.__cass_url]) + + if self.__cass_username and self.__cass_password: + auth_provider = PlainTextAuthProvider(username=self.__cass_username, password=self.__cass_password) + else: + auth_provider = None + try: + connection.setup( + [host for host in self.__cass_url.split(',')], 
self.__cass_keyspace, + protocol_version=self.__cass_protocol_version, load_balancing_policy=token_policy, + port=self.__cass_port, + auth_provider=auth_provider + ) + except NoHostAvailable as e: + logger.error("Cassandra is not accessible, SDAP will not server local datasets", e) + + def fetch_nexus_tiles(self, *tile_ids): + tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if + (isinstance(tile_id, str) or isinstance(tile_id, str))] + + res = [] + for tile_id in tile_ids: + filterResults = NexusTileData.objects.filter(tile_id=tile_id) + if len(filterResults) > 0: + res.append(filterResults[0]) + + return res diff --git a/data-access/nexustiles/backends/nexusproto/dao/DynamoProxy.py b/data-access/nexustiles/backends/nexusproto/dao/DynamoProxy.py new file mode 100644 index 00000000..1ee70ac1 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/DynamoProxy.py @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import uuid +import nexusproto.DataTile_pb2 as nexusproto +from nexusproto.serialization import from_shaped_array +import numpy as np +import boto3 + +class NexusTileData(object): + __nexus_tile = None + __data = None + tile_id = None + + def __init__(self, data, _tile_id): + if self.__data is None: + self.__data = data + if self.tile_id is None: + self.tile_id = _tile_id + + def _get_nexus_tile(self): + if self.__nexus_tile is None: + self.__nexus_tile = nexusproto.TileData.FromString(self.__data) + + return self.__nexus_tile + + def get_raw_data_array(self): + + nexus_tile = self._get_nexus_tile() + the_tile_type = nexus_tile.tile.WhichOneof("tile_type") + + the_tile_data = getattr(nexus_tile.tile, the_tile_type) + + return from_shaped_array(the_tile_data.variable_data) + + def get_lat_lon_time_data_meta(self): + if self._get_nexus_tile().HasField('grid_tile'): + grid_tile = self._get_nexus_tile().grid_tile + + grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) + latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) + longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) + + if len(grid_tile_data.shape) == 2: + grid_tile_data = grid_tile_data[np.newaxis, :] + + # Extract the meta data + meta_data = {} + for meta_data_obj in grid_tile.meta_data: + name = meta_data_obj.name + meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + if len(meta_array.shape) == 2: + meta_array = meta_array[np.newaxis, :] + meta_data[name] = meta_array + + return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data + elif self._get_nexus_tile().HasField('swath_tile'): + swath_tile = self._get_nexus_tile().swath_tile + + latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) 
+ longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) + time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) + + # Simplify the tile if the time dimension is the same value repeated + if np.all(time_data == np.min(time_data)): + time_data = np.array([np.min(time_data)]) + + swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) + + tile_data = self._to_standard_index(swath_tile_data, + (len(time_data), len(latitude_data), len(longitude_data))) + + # Extract the meta data + meta_data = {} + for meta_data_obj in swath_tile.meta_data: + name = meta_data_obj.name + actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) + meta_data[name] = reshaped_meta_array + + return latitude_data, longitude_data, time_data, tile_data, meta_data + else: + raise NotImplementedError("Only supports grid_tile and swath_tile") + + @staticmethod + def _to_standard_index(data_array, desired_shape): + + if desired_shape[0] == 1: + reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) + row, col = np.indices(data_array.shape) + + reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ + row.flat, col.flat] + reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ + row.flat, col.flat] + reshaped_array = reshaped_array[np.newaxis, :] + else: + reshaped_array = np.ma.masked_all(desired_shape) + row, col = np.indices(data_array.shape) + + reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ + row.flat, col.flat] + reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ + row.flat, col.flat] + + return reshaped_array + + +class DynamoProxy(object): + def __init__(self, config): + self.config = config + self.__dynamo_tablename = config.get("dynamo", "table") + self.__dynamo_region = config.get("dynamo", "region") + self.__dynamo = boto3.resource('dynamodb', region_name=self.__dynamo_region) + self.__dynamo_table = self.__dynamo.Table(self.__dynamo_tablename) + self.__nexus_tile = None + + def fetch_nexus_tiles(self, *tile_ids): + + tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if + (isinstance(tile_id, str) or isinstance(tile_id, str))] + res = [] + for tile_id in tile_ids: + response = self.__dynamo_table.get_item( + Key = { + 'tile_id': str(tile_id) + } + ) + item = response['Item'] + data = item['data'].__str__() + nexus_tile = NexusTileData(data, str(tile_id)) + res.append(nexus_tile) + + return res \ No newline at end of file diff --git a/data-access/nexustiles/backends/nexusproto/dao/ElasticsearchProxy.py b/data-access/nexustiles/backends/nexusproto/dao/ElasticsearchProxy.py new file mode 100644 index 00000000..157630f6 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/ElasticsearchProxy.py @@ -0,0 +1,1235 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import logging +import threading +import time +import re +from datetime import datetime +from pytz import timezone, UTC + +import requests +import pysolr +from shapely import wkt +from elasticsearch import Elasticsearch + +ELASTICSEARCH_CON_LOCK = threading.Lock() +thread_local = threading.local() + +EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) +ELASTICSEARCH_FORMAT = '%Y-%m-%dT%H:%M:%SZ' +ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' + + +class ElasticsearchProxy(object): + def __init__(self, config): + self.elasticsearchHosts = config.get("elasticsearch", "host").split(',') + self.elasticsearchIndex = config.get("elasticsearch", "index") + self.elasticsearchUsername = config.get("elasticsearch", "username") + self.elasticsearchPassword = config.get("elasticsearch", "password") + self.logger = logging.getLogger(__name__) + + with ELASTICSEARCH_CON_LOCK: + elasticsearchcon = getattr(thread_local, 'elasticsearchcon', None) + if elasticsearchcon is None: + elasticsearchcon = Elasticsearch(hosts=self.elasticsearchHosts, http_auth=(self.elasticsearchUsername, self.elasticsearchPassword)) + thread_local.elasticsearchcon = elasticsearchcon + + self.elasticsearchcon = elasticsearchcon + + def find_tile_by_id(self, tile_id): + + params = { + "size": 1, + "query": { + "term": { + "id": { + "value": tile_id + } + } + } + } + + results, _, hits = self.do_query(*(None, None, None, True, None), **params) + assert hits == 1, f"Found {hits} results, expected exactly 1" + return [results[0]["_source"]] + + def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + + params = { + "query": { + "bool": { + "filter": [], + "should": [], + "minimum_should_match": 1 + } + } + } + + for tile_id in tile_ids: + params['query']['bool']['should'].append({"term": {"id": {"value": tile_id}}}) + + if ds is not None: + params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) + + self._merge_kwargs(params, **kwargs) + + results = self.do_query_all(*(None, None, None, False, None), **params) + assert len(results) == len(tile_ids), "Found %s results, expected exactly %s" % (len(results), len(tile_ids)) + return results + + def find_min_date_from_tiles(self, tile_ids, ds=None, **kwargs): + params = { + "size": 0, + "query": { + "bool": { + "filter": [], + "should": [] + } + }, + "aggs": { + "min_date_agg": { + "min": { + "field": "tile_min_time_dt" + } + } + } + } + + for tile_id in tile_ids: + params['query']['bool']['should'].append({"term": {"id": {"value": tile_id}}}) + if ds is not None: + params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) + + aggregations = self.do_aggregation(*(None, None, None, True, None), **params) + return self.convert_iso_to_datetime(aggregations['min_date_agg']["value_as_string"]) + + def find_max_date_from_tiles(self, tile_ids, ds=None, **kwargs): + + params = { + "size": 0, + "query": { + "bool": { + "filter": [], + "should": [] + } + }, + "aggs": { + "max_date_agg": { + "max": { + "field": "tile_max_time_dt" + } + } + } + } + + for tile_id in tile_ids: + params['query']['bool']['should'].append({"term": {"id": {"value": 
tile_id}}}) + if ds is not None: + params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) + + aggregations = self.do_aggregation(*(None, None, None, True, None), **params) + return self.convert_iso_to_datetime(aggregations['max_date_agg']["value_as_string"]) + + + def find_min_max_date_from_granule(self, ds, granule_name, **kwargs): + + params = { + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "term": { + "granule_s": { + "value": granule_name + } + } + } + ] + } + }, + "aggs": { + "min_date_agg": { + "max": { + "field": "tile_min_time_dt" + } + }, + "max_date_agg": { + "max": { + "field": "tile_max_time_dt" + } + } + } + } + + self._merge_kwargs(params, **kwargs) + + aggregations = self.do_aggregation(*(None, None, None, False, None), **params) + start_time = self.convert_iso_to_datetime(aggregations['min_date_agg']["value_as_string"]) + end_time = self.convert_iso_to_datetime(aggregations['max_date_agg']["value_as_string"]) + + return start_time, end_time + + def get_data_series_list(self): + + datasets = self.get_data_series_list_simple() + + for dataset in datasets: + min_date = self.find_min_date_from_tiles([], ds=dataset['title']) + max_date = self.find_max_date_from_tiles([], ds=dataset['title']) + dataset['start'] = (min_date - EPOCH).total_seconds() + dataset['end'] = (max_date - EPOCH).total_seconds() + dataset['iso_start'] = min_date.strftime(ISO_8601) + dataset['iso_end'] = max_date.strftime(ISO_8601) + + return datasets + + def get_data_series_list_simple(self): + + params = { + 'size': 0, + "aggs": { + "dataset_list_agg": { + "composite": { + "size":100, + "sources": [ + { + "dataset_s": { + "terms": { + "field": "dataset_s" + } + } + } + ] + } + } + } + } + + aggregations = self.do_aggregation_all(params, 'dataset_list_agg') + l = [] + + for dataset in aggregations: + l.append({ + "shortName": dataset['key']['dataset_s'], + "title": dataset['key']['dataset_s'], + "tileCount": dataset["doc_count"] + }) + + l = sorted(l, key=lambda entry: entry["title"]) + return l + + def get_data_series_stats(self, ds): + + params = { + "size": 0, + "query": { + "term":{ + "dataset_s": { + "value": ds + } + } + }, + "aggs": { + "available_dates": { + "composite": { + "size": 100, + "sources": [ + {"terms_tile_max_time_dt": {"terms": {"field": "tile_max_time_dt"}}} + ] + } + } + } + } + + aggregations = self.do_aggregation_all(params, 'available_dates') + stats = {} + stats['available_dates'] = [] + + for dt in aggregations: + stats['available_dates'].append(dt['key']['terms_tile_max_time_dt'] / 1000) + + stats['available_dates'] = sorted(stats['available_dates']) + + params = { + "size": 0, + "query": { + "term":{ + "dataset_s": { + "value": ds + } + } + }, + "aggs": { + "min_tile_min_val_d": { + "min": { + "field": "tile_min_val_d" + } + }, + "min_tile_max_time_dt": { + "min": { + "field": "tile_max_time_dt" + } + }, + "max_tile_max_time_dt": { + "max": { + "field": "tile_max_time_dt" + } + }, + "max_tile_max_val_d": { + "max": { + "field": "tile_max_val_d" + } + } + } + } + + aggregations = self.do_aggregation(*(None, None, None, False, None), **params) + stats["start"] = int(aggregations["min_tile_max_time_dt"]["value"]) / 1000 + stats["end"] = int(aggregations["max_tile_max_time_dt"]["value"]) / 1000 + stats["minValue"] = aggregations["min_tile_min_val_d"]["value"] + stats["maxValue"] = aggregations["max_tile_max_val_d"]["value"] + + return stats + + # day_of_year_i added (SDAP-347) + def 
find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year): + + max_lat = bounding_polygon.bounds[3] + min_lon = bounding_polygon.bounds[0] + min_lat = bounding_polygon.bounds[1] + max_lon = bounding_polygon.bounds[2] + + params = { + "size": "1", + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] + }, + "relation": "intersects" + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + }, + { + "range": { + "day_of_year_i": { + "lte": day_of_year + } + } + } + ] + } + } + } + result, _, _ = self.do_query(*(None, None, None, True, 'day_of_year_i desc'), **params) + + return [result[0]] + + def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs): + + search_start_s = datetime.utcfromtimestamp(start_time).strftime(ELASTICSEARCH_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(ELASTICSEARCH_FORMAT) + + params = { + "size": "0", + "_source": "tile_min_time_dt", + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "range": { + "tile_min_time_dt": { + "gte": search_start_s, + "lte": search_end_s + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] + }, + "relation": "intersects" + } + } + } + ] + } + }, + "aggs": { + "days_range_agg": { + "composite": { + "size":100, + "sources": [ + { + "tile_min_time_dt": { + "terms": { + "field": "tile_min_time_dt" + } + } + } + ] + } + } + } + } + + aggregations = self.do_aggregation_all(params, 'days_range_agg') + results = [res['key']['tile_min_time_dt'] for res in aggregations] + daysinrangeasc = sorted([(res / 1000) for res in results]) + return daysinrangeasc + + def find_all_tiles_in_box_sorttimeasc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, + end_time=-1, **kwargs): + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] + }, + "relation": "intersects" + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + } + ] + } + } + } + + + if 0 < start_time <= end_time: + params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, 'tile_min_time_dt asc,tile_max_time_dt asc'), **params) + + def find_all_tiles_in_polygon_sorttimeasc(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + nums = re.findall(r'\d+(?:\.\d*)?', bounding_polygon.wkt.rpartition(',')[0]) + polygon_coordinates = list(zip(*[iter(nums)] * 2)) + + max_lat = bounding_polygon.bounds[3] + min_lon = bounding_polygon.bounds[0] + min_lat = bounding_polygon.bounds[1] + max_lon = bounding_polygon.bounds[2] + + params = { + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] + }, + "relation": "intersects" + } + } + } + ] + } + } + } + + try: + if 'fl' in list(kwargs.keys()): + params["_source"] = 
kwargs["fl"].split(',') + except KeyError: + pass + + if 0 < start_time <= end_time: + params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + return self.do_query_all(*(None, None, None, False, 'tile_min_time_dt asc,tile_max_time_dt asc'), **params) + + def find_all_tiles_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + nums = re.findall(r'\d+(?:\.\d*)?', bounding_polygon.wkt.rpartition(',')[0]) + polygon_coordinates = list(zip(*[iter(nums)] * 2)) + + max_lat = bounding_polygon.bounds[3] + min_lon = bounding_polygon.bounds[0] + min_lat = bounding_polygon.bounds[1] + max_lon = bounding_polygon.bounds[2] + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] + }, + "relation": "intersects" + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + } + ] + } + } + } + + try: + if 'fl' in list(kwargs.keys()): + params["_source"] = kwargs["fl"].split(',') + except KeyError: + pass + + if 0 < start_time <= end_time: + params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, None), **params) + + def find_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + tile_max_lat = bounding_polygon.bounds[3] + tile_min_lon = bounding_polygon.bounds[0] + tile_min_lat = bounding_polygon.bounds[1] + tile_max_lon = bounding_polygon.bounds[2] + + params = { + "size": 0, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[tile_min_lon, tile_max_lat], [tile_max_lon, tile_min_lat]] + }, + "relation": "intersects" + } + } + } + ] + } + }, + "aggs": { + "distinct_bounding_boxes": { + "composite": { + "size": 100, + "sources": [ + { + "bounding_box": { + "terms": { + "script": { + "source": "String.valueOf(doc['tile_min_lon'].value) + ', ' + String.valueOf(doc['tile_max_lon'].value) + ', ' + String.valueOf(doc['tile_min_lat'].value) + ', ' + String.valueOf(doc['tile_max_lat'].value)", + "lang": "painless" + } + } + } + } + ] + } + } + } + } + + if 0 < start_time <= end_time: + params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + self._merge_kwargs(params, **kwargs) + aggregations = self.do_aggregation_all(params, 'distinct_bounding_boxes') + distinct_bounds = [] + for agg in aggregations: + coords = agg['key']['bounding_box'].split(',') + min_lon = round(float(coords[0]), 2) + max_lon = round(float(coords[1]), 2) + min_lat = round(float(coords[2]), 2) + max_lat = round(float(coords[3]), 2) + polygon = 'POLYGON((%s %s, %s %s, %s %s, %s %s, %s %s))' % (min_lon, max_lat, min_lon, min_lat, max_lon, min_lat, max_lon, max_lat, min_lon, max_lat) + distinct_bounds.append(wkt.loads(polygon).bounds) + + return distinct_bounds + + def find_tiles_by_exact_bounds(self, minx, miny, maxx, maxy, ds, start_time=0, end_time=-1, **kwargs): + + params = { + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + 
{ + "term": { + "tile_min_lon": { + "value": minx + } + } + }, + { + "term": { + "tile_min_lat": { + "value": miny + } + } + }, + { + "term": { + "tile_max_lon": { + "value": maxx + } + } + }, + { + "term": { + "tile_max_lat": { + "value": maxy + } + } + } + ] + } + }} + + if 0 < start_time <= end_time: + params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, None), **params) + + def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, search_time, **kwargs): + + the_time = datetime.utcfromtimestamp(search_time).strftime(ELASTICSEARCH_FORMAT) + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] + }, + "relation": "intersects" + } + } + }, + { + "range": { + "tile_min_time_dt": { + "lte": the_time + } + } + }, + { + "range": { + "tile_max_time_dt": { + "gte": the_time + } + } + } + ] + } + } + } + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, None), **params) + + def find_all_tiles_in_polygon_at_time(self, bounding_polygon, ds, search_time, **kwargs): + + the_time = datetime.utcfromtimestamp(search_time).strftime(ELASTICSEARCH_FORMAT) + + max_lat = bounding_polygon.bounds[3] + min_lon = bounding_polygon.bounds[0] + min_lat = bounding_polygon.bounds[1] + max_lon = bounding_polygon.bounds[2] + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] + }, + "relation": "intersects" + } + } + }, + { "range": { + "tile_min_time_dt": { + "lte": the_time + } + } }, + { "range": { + "tile_max_time_dt": { + "gte": the_time + } + } } + ] + } + } + } + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, None), **params) + + + def find_all_tiles_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): + + the_time = datetime.utcfromtimestamp(time).strftime(ELASTICSEARCH_FORMAT) + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] + }, + "relation": "within" + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + }, + { + "range": { + "tile_min_time_dt": { + "lte": the_time + } + } + }, + { + "range": { + "tile_max_time_dt": { + "gte": the_time + } + } + } + ] + } + } + } + + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, "product(tile_avg_val_d, tile_count_i),*", None, False, None), **params) + + def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): + + the_time = datetime.utcfromtimestamp(time).strftime(ELASTICSEARCH_FORMAT) + + params = { + "size": 1000, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "geo_shape": { + "geo": { + "shape": { + "type": "multilinestring", + "coordinates": [[[min_lon, max_lat], [max_lon, max_lat], [min_lon, max_lat], 
[min_lon, min_lat], [max_lon, max_lat], [max_lon, min_lat], [min_lon, min_lat], [max_lon, min_lat]]] + }, + "relation": "intersects" + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + }, + { + "range": { + "tile_min_time_dt": { + "lte": the_time + } + } + }, + { + "range": { + "tile_max_time_dt": { + "gte": the_time + } + } + } + ], + "must_not" : { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] + }, + "relation": "within" + } + } + } + } + } + } + + self._merge_kwargs(params, **kwargs) + + return self.do_query_all(*(None, None, None, False, None), **params) + + def find_all_tiles_by_metadata(self, metadata, ds, start_time=0, end_time=-1, **kwargs): + """ + Get a list of tile metadata that matches the specified metadata, start_time, end_time. + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tile metadata + """ + + params = { + "query": { + "bool": { + "must": [ + { + "term": { + "dataset_s": {"value": ds} + } + } + ] + } + } + } + + if len(metadata) > 0: + for key_value in metadata: + key = key_value.split(':')[0] + value = key_value.split(':')[1] + params['query']['bool']['must'].append({"match": {key: value}}) + + if 0 < start_time <= end_time: + params['query']['bool']['should'] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + self._merge_kwargs(params, **kwargs) + return self.do_query_all(*(None, None, None, False, None), **params) + + def get_formatted_time_clause(self, start_time, end_time): + search_start_s = datetime.utcfromtimestamp(start_time).strftime(ELASTICSEARCH_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(ELASTICSEARCH_FORMAT) + + time_clause = [ + { + "range": { + "tile_min_time_dt": { + "lte": search_end_s, + "gte": search_start_s + } + } + }, + { + "range": { + "tile_max_time_dt": { + "lte": search_end_s, + "gte": search_start_s + } + } + }, + { + "bool": { + "must": [ + { + "range": { + "tile_min_time_dt": { + "gte": search_start_s + } + } + }, + { + "range": { + "tile_max_time_dt": { + "lte": search_end_s + } + } + } + ] + } + } + ] + + return time_clause + + def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): + """ + Return number of tiles that match search criteria. 
+ :param ds: The dataset name to search + :param bounding_polygon: The polygon to search for tiles + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :return: number of tiles that match search criteria + """ + + params = { + "size": 0, + "query": { + "bool": { + "filter": [ + { + "term": { + "dataset_s": { + "value": ds + } + } + }, + { + "range": { + "tile_count_i": { + "gte": 1 + } + } + } + ] + } + } + } + + if bounding_polygon: + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + geo_clause = { + "geo_shape": { + "geo": { + "shape": { + "type": "envelope", + "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] + } + } + } + } + + params['query']['bool']['filter'].append(geo_clause) + + if 0 < start_time <= end_time: + params['query']['bool']['should'] = self.get_formatted_time_clause(start_time, end_time) + params["query"]["bool"]["minimum_should_match"] = 1 + + if len(metadata) > 0: + for key_value in metadata: + key = key_value.split(':')[0] + value = key_value.split(':')[1] + params['query']['bool']['filter'].append({"term": {key: {"value": value}}}) + + self._merge_kwargs(params, **kwargs) + _, _, found = self.do_query(*(None, None, None, True, None), **params) + + return found + + def do_aggregation(self, *args, **params): + # Gets raw aggregations + + response = self.do_query_raw(*args, **params) + aggregations = response.get('aggregations', None) + return aggregations + + def do_aggregation_all(self, params, agg_name): + # Used for pagination when results can exceed ES max size (use of after_key) + + with ELASTICSEARCH_CON_LOCK: + response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) + all_buckets = [] + + try: + aggregations = response.get('aggregations', None) + current_buckets = aggregations.get(agg_name, None) + buckets = current_buckets.get('buckets', None) + all_buckets += buckets + after_bucket = current_buckets.get('after_key', None) + + while after_bucket is not None: + for agg in params['aggs']: + params['aggs'][agg]['composite']['after'] = {} + for source in params['aggs'][agg]['composite']['sources']: + key_name = next(iter(source)) + params['aggs'][agg]['composite']['after'][key_name] = after_bucket[key_name] + with ELASTICSEARCH_CON_LOCK: + response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) + + aggregations = response.get('aggregations', None) + current_buckets = aggregations.get(agg_name, None) + buckets = current_buckets.get('buckets', None) + all_buckets += buckets + after_bucket = current_buckets.get('after_key', None) + + except AttributeError as e: + self.logger.error('Error when accessing aggregation buckets - ' + str(e)) + + return all_buckets + + def do_query(self, *args, **params): + response = self.do_query_raw(*args, **params) + return response['hits']['hits'], None, response['hits']['total']['value'] + + def do_query_raw(self, *args, **params): + + if args[4]: + + sort_fields = args[4].split(",") + + if 'sort' not in list(params.keys()): + params["sort"] = [] + + for field in sort_fields: + field_order = field.split(' ') + sort_instruction = {field_order[0]: field_order[1]} + if sort_instruction not in params['sort']: + params["sort"].append(sort_instruction) + with ELASTICSEARCH_CON_LOCK: + response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) + + return response + + def 
do_query_all(self, *args, **params): + # Used to paginate with search_after. + # The method calling this might already have a sort clause, + # so we merge both sort clauses inside do_query_raw + + results = [] + + search = None + + # Add track option to not be blocked at 10000 hits per worker + if 'track_total_hits' not in params.keys(): + params['track_total_hits'] = True + + # Add sort instruction order to paginate the results : + params["sort"] = [ + { "tile_min_time_dt": "asc"}, + { "_id": "asc" } + ] + + response = self.do_query_raw(*args, **params) + results.extend([r["_source"] for r in response["hits"]["hits"]]) + + total_hits = response["hits"]["total"]["value"] + + try: + search_after = [] + for sort_param in response["hits"]["hits"][-1]["sort"]: + search_after.append(str(sort_param)) + except (KeyError, IndexError): + search_after = [] + + try: + while len(results) < total_hits: + params["search_after"] = search_after + response = self.do_query_raw(*args, **params) + results.extend([r["_source"] for r in response["hits"]["hits"]]) + + search_after = [] + for sort_param in response["hits"]["hits"][-1]["sort"]: + search_after.append(str(sort_param)) + + except (KeyError, IndexError): + pass + + return results + + def convert_iso_to_datetime(self, date): + return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=UTC) + + def convert_iso_to_timestamp(self, date): + return (self.convert_iso_to_datetime(date) - EPOCH).total_seconds() + + @staticmethod + def _merge_kwargs(params, **kwargs): + # Only Solr-specific kwargs are parsed + # And the special 'limit' + try: + params['limit'] = kwargs['limit'] + except KeyError: + pass + + try: + params['_route_'] = kwargs['_route_'] + except KeyError: + pass + + try: + params['size'] = kwargs['size'] + except KeyError: + pass + + try: + params['start'] = kwargs['start'] + except KeyError: + pass + + try: + s = kwargs['sort'] if isinstance(kwargs['sort'], list) else [kwargs['sort']] + except KeyError: + s = None + + try: + params['sort'].extend(s) + except KeyError: + if s is not None: + params['sort'] = s diff --git a/data-access/nexustiles/backends/nexusproto/dao/S3Proxy.py b/data-access/nexustiles/backends/nexusproto/dao/S3Proxy.py new file mode 100644 index 00000000..c8d3adfe --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/S3Proxy.py @@ -0,0 +1,141 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
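+
+# Example usage (illustrative sketch only; the bucket name and tile id below are
+# hypothetical). S3Proxy expects a ConfigParser with an [s3] section providing
+# "bucket" and "region", and fetches protobuf-serialized tiles keyed by UUID:
+#
+#   import configparser
+#   config = configparser.ConfigParser()
+#   config.read_string("[s3]\nbucket=my-nexus-tiles\nregion=us-west-2")
+#   proxy = S3Proxy(config)
+#   tile_data = proxy.fetch_nexus_tiles("<tile-uuid>")[0]
+#   lats, lons, times, data, meta = tile_data.get_lat_lon_time_data_meta()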
+ +import uuid + +import boto3 +import nexusproto.DataTile_pb2 as nexusproto +import numpy as np +from nexusproto.serialization import from_shaped_array + + +class NexusTileData(object): + __nexus_tile = None + __data = None + tile_id = None + + def __init__(self, data, _tile_id): + if self.__data is None: + self.__data = data + if self.tile_id is None: + self.tile_id = _tile_id + + def _get_nexus_tile(self): + if self.__nexus_tile is None: + self.__nexus_tile = nexusproto.TileData.FromString(self.__data) + + return self.__nexus_tile + + def get_raw_data_array(self): + + nexus_tile = self._get_nexus_tile() + the_tile_type = nexus_tile.tile.WhichOneof("tile_type") + + the_tile_data = getattr(nexus_tile.tile, the_tile_type) + + return from_shaped_array(the_tile_data.variable_data) + + def get_lat_lon_time_data_meta(self): + if self._get_nexus_tile().HasField('grid_tile'): + grid_tile = self._get_nexus_tile().grid_tile + + grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) + latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) + longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) + + if len(grid_tile_data.shape) == 2: + grid_tile_data = grid_tile_data[np.newaxis, :] + + # Extract the meta data + meta_data = {} + for meta_data_obj in grid_tile.meta_data: + name = meta_data_obj.name + meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + if len(meta_array.shape) == 2: + meta_array = meta_array[np.newaxis, :] + meta_data[name] = meta_array + + return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data + elif self._get_nexus_tile().HasField('swath_tile'): + swath_tile = self._get_nexus_tile().swath_tile + + latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) + longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) + time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) + + # Simplify the tile if the time dimension is the same value repeated + if np.all(time_data == np.min(time_data)): + time_data = np.array([np.min(time_data)]) + + swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) + + tile_data = self._to_standard_index(swath_tile_data, + (len(time_data), len(latitude_data), len(longitude_data))) + + # Extract the meta data + meta_data = {} + for meta_data_obj in swath_tile.meta_data: + name = meta_data_obj.name + actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) + reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) + meta_data[name] = reshaped_meta_array + + return latitude_data, longitude_data, time_data, tile_data, meta_data + else: + raise NotImplementedError("Only supports grid_tile and swath_tile") + + @staticmethod + def _to_standard_index(data_array, desired_shape): + + if desired_shape[0] == 1: + reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) + row, col = np.indices(data_array.shape) + + reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ + row.flat, col.flat] + reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ + row.flat, col.flat] + reshaped_array = reshaped_array[np.newaxis, :] + else: + reshaped_array = np.ma.masked_all(desired_shape) + row, col = np.indices(data_array.shape) + + reshaped_array[np.diag_indices(desired_shape[1], 
len(reshaped_array.shape))] = data_array[ + row.flat, col.flat] + reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ + row.flat, col.flat] + + return reshaped_array + + +class S3Proxy(object): + def __init__(self, config): + self.config = config + self.__s3_bucketname = config.get("s3", "bucket") + self.__s3_region = config.get("s3", "region") + self.__s3 = boto3.resource('s3') + self.__nexus_tile = None + + def fetch_nexus_tiles(self, *tile_ids): + tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if + (isinstance(tile_id, str) or isinstance(tile_id, str))] + res = [] + for tile_id in tile_ids: + obj = self.__s3.Object(self.__s3_bucketname, str(tile_id)) + data = obj.get()['Body'].read() + nexus_tile = NexusTileData(data, str(tile_id)) + res.append(nexus_tile) + + return res diff --git a/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py b/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py new file mode 100644 index 00000000..9b16533d --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py @@ -0,0 +1,731 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
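+
+# Example usage (illustrative sketch only; the dataset name, bounds, and times are
+# hypothetical). SolrProxy expects a ConfigParser with a [solr] section providing
+# "host" and "core", and translates the calls below into Solr queries against the
+# tile metadata index:
+#
+#   proxy = SolrProxy(config)
+#   datasets = proxy.get_data_series_list_simple()
+#   tiles = proxy.find_all_tiles_in_box_at_time(-10.0, 10.0, -120.0, -100.0,
+#                                               'MY_DATASET', 1388534400)
+#   n = proxy.get_tile_count('MY_DATASET', start_time=1388534400, end_time=1391212800)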
+ +import json +import logging +import threading +import time +from datetime import datetime +from pytz import timezone, UTC + +import requests +import pysolr +from shapely import wkt + +SOLR_CON_LOCK = threading.Lock() +thread_local = threading.local() + +EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) +SOLR_FORMAT = '%Y-%m-%dT%H:%M:%SZ' +ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' + + +class SolrProxy(object): + def __init__(self, config): + self.solrUrl = config.get("solr", "host") + self.solrCore = config.get("solr", "core") + solr_kargs = {} + if config.has_option("solr", "time_out"): + solr_kargs["timeout"] = config.get("solr", "time_out") + self.logger = logging.getLogger('nexus') + + with SOLR_CON_LOCK: + solrcon = getattr(thread_local, 'solrcon', None) + if solrcon is None: + solr_url = '%s/solr/%s' % (self.solrUrl, self.solrCore) + self.logger.info("connect to solr, url {} with option(s) = {}".format(solr_url, solr_kargs)) + solrcon = pysolr.Solr(solr_url, **solr_kargs) + thread_local.solrcon = solrcon + + self.solrcon = solrcon + + def find_tile_by_id(self, tile_id): + + search = 'id:%s' % tile_id + + params = { + 'rows': 1 + } + + results, start, found = self.do_query(*(search, None, None, True, None), **params) + + assert len(results) == 1, "Found %s results, expected exactly 1" % len(results) + return [results[0]] + + def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + + if ds is not None: + search = 'dataset_s:%s' % ds + else: + search = '*:*' + + additionalparams = { + 'fq': [ + "{!terms f=id}%s" % ','.join(tile_ids) + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + results = self.do_query_all(*(search, None, None, False, None), **additionalparams) + + assert len(results) == len(tile_ids), "Found %s results, expected exactly %s" % (len(results), len(tile_ids)) + return results + + def find_min_date_from_tiles(self, tile_ids, ds=None, **kwargs): + + if ds is not None: + search = 'dataset_s:%s' % ds + else: + search = '*:*' + + kwargs['rows'] = 1 + kwargs['fl'] = 'tile_min_time_dt' + kwargs['sort'] = ['tile_min_time_dt asc'] + additionalparams = { + 'fq': [ + "{!terms f=id}%s" % ','.join(tile_ids) if len(tile_ids) > 0 else '' + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) + + return self.convert_iso_to_datetime(results[0]['tile_min_time_dt']) + + def find_max_date_from_tiles(self, tile_ids, ds=None, **kwargs): + + if ds is not None: + search = 'dataset_s:%s' % ds + else: + search = '*:*' + + kwargs['rows'] = 1 + kwargs['fl'] = 'tile_max_time_dt' + kwargs['sort'] = ['tile_max_time_dt desc'] + additionalparams = { + 'fq': [ + "{!terms f=id}%s" % ','.join(tile_ids) if len(tile_ids) > 0 else '' + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) + + return self.convert_iso_to_datetime(results[0]['tile_max_time_dt']) + + def find_min_max_date_from_granule(self, ds, granule_name, **kwargs): + search = 'dataset_s:%s' % ds + + kwargs['rows'] = 1 + kwargs['fl'] = 'tile_min_time_dt' + kwargs['sort'] = ['tile_min_time_dt asc'] + additionalparams = { + 'fq': [ + "granule_s:%s" % granule_name + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + results, start, found = self.do_query(*(search, None, None, False, None), **additionalparams) + start_time = self.convert_iso_to_datetime(results[0]['tile_min_time_dt']) + + kwargs['fl'] = 'tile_max_time_dt' + 
kwargs['sort'] = ['tile_max_time_dt desc'] + additionalparams = { + 'fq': [ + "granule_s:%s" % granule_name + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + results, start, found = self.do_query(*(search, None, None, False, None), **additionalparams) + end_time = self.convert_iso_to_datetime(results[0]['tile_max_time_dt']) + + return start_time, end_time + + def get_data_series_list(self): + + datasets = self.get_data_series_list_simple() + + for dataset in datasets: + min_date = self.find_min_date_from_tiles([], ds=dataset['title']) + max_date = self.find_max_date_from_tiles([], ds=dataset['title']) + dataset['start'] = (min_date - EPOCH).total_seconds() + dataset['end'] = (max_date - EPOCH).total_seconds() + dataset['iso_start'] = min_date.strftime(ISO_8601) + dataset['iso_end'] = max_date.strftime(ISO_8601) + + return datasets + + def get_data_series_list_simple(self): + search = "*:*" + params = { + 'rows': 0, + "facet": "true", + "facet.field": "dataset_s", + "facet.mincount": "1", + "facet.limit": "-1" + } + + + response = self.do_query_raw(*(search, None, None, False, None), **params) + l = [] + for g, v in zip(*[iter(response.facets["facet_fields"]["dataset_s"])]*2): + l.append({ + "shortName": g, + "title": g, + "tileCount": v + }) + l = sorted(l, key=lambda entry: entry["title"]) + return l + + def get_data_series_stats(self, ds): + search = "dataset_s:%s" % ds + params = { + "facet": "true", + "facet.field": ["dataset_s", "tile_max_time_dt"], + "facet.limit": "-1", + "facet.mincount": "1", + "facet.pivot": "{!stats=piv1}dataset_s", + "stats": "on", + "stats.field": ["{!tag=piv1 min=true max=true sum=false}tile_max_time_dt","{!tag=piv1 min=true max=false sum=false}tile_min_val_d","{!tag=piv1 min=false max=true sum=false}tile_max_val_d"] + } + + response = self.do_query_raw(*(search, None, None, False, None), **params) + + stats = {} + + for g in response.facets["facet_pivot"]["dataset_s"]: + if g["value"] == ds: + stats["start"] = self.convert_iso_to_timestamp(g["stats"]["stats_fields"]["tile_max_time_dt"]["min"]) + stats["end"] = self.convert_iso_to_timestamp(g["stats"]["stats_fields"]["tile_max_time_dt"]["max"]) + stats["minValue"] = g["stats"]["stats_fields"]["tile_min_val_d"]["min"] + stats["maxValue"] = g["stats"]["stats_fields"]["tile_max_val_d"]["max"] + + + stats["availableDates"] = [] + for dt in response.facets["facet_fields"]["tile_max_time_dt"][::2]: + stats["availableDates"].append(self.convert_iso_to_timestamp(dt)) + + stats["availableDates"] = sorted(stats["availableDates"]) + + return stats + + def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year): + + search = 'dataset_s:%s' % ds + + params = { + 'fq': [ + "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, + "tile_count_i:[1 TO *]", + "day_of_year_i:[* TO %s]" % day_of_year + ], + 'rows': 1 + } + + results, start, found = self.do_query( + *(search, None, None, True, ('day_of_year_i desc',)), **params) + + return [results[0]] + + def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs): + + search = 'dataset_s:%s' % ds + + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + additionalparams = { + 'fq': [ + "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), + "{!frange l=0 u=0}ms(tile_min_time_dt,tile_max_time_dt)", + "tile_count_i:[1 TO *]", + "tile_min_time_dt:[%s TO %s] " % (search_start_s, 
search_end_s) + ], + 'rows': 0, + 'facet': 'true', + 'facet.field': 'tile_min_time_dt', + 'facet.mincount': '1', + 'facet.limit': '-1' + } + + self._merge_kwargs(additionalparams, **kwargs) + + response = self.do_query_raw(*(search, None, None, False, None), **additionalparams) + + daysinrangeasc = sorted( + [(datetime.strptime(a_date, SOLR_FORMAT) - datetime.utcfromtimestamp(0)).total_seconds() for a_date + in response.facets['facet_fields']['tile_min_time_dt'][::2]]) + + return daysinrangeasc + + def find_all_tiles_in_box_sorttimeasc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, + end_time=-1, **kwargs): + + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), + "tile_count_i:[1 TO *]" + ] + } + + if 0 <= start_time <= end_time: + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + additionalparams['fq'].append(time_clause) + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all( + *(search, None, None, False, 'tile_min_time_dt asc, tile_max_time_dt asc'), + **additionalparams) + + def find_all_tiles_in_polygon_sorttimeasc(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, + "tile_count_i:[1 TO *]" + ] + } + + if 0 <= start_time <= end_time: + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + additionalparams['fq'].append(time_clause) + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all( + *(search, None, None, False, 'tile_min_time_dt asc, tile_max_time_dt asc'), + **additionalparams) + + def find_all_tiles_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, + "tile_count_i:[1 TO *]" + ] + } + + if 0 <= start_time <= end_time: + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + additionalparams['fq'].append(time_clause) + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all( + *(search, None, None, False, None), + **additionalparams) + + def find_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): + + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "{!field 
f=geo}Intersects(%s)" % bounding_polygon.wkt, + "tile_count_i:[1 TO *]" + ], + 'rows': 0, + 'facet': 'true', + 'facet.field': 'geo_s', + 'facet.limit': -1, + 'facet.mincount': 1 + } + + if 0 <= start_time <= end_time: + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + additionalparams['fq'].append(time_clause) + + self._merge_kwargs(additionalparams, **kwargs) + + response = self.do_query_raw(*(search, None, None, False, None), **additionalparams) + + distinct_bounds = [wkt.loads(key).bounds for key in response.facets["facet_fields"]["geo_s"][::2]] + + return distinct_bounds + + def find_tiles_by_exact_bounds(self, minx, miny, maxx, maxy, ds, start_time=0, end_time=-1, **kwargs): + + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "tile_min_lon:\"%s\"" % minx, + "tile_min_lat:\"%s\"" % miny, + "tile_max_lon:\"%s\"" % maxx, + "tile_max_lat:\"%s\"" % maxy, + "tile_count_i:[1 TO *]" + ] + } + + if 0 <= start_time <= end_time: + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + additionalparams['fq'].append(time_clause) + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all( + *(search, None, None, False, None), + **additionalparams) + + def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, search_time, **kwargs): + search = 'dataset_s:%s' % ds + + the_time = datetime.utcfromtimestamp(search_time).strftime(SOLR_FORMAT) + time_clause = "(" \ + "tile_min_time_dt:[* TO %s] " \ + "AND tile_max_time_dt:[%s TO *] " \ + ")" % ( + the_time, the_time + ) + + additionalparams = { + 'fq': [ + "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), + "tile_count_i:[1 TO *]", + time_clause + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all(*(search, None, None, False, None), **additionalparams) + + def find_all_tiles_in_polygon_at_time(self, bounding_polygon, ds, search_time, **kwargs): + search = 'dataset_s:%s' % ds + + the_time = datetime.utcfromtimestamp(search_time).strftime(SOLR_FORMAT) + time_clause = "(" \ + "tile_min_time_dt:[* TO %s] " \ + "AND tile_max_time_dt:[%s TO *] " \ + ")" % ( + the_time, the_time + ) + + additionalparams = { + 'fq': [ + "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, + "tile_count_i:[1 TO *]", + time_clause + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all(*(search, None, None, False, None), **additionalparams) + + def find_all_tiles_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): + search = 'dataset_s:%s' % ds + + the_time = datetime.utcfromtimestamp(time).strftime(SOLR_FORMAT) + time_clause = "(" \ + "tile_min_time_dt:[* TO %s] " \ + "AND tile_max_time_dt:[%s TO *] " \ + ")" % ( + the_time, the_time + ) + + additionalparams = { + 'fq': [ 
+ "geo:\"Within(ENVELOPE(%s,%s,%s,%s))\"" % (min_lon, max_lon, max_lat, min_lat), + "tile_count_i:[1 TO *]", + time_clause + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all(*(search, "product(tile_avg_val_d, tile_count_i),*", None, False, None), + **additionalparams) + + def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): + search = 'dataset_s:%s' % ds + + the_time = datetime.utcfromtimestamp(time).strftime(SOLR_FORMAT) + time_clause = "(" \ + "tile_min_time_dt:[* TO %s] " \ + "AND tile_max_time_dt:[%s TO *] " \ + ")" % ( + the_time, the_time + ) + + additionalparams = { + 'fq': [ + "geo:\"Intersects(MultiLineString((%s %s, %s %s),(%s %s, %s %s),(%s %s, %s %s),(%s %s, %s %s)))\"" % ( + min_lon, max_lat, max_lon, max_lat, min_lon, max_lat, min_lon, min_lat, max_lon, max_lat, max_lon, + min_lat, min_lon, min_lat, max_lon, min_lat), + "-geo:\"Within(ENVELOPE(%s,%s,%s,%s))\"" % (min_lon, max_lon, max_lat, min_lat), + "tile_count_i:[1 TO *]", + time_clause + ] + } + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all(*(search, None, None, False, None), **additionalparams) + + def find_all_tiles_by_metadata(self, metadata, ds, start_time=0, end_time=-1, **kwargs): + """ + Get a list of tile metadata that matches the specified metadata, start_time, end_time. + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tile metadata + """ + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': metadata + } + + if 0 <= start_time <= end_time: + additionalparams['fq'].append(self.get_formatted_time_clause(start_time, end_time)) + + self._merge_kwargs(additionalparams, **kwargs) + + return self.do_query_all( + *(search, None, None, False, None), + **additionalparams) + + def get_formatted_time_clause(self, start_time, end_time): + search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) + search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) + + time_clause = "(" \ + "tile_min_time_dt:[%s TO %s] " \ + "OR tile_max_time_dt:[%s TO %s] " \ + "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ + ")" % ( + search_start_s, search_end_s, + search_start_s, search_end_s, + search_start_s, search_end_s + ) + return time_clause + + def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): + """ + Return number of tiles that match search criteria. 
+ :param ds: The dataset name to search + :param bounding_polygon: The polygon to search for tiles + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :return: number of tiles that match search criteria + """ + search = 'dataset_s:%s' % ds + + additionalparams = { + 'fq': [ + "tile_count_i:[1 TO *]" + ], + 'rows': 0 + } + + if bounding_polygon: + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + additionalparams['fq'].append("geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon)) + + if 0 <= start_time <= end_time: + additionalparams['fq'].append(self.get_formatted_time_clause(start_time, end_time)) + + if metadata: + additionalparams['fq'].extend(metadata) + + self._merge_kwargs(additionalparams, **kwargs) + + results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) + + return found + + def do_query(self, *args, **params): + + response = self.do_query_raw(*args, **params) + + return response.docs, response.raw_response['response']['start'], response.hits + + def do_query_raw(self, *args, **params): + + if 'fl' not in list(params.keys()) and args[1]: + params['fl'] = args[1] + + if 'sort' not in list(params.keys()) and args[4]: + params['sort'] = args[4] + + # If dataset_s is specified as the search term, + # add the _route_ parameter to limit the search to the correct shard + if 'dataset_s:' in args[0]: + ds = args[0].split(':')[-1] + params['shard_keys'] = ds + '!' + + with SOLR_CON_LOCK: + response = self.solrcon.search(args[0], **params) + + return response + + + def do_query_all(self, *args, **params): + + results = [] + + response = self.do_query_raw(*args, **params) + results.extend(response.docs) + + limit = min(params.get('limit', float('inf')), response.hits) + + while len(results) < limit: + params['start'] = len(results) + response = self.do_query_raw(*args, **params) + results.extend(response.docs) + + assert len(results) == limit + + return results + + def convert_iso_to_datetime(self, date): + return datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=UTC) + + def convert_iso_to_timestamp(self, date): + return (self.convert_iso_to_datetime(date) - EPOCH).total_seconds() + + def ping(self): + solrAdminPing = '%s/solr/%s/admin/ping' % (self.solrUrl, self.solrCore) + try: + r = requests.get(solrAdminPing, params={'wt': 'json'}) + results = json.loads(r.text) + return results + except: + return None + + @staticmethod + def _merge_kwargs(additionalparams, **kwargs): + # Only Solr-specific kwargs are parsed + # And the special 'limit' + try: + additionalparams['limit'] = kwargs['limit'] + except KeyError: + pass + + try: + additionalparams['_route_'] = kwargs['_route_'] + except KeyError: + pass + + try: + additionalparams['rows'] = kwargs['rows'] + except KeyError: + pass + + try: + additionalparams['start'] = kwargs['start'] + except KeyError: + pass + + try: + kwfq = kwargs['fq'] if isinstance(kwargs['fq'], list) else list(kwargs['fq']) + except KeyError: + kwfq = [] + + try: + additionalparams['fq'].extend(kwfq) + except KeyError: + additionalparams['fq'] = kwfq + + try: + kwfl = kwargs['fl'] if isinstance(kwargs['fl'], list) else [kwargs['fl']] + except KeyError: + kwfl = [] + + try: + additionalparams['fl'].extend(kwfl) + except KeyError: + additionalparams['fl'] = kwfl + + try: + s = kwargs['sort'] if isinstance(kwargs['sort'], list) else 
[kwargs['sort']] + except KeyError: + s = None + + try: + additionalparams['sort'].extend(s) + except KeyError: + if s is not None: + additionalparams['sort'] = s diff --git a/data-access/nexustiles/backends/nexusproto/dao/__init__.py b/data-access/nexustiles/backends/nexusproto/dao/__init__.py new file mode 100644 index 00000000..6acb5d12 --- /dev/null +++ b/data-access/nexustiles/backends/nexusproto/dao/__init__.py @@ -0,0 +1,14 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/data-access/nexustiles/backends/zarr/__init__.py b/data-access/nexustiles/backends/zarr/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index a3aa61e9..333b0c55 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -32,6 +32,13 @@ from .dao import SolrProxy from .dao import ElasticsearchProxy +from .backends.nexusproto.backend import NexusprotoTileService + + +from abc import ABC, abstractmethod + +from .AbstractTileService import AbstractTileService + from .model.nexusmodel import Tile, BBox, TileStats, TileVariable EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) @@ -78,7 +85,9 @@ class NexusTileServiceException(Exception): pass -class NexusTileService(object): +class NexusTileService(AbstractTileService): + backends = {} + def __init__(self, skipDatastore=False, skipMetadatastore=False, config=None): self._datastore = None self._metadatastore = None @@ -352,92 +361,6 @@ def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_tim bounds = self._metadatastore.find_distinct_bounding_boxes_in_polygon(bounding_polygon, ds, start_time, end_time) return [box(*b) for b in bounds] - def mask_tiles_to_bbox(self, min_lat, max_lat, min_lon, max_lon, tiles): - - for tile in tiles: - tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) - tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - # If this is multi-var, need to mask each variable separately. 
- if tile.is_multi: - # Combine space/time mask with existing mask on data - data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) - - num_vars = len(tile.data) - multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) - tile.data = ma.masked_where(multi_data_mask, tile.data) - else: - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - - def mask_tiles_to_bbox_and_time(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles): - for tile in tiles: - tile.times = ma.masked_outside(tile.times, start_time, end_time) - tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) - tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - - def mask_tiles_to_polygon(self, bounding_polygon, tiles): - - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - - return self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) - - def mask_tiles_to_polygon_and_time(self, bounding_polygon, start_time, end_time, tiles): - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - - return self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles) - - def mask_tiles_to_time_range(self, start_time, end_time, tiles): - """ - Masks data in tiles to specified time range. - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :param tiles: List of tiles - :return: A list tiles with data masked to specified time range - """ - if 0 <= start_time <= end_time: - for tile in tiles: - tile.times = ma.masked_outside(tile.times, start_time, end_time) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - # If this is multi-var, need to mask each variable separately. - if tile.is_multi: - # Combine space/time mask with existing mask on data - data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) - - num_vars = len(tile.data) - multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) - tile.data = ma.masked_where(multi_data_mask, tile.data) - else: - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): """ Return number of tiles that match search criteria. 
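Taken together, patches 01 and 02 turn NexusTileService into a dispatcher over a class-level backends registry of AbstractTileService implementations, with the nexusproto backend registered under the None key as the default. A minimal sketch of how a lookup against that registry might work (illustrative only: _backend_for is a hypothetical helper, and per-dataset routing is still a stub at this point in the series):

    def _backend_for(dataset=None):
        # Datasets without a dedicated backend fall back to the default
        # nexusproto backend registered under the None key.
        return NexusTileService.backends.get(dataset, NexusTileService.backends[None])

    # e.g. _backend_for('MY_GRIDDED_DATASET').get_dataseries_list()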
From 4f3f6112f0156f5d928f8549ed0a58d6d8f64e9e Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 5 Jul 2023 13:09:11 -0700 Subject: [PATCH 02/70] n/a --- data-access/nexustiles/AbstractTileService.py | 5 ++ data-access/nexustiles/config/datasets.ini | 18 +++++ .../nexustiles/config/datasets.ini.default | 18 +++++ data-access/nexustiles/nexustiles.py | 78 +++++++++++++------ 4 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 data-access/nexustiles/config/datasets.ini create mode 100644 data-access/nexustiles/config/datasets.ini.default diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py index f4f4449c..307a2c15 100644 --- a/data-access/nexustiles/AbstractTileService.py +++ b/data-access/nexustiles/AbstractTileService.py @@ -37,6 +37,11 @@ from nexustiles.nexustiles import NexusTileServiceException class AbstractTileService(ABC): + @staticmethod + @abstractmethod + def open_dataset(dataset_s, **kwargs): + pass + @abstractmethod def get_dataseries_list(self, simple=False): raise NotImplementedError() diff --git a/data-access/nexustiles/config/datasets.ini b/data-access/nexustiles/config/datasets.ini new file mode 100644 index 00000000..9f586cf2 --- /dev/null +++ b/data-access/nexustiles/config/datasets.ini @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +[solr] +host=http://localhost:8983 +core=nexusdatasets diff --git a/data-access/nexustiles/config/datasets.ini.default b/data-access/nexustiles/config/datasets.ini.default new file mode 100644 index 00000000..9f586cf2 --- /dev/null +++ b/data-access/nexustiles/config/datasets.ini.default @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +[solr] +host=http://localhost:8983 +core=nexusdatasets diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 333b0c55..622792eb 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -18,14 +18,16 @@ import sys import json from datetime import datetime -from functools import wraps, reduce +from functools import wraps, reduce, partial import numpy as np import numpy.ma as ma import pkg_resources from pytz import timezone, UTC from shapely.geometry import MultiPolygon, box +import pysolr +import threading from .dao import CassandraProxy from .dao import DynamoProxy from .dao import S3Proxy @@ -41,6 +43,8 @@ from .model.nexusmodel import Tile, BBox, TileStats, TileVariable +from webservice.webmodel import DatasetNotFoundException + EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) logging.basicConfig( @@ -85,36 +89,64 @@ class NexusTileServiceException(Exception): pass -class NexusTileService(AbstractTileService): - backends = {} +SOLR_LOCK = threading.Lock() +thread_local = threading.local() + - def __init__(self, skipDatastore=False, skipMetadatastore=False, config=None): - self._datastore = None - self._metadatastore = None +class NexusTileService(AbstractTileService): + backends = {} # relate ds names to factory func objects + + def __init__(self, config=None): self._config = configparser.RawConfigParser() - self._config.read(NexusTileService._get_config_files('config/datastores.ini')) + self._config.read(NexusTileService._get_config_files('config/datasets.ini')) + + self._alg_config = config if config: self.override_config(config) - if not skipDatastore: - datastore = self._config.get("datastore", "store") - if datastore == "cassandra": - self._datastore = CassandraProxy.CassandraProxy(self._config) - elif datastore == "s3": - self._datastore = S3Proxy.S3Proxy(self._config) - elif datastore == "dynamo": - self._datastore = DynamoProxy.DynamoProxy(self._config) + NexusTileService.backends[None] = NexusprotoTileService(False, False, config) + NexusTileService.backends['__nexusproto__'] = NexusTileService.backends[None] + + + + def _get_ingested_datasets(self): + solr_url = self._config.get("solr", "host") + solr_core = self._config.get("solr", "core") + solr_kwargs = {} + + if self._config.has_option("solr", "time_out"): + solr_kwargs["timeout"] = self._config.get("solr", "time_out") + + with SOLR_LOCK: + solrcon = getattr(thread_local, 'solrcon', None) + if solrcon is None: + solr_url = '%s/solr/%s' % (solr_url, solr_core) + solrcon = pysolr.Solr(solr_url, **solr_kwargs) + thread_local.solrcon = solrcon + + solrcon = solrcon + + response = solrcon.search('*:*') + + for dataset in response.docs: + d_id = dataset['dataset_s'] + store_type = dataset.get('store_type_s', 'nexusproto') + + if store_type == 'nexus_proto': + NexusTileService.backends[d_id] = NexusTileService.backends[None] else: - raise ValueError("Error reading datastore from config file") - - if not skipMetadatastore: - metadatastore = self._config.get("metadatastore", "store", fallback='solr') - if metadatastore == "solr": - self._metadatastore = SolrProxy.SolrProxy(self._config) - elif metadatastore == "elasticsearch": - self._metadatastore = ElasticsearchProxy.ElasticsearchProxy(self._config) + ds_config = dataset['config'] + # NexusTileService.backends[d_id] = + + + + + def get_tileservice_factory(self, dataset=None): + pass + + def override_config(self, config): for section in config.sections(): From 
e32d5addd4488ced41bc895a744c7c3de70f4301 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 6 Jul 2023 14:30:58 -0700 Subject: [PATCH 03/70] More nts backend stuff --- data-access/nexustiles/AbstractTileService.py | 45 +---- .../nexustiles/backends/nexusproto/backend.py | 4 +- .../nexustiles/backends/zarr/backend.py | 45 +++++ data-access/nexustiles/nexustiles.py | 181 +++++++++--------- 4 files changed, 143 insertions(+), 132 deletions(-) create mode 100644 data-access/nexustiles/backends/zarr/backend.py diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py index 307a2c15..6426295b 100644 --- a/data-access/nexustiles/AbstractTileService.py +++ b/data-access/nexustiles/AbstractTileService.py @@ -36,11 +36,16 @@ from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable from nexustiles.nexustiles import NexusTileServiceException + class AbstractTileService(ABC): - @staticmethod + # @staticmethod + # @abstractmethod + # def open_dataset(dataset_s, **kwargs): + # pass + @abstractmethod - def open_dataset(dataset_s, **kwargs): - pass + def try_connect(self) -> bool: + raise NotImplementedError() @abstractmethod def get_dataseries_list(self, simple=False): @@ -115,19 +120,6 @@ def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, * """ raise NotImplementedError() - @abstractmethod - def get_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): - """ - Return list of tiles that matches the specified metadata, start_time, end_time with tile data outside of time - range properly masked out. - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :param ds: The dataset name to search - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :return: A list of tiles - """ - raise NotImplementedError() - @abstractmethod def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs): """ @@ -148,15 +140,6 @@ def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs) def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): raise NotImplementedError() - @abstractmethod - def get_tiles_bounded_by_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, - **kwargs): - raise NotImplementedError() - - @abstractmethod - def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time=-1, **kwargs): - raise NotImplementedError() - @abstractmethod def get_min_max_time_by_granule(self, ds, granule_name): raise NotImplementedError() @@ -165,18 +148,6 @@ def get_min_max_time_by_granule(self, ds, granule_name): def get_dataset_overall_stats(self, ds): raise NotImplementedError() - @abstractmethod - def get_tiles_bounded_by_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - raise NotImplementedError() - - @abstractmethod - def get_tiles_bounded_by_polygon_at_time(self, polygon, dataset, time, **kwargs): - raise NotImplementedError() - - @abstractmethod - def get_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - raise NotImplementedError() - @abstractmethod def get_stats_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): raise NotImplementedError() diff --git a/data-access/nexustiles/backends/nexusproto/backend.py 
b/data-access/nexustiles/backends/nexusproto/backend.py index 86d5ca6a..aa0ab290 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -34,6 +34,7 @@ from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable from nexustiles.nexustiles import NexusTileServiceException +from nexustiles.AbstractTileService import AbstractTileService EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) @@ -44,8 +45,9 @@ logger = logging.getLogger("testing") -class NexusprotoTileService(object): +class NexusprotoTileService(AbstractTileService): def __init__(self, skipDatastore=False, skipMetadatastore=False, config=None): + AbstractTileService.__init__(self) self._datastore = None self._metadatastore = None diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py new file mode 100644 index 00000000..019cd753 --- /dev/null +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import configparser +import logging +import sys +import json +from datetime import datetime +from functools import reduce + +import numpy as np +import numpy.ma as ma +import pkg_resources +from pytz import timezone, UTC +from shapely.geometry import MultiPolygon, box + +from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable +from nexustiles.nexustiles import NexusTileServiceException +from nexustiles.AbstractTileService import AbstractTileService + +EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) +logger = logging.getLogger("testing") + + +class ZarrBackend(AbstractTileService): + def __init__(self, config): + AbstractTileService.__init__(self) + self.__config = config diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 622792eb..fde0a5f3 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -28,13 +28,10 @@ import pysolr import threading -from .dao import CassandraProxy -from .dao import DynamoProxy -from .dao import S3Proxy -from .dao import SolrProxy -from .dao import ElasticsearchProxy +from time import sleep from .backends.nexusproto.backend import NexusprotoTileService +from .backends.zarr.backend import ZarrBackend from abc import ABC, abstractmethod @@ -42,8 +39,9 @@ from .AbstractTileService import AbstractTileService from .model.nexusmodel import Tile, BBox, TileStats, TileVariable +from typing import Dict, Union -from webservice.webmodel import DatasetNotFoundException +from webservice.webmodel import DatasetNotFoundException, NexusProcessingException EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) @@ -90,12 +88,13 @@ class NexusTileServiceException(Exception): SOLR_LOCK = threading.Lock() +DS_LOCK = threading.Lock() thread_local = threading.local() class NexusTileService(AbstractTileService): - backends = {} # relate ds names to factory func objects + backends: Dict[Union[None, str], Dict[str, Union[AbstractTileService, bool]]] = {} def __init__(self, config=None): self._config = configparser.RawConfigParser() @@ -106,12 +105,37 @@ def __init__(self, config=None): if config: self.override_config(config) - NexusTileService.backends[None] = NexusprotoTileService(False, False, config) + NexusTileService.backends[None] = {"backend": NexusprotoTileService(False, False, config), 'up': True} NexusTileService.backends['__nexusproto__'] = NexusTileService.backends[None] + def __update_datasets(): + while True: + with DS_LOCK: + self._update_datasets() + sleep(3600) + threading.Thread(target=__update_datasets, name='dataset_update', daemon=False).start() - def _get_ingested_datasets(self): + + + @staticmethod + def __get_backend(dataset_s) -> AbstractTileService: + if dataset_s not in NexusTileService.backends: + raise DatasetNotFoundException(reason=f'Dataset {dataset_s} is not currently loaded/ingested') + + b = NexusTileService.backends[dataset_s] + + if not b['up']: + success = b['backend'].try_connect() + + if not success: + raise NexusProcessingException(reason=f'Dataset {dataset_s} is currently unavailable') + else: + NexusTileService.backends[dataset_s]['up'] = True + + return b['backend'] + + def _update_datasets(self): solr_url = self._config.get("solr", "host") solr_core = self._config.get("solr", "core") solr_kwargs = {} @@ -130,23 +154,34 @@ def _get_ingested_datasets(self): response = 
solrcon.search('*:*') + present_datasets = set() + for dataset in response.docs: d_id = dataset['dataset_s'] store_type = dataset.get('store_type_s', 'nexusproto') - if store_type == 'nexus_proto': - NexusTileService.backends[d_id] = NexusTileService.backends[None] - else: - ds_config = dataset['config'] - # NexusTileService.backends[d_id] = - + present_datasets.add(d_id) + if d_id in NexusTileService.backends: + continue + # is_up = NexusTileService.backends[d_id]['backend'].try_connect() + if store_type == 'nexus_proto' or store_type == 'nexusproto': + NexusTileService.backends[d_id] = NexusTileService.backends[None] + elif store_type == 'zarr': + ds_config = json.loads(dataset['config'][0]) + NexusTileService.backends[d_id] = { + 'backend': ZarrBackend(ds_config), + 'up': True + } + else: + logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') - def get_tileservice_factory(self, dataset=None): - pass - + removed_datasets = set(NexusTileService.backends.keys()).difference(present_datasets) + for dataset in removed_datasets: + logger.info(f"Removing dataset {dataset}") + del NexusTileService.backends[dataset] def override_config(self, config): for section in config.sections(): @@ -163,65 +198,35 @@ def get_dataseries_list(self, simple=False): @tile_data() def find_tile_by_id(self, tile_id, **kwargs): - return self._metadatastore.find_tile_by_id(tile_id) + return NexusTileService.__get_backend('__nexusproto__').find_tile_by_id(tile_id) @tile_data() def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): - return self._metadatastore.find_tiles_by_id(tile_ids, ds=ds, **kwargs) + return NexusTileService.__get_backend('__nexusproto__').find_tiles_by_id(tile_ids, ds=ds, **kwargs) def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, metrics_callback=None, **kwargs): - start = datetime.now() - result = self._metadatastore.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, dataset, start_time, - end_time, - **kwargs) - duration = (datetime.now() - start).total_seconds() - if metrics_callback: - metrics_callback(solr=duration) - return result + return NexusTileService.__get_backend(dataset).find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, + dataset, start_time, end_time, + metrics_callback, **kwargs) @tile_data() def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): - """ - Given a bounding polygon, dataset, and day of year, find tiles in that dataset with the same bounding - polygon and the closest day of year. 
- - For example: - given a polygon minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; and day of year=32 - search for first tile in MY_DS with identical bbox and day_of_year <= 32 (sorted by day_of_year desc) - - Valid matches: - minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 - minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 - - Invalid matches: - minx=1, miny=0, maxx=2, maxy=1; dataset=MY_DS; day of year = 32 - minx=0, miny=0, maxx=1, maxy=1; dataset=MY_OTHER_DS; day of year = 32 - minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 if minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 also exists - - :param bounding_polygon: The exact bounding polygon of tiles to search for - :param ds: The dataset name being searched - :param day_of_year: Tile day of year to search for, tile nearest to this day (without going over) will be returned - :return: List of one tile from ds with bounding_polygon on or before day_of_year or raise NexusTileServiceException if no tile found - """ - try: - tile = self._metadatastore.find_tile_by_polygon_and_most_recent_day_of_year(bounding_polygon, ds, - day_of_year) - except IndexError: - raise NexusTileServiceException("No tile found.").with_traceback(sys.exc_info()[2]) - - return tile + return NexusTileService.__get_backend(ds).find_tile_by_polygon_and_most_recent_day_of_year( + bounding_polygon, ds, day_of_year, **kwargs + ) @tile_data() def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - return self._metadatastore.find_all_tiles_in_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, - rows=5000, - **kwargs) + return NexusTileService.__get_backend(dataset).find_all_tiles_in_box_at_time( + min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs + ) @tile_data() def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): - return self._metadatastore.find_all_tiles_in_polygon_at_time(bounding_polygon, dataset, time, rows=5000, - **kwargs) + return NexusTileService.__get_backend(dataset).find_all_tiles_in_polygon_at_time( + bounding_polygon, dataset, time, **kwargs + ) @tile_data() def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): @@ -230,33 +235,22 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t start_time = (start_time - EPOCH).total_seconds() if type(end_time) is datetime: end_time = (end_time - EPOCH).total_seconds() - return self._metadatastore.find_all_tiles_in_box_sorttimeasc(min_lat, max_lat, min_lon, max_lon, ds, start_time, - end_time, **kwargs) + + return NexusTileService.__get_backend(ds).find_tiles_in_box( + min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs + ) @tile_data() def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): - # Find tiles that fall within the polygon in the Solr index - if 'sort' in list(kwargs.keys()): - tiles = self._metadatastore.find_all_tiles_in_polygon(bounding_polygon, ds, start_time, end_time, **kwargs) - else: - tiles = self._metadatastore.find_all_tiles_in_polygon_sorttimeasc(bounding_polygon, ds, start_time, - end_time, - **kwargs) - return tiles + return NexusTileService.__get_backend(ds).find_tiles_in_polygon( + bounding_polygon, ds, start_time, end_time, **kwargs + ) @tile_data() def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): - """ - Return list of tiles whose metadata 
matches the specified metadata, start_time, end_time. - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :param ds: The dataset name to search - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :return: A list of tiles - """ - tiles = self._metadatastore.find_all_tiles_by_metadata(metadata, ds, start_time, end_time, **kwargs) - - return tiles + return NexusTileService.__get_backend(ds).find_tiles_by_metadata( + metadata, ds, start_time, end_time, **kwargs + ) def get_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): """ @@ -287,16 +281,15 @@ def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs) :param kwargs: fetch_data: True/False = whether or not to retrieve tile data :return: """ - tiles = self._metadatastore.find_tiles_by_exact_bounds(bounds[0], bounds[1], bounds[2], bounds[3], ds, - start_time, - end_time) - return tiles + return NexusTileService.__get_backend(ds).find_tiles_by_exact_bounds( + bounds, ds, start_time, end_time, **kwargs + ) @tile_data() def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - return self._metadatastore.find_all_boundary_tiles_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, - rows=5000, - **kwargs) + return NexusTileService.__get_backend(dataset).find_all_boundary_tiles_at_time( + min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs + ) def get_tiles_bounded_by_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): @@ -317,12 +310,12 @@ def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time= return tiles def get_min_max_time_by_granule(self, ds, granule_name): - start_time, end_time = self._metadatastore.find_min_max_date_from_granule(ds, granule_name) - - return start_time, end_time + return NexusTileService.__get_backend(ds).get_min_max_time_by_granule( + ds, granule_name + ) def get_dataset_overall_stats(self, ds): - return self._metadatastore.get_data_series_stats(ds) + return NexusTileService.__get_backend(ds).get_dataset_overall_stats(ds) def get_tiles_bounded_by_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): tiles = self.find_all_tiles_in_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs) From ccc0de4e56122e570a1acc8dbbf6f9443dfebc23 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 10 Jul 2023 15:26:22 -0700 Subject: [PATCH 04/70] Working(?) 
np backend --- .../algorithms/DailyDifferenceAverage.py | 3 +- .../algorithms/StandardDeviationSearch.py | 2 +- .../app_builders/NexusAppBuilder.py | 2 +- data-access/nexustiles/AbstractTileService.py | 112 +- .../nexustiles/backends/nexusproto/backend.py | 2 +- .../nexustiles/backends/zarr/backend.py | 2 +- data-access/nexustiles/dao/CassandraProxy.py | 317 ----- data-access/nexustiles/dao/DynamoProxy.py | 146 -- .../nexustiles/dao/ElasticsearchProxy.py | 1235 ----------------- data-access/nexustiles/dao/S3Proxy.py | 141 -- data-access/nexustiles/dao/SolrProxy.py | 731 ---------- data-access/nexustiles/dao/__init__.py | 14 - data-access/nexustiles/exception.py | 2 + data-access/nexustiles/nexustiles.py | 286 ++-- data-access/setup.py | 18 +- 15 files changed, 205 insertions(+), 2808 deletions(-) delete mode 100644 data-access/nexustiles/dao/CassandraProxy.py delete mode 100644 data-access/nexustiles/dao/DynamoProxy.py delete mode 100644 data-access/nexustiles/dao/ElasticsearchProxy.py delete mode 100644 data-access/nexustiles/dao/S3Proxy.py delete mode 100644 data-access/nexustiles/dao/SolrProxy.py delete mode 100644 data-access/nexustiles/dao/__init__.py create mode 100644 data-access/nexustiles/exception.py diff --git a/analysis/webservice/algorithms/DailyDifferenceAverage.py b/analysis/webservice/algorithms/DailyDifferenceAverage.py index 05274fc2..c6c84951 100644 --- a/analysis/webservice/algorithms/DailyDifferenceAverage.py +++ b/analysis/webservice/algorithms/DailyDifferenceAverage.py @@ -21,7 +21,8 @@ import numpy as np import pytz -from nexustiles.nexustiles import NexusTileService, NexusTileServiceException +from nexustiles.nexustiles import NexusTileService +from nexustiles.exception import NexusTileServiceException from shapely.geometry import box from webservice.NexusHandler import nexus_handler diff --git a/analysis/webservice/algorithms/StandardDeviationSearch.py b/analysis/webservice/algorithms/StandardDeviationSearch.py index ae0566f1..26451cb1 100644 --- a/analysis/webservice/algorithms/StandardDeviationSearch.py +++ b/analysis/webservice/algorithms/StandardDeviationSearch.py @@ -19,7 +19,7 @@ from datetime import datetime from functools import partial -from nexustiles.nexustiles import NexusTileServiceException +from nexustiles.exception import NexusTileServiceException from pytz import timezone from webservice.NexusHandler import nexus_handler diff --git a/analysis/webservice/nexus_tornado/app_builders/NexusAppBuilder.py b/analysis/webservice/nexus_tornado/app_builders/NexusAppBuilder.py index afe7d690..01798583 100644 --- a/analysis/webservice/nexus_tornado/app_builders/NexusAppBuilder.py +++ b/analysis/webservice/nexus_tornado/app_builders/NexusAppBuilder.py @@ -53,7 +53,7 @@ def set_modules(self, module_dir, algorithm_config, remote_collections=None, max NexusHandler.executeInitializers(algorithm_config) self.log.info("Initializing request ThreadPool to %s" % max_request_threads) - tile_service_factory = partial(NexusTileService, False, False, algorithm_config) + tile_service_factory = partial(NexusTileService, algorithm_config) handler_args_builder = HandlerArgsBuilder( max_request_threads, tile_service_factory, diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py index 6426295b..6e5b4640 100644 --- a/data-access/nexustiles/AbstractTileService.py +++ b/data-access/nexustiles/AbstractTileService.py @@ -13,28 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -import configparser -import logging -import sys -import json from abc import ABC, abstractmethod -from datetime import datetime from functools import reduce import numpy as np import numpy.ma as ma -import pkg_resources -from pytz import timezone, UTC -from shapely.geometry import MultiPolygon, box - -from .dao import CassandraProxy -from .dao import DynamoProxy -from .dao import S3Proxy -from .dao import SolrProxy -from .dao import ElasticsearchProxy - -from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable -from nexustiles.nexustiles import NexusTileServiceException class AbstractTileService(ABC): @@ -43,9 +26,9 @@ class AbstractTileService(ABC): # def open_dataset(dataset_s, **kwargs): # pass - @abstractmethod - def try_connect(self) -> bool: - raise NotImplementedError() + # @abstractmethod + # def try_connect(self) -> bool: + # raise NotImplementedError() @abstractmethod def get_dataseries_list(self, simple=False): @@ -193,91 +176,6 @@ def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_tim """ raise NotImplementedError() - def mask_tiles_to_bbox(self, min_lat, max_lat, min_lon, max_lon, tiles): - for tile in tiles: - tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) - tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - # If this is multi-var, need to mask each variable separately. - if tile.is_multi: - # Combine space/time mask with existing mask on data - data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) - - num_vars = len(tile.data) - multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) - tile.data = ma.masked_where(multi_data_mask, tile.data) - else: - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - - def mask_tiles_to_bbox_and_time(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles): - for tile in tiles: - tile.times = ma.masked_outside(tile.times, start_time, end_time) - tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) - tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - - def mask_tiles_to_polygon(self, bounding_polygon, tiles): - - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - - return self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) - - def mask_tiles_to_polygon_and_time(self, bounding_polygon, start_time, end_time, tiles): - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - - return self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles) - - def mask_tiles_to_time_range(self, start_time, end_time, tiles): - """ - Masks data in tiles to specified time range. 
- :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :param tiles: List of tiles - :return: A list tiles with data masked to specified time range - """ - if 0 <= start_time <= end_time: - for tile in tiles: - tile.times = ma.masked_outside(tile.times, start_time, end_time) - - # Or together the masks of the individual arrays to create the new mask - data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ - | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ - | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - - # If this is multi-var, need to mask each variable separately. - if tile.is_multi: - # Combine space/time mask with existing mask on data - data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) - - num_vars = len(tile.data) - multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) - tile.data = ma.masked_where(multi_data_mask, tile.data) - else: - tile.data = ma.masked_where(data_mask, tile.data) - - tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - - return tiles - @abstractmethod def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): """ @@ -295,10 +193,6 @@ def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, m def fetch_data_for_tiles(self, *tiles): raise NotImplementedError() - @abstractmethod - def open_dataset(self, dataset): - raise NotImplementedError() - @abstractmethod def _metadata_store_docs_to_tiles(self, *store_docs): raise NotImplementedError() diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py index aa0ab290..6aa63644 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -33,7 +33,7 @@ from .dao import ElasticsearchProxy from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable -from nexustiles.nexustiles import NexusTileServiceException +from nexustiles.exception import NexusTileServiceException from nexustiles.AbstractTileService import AbstractTileService EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 019cd753..93963166 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -27,7 +27,7 @@ from shapely.geometry import MultiPolygon, box from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable -from nexustiles.nexustiles import NexusTileServiceException +from nexustiles.exception import NexusTileServiceException from nexustiles.AbstractTileService import AbstractTileService EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) diff --git a/data-access/nexustiles/dao/CassandraProxy.py b/data-access/nexustiles/dao/CassandraProxy.py deleted file mode 100644 index 96f7c4c6..00000000 --- a/data-access/nexustiles/dao/CassandraProxy.py +++ /dev/null @@ -1,317 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import uuid -from configparser import NoOptionError - -import nexusproto.DataTile_pb2 as nexusproto -import numpy as np -from cassandra.auth import PlainTextAuthProvider -from cassandra.cqlengine import columns, connection, CQLEngineException -from cassandra.cluster import NoHostAvailable -from cassandra.cqlengine.models import Model -from cassandra.policies import TokenAwarePolicy, DCAwareRoundRobinPolicy, WhiteListRoundRobinPolicy -from multiprocessing.synchronize import Lock -from nexusproto.serialization import from_shaped_array - -INIT_LOCK = Lock(ctx=None) - -logger = logging.getLogger(__name__) - -class NexusTileData(Model): - __table_name__ = 'sea_surface_temp' - tile_id = columns.UUID(primary_key=True) - tile_blob = columns.Blob() - - __nexus_tile = None - - def _get_nexus_tile(self): - if self.__nexus_tile is None: - self.__nexus_tile = nexusproto.TileData.FromString(self.tile_blob) - - return self.__nexus_tile - - def get_raw_data_array(self): - - nexus_tile = self._get_nexus_tile() - the_tile_type = nexus_tile.tile.WhichOneof("tile_type") - - the_tile_data = getattr(nexus_tile.tile, the_tile_type) - - return from_shaped_array(the_tile_data.variable_data) - - def get_lat_lon_time_data_meta(self): - """ - Retrieve data from data store and metadata from metadata store - for this tile. For gridded tiles, the tile shape of the data - will match the input shape. For example, if the input was a - 30x30 tile, all variables will also be 30x30. However, if the - tile is a swath tile, the data will be transformed along the - diagonal of the data matrix. For example, a 30x30 tile would - become 900x900 where the 900 points are along the diagonal. - - Multi-variable tile will also include an extra dimension in the - data array. For example, a 30 x 30 x 30 array would be - transformed to N x 30 x 30 x 30 where N is the number of - variables in this tile. 
- - latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data, is_multi_var - - :return: latitude data - :return: longitude data - :return: time data - :return: data - :return: meta data dictionary - :return: boolean flag, True if this tile has more than one variable - """ - is_multi_var = False - - if self._get_nexus_tile().HasField('grid_tile'): - grid_tile = self._get_nexus_tile().grid_tile - - grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) - latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) - longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) - - if len(grid_tile_data.shape) == 2: - grid_tile_data = grid_tile_data[np.newaxis, :] - - # Extract the meta data - meta_data = {} - for meta_data_obj in grid_tile.meta_data: - name = meta_data_obj.name - meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - if len(meta_array.shape) == 2: - meta_array = meta_array[np.newaxis, :] - meta_data[name] = meta_array - - return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data, is_multi_var - elif self._get_nexus_tile().HasField('swath_tile'): - swath_tile = self._get_nexus_tile().swath_tile - - latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) - longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) - time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) - - # Simplify the tile if the time dimension is the same value repeated - if np.all(time_data == np.min(time_data)): - time_data = np.array([np.min(time_data)]) - - swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) - - tile_data = self._to_standard_index(swath_tile_data, - (len(time_data), len(latitude_data), len(longitude_data))) - - # Extract the meta data - meta_data = {} - for meta_data_obj in swath_tile.meta_data: - name = meta_data_obj.name - actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) - meta_data[name] = reshaped_meta_array - - return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var - elif self._get_nexus_tile().HasField('time_series_tile'): - time_series_tile = self._get_nexus_tile().time_series_tile - - time_series_tile_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.variable_data)) - time_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.time)).reshape(-1) - latitude_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.latitude)) - longitude_data = np.ma.masked_invalid(from_shaped_array(time_series_tile.longitude)) - - reshaped_array = np.ma.masked_all((len(time_data), len(latitude_data), len(longitude_data))) - idx = np.arange(len(latitude_data)) - reshaped_array[:, idx, idx] = time_series_tile_data - tile_data = reshaped_array - # Extract the meta data - meta_data = {} - for meta_data_obj in time_series_tile.meta_data: - name = meta_data_obj.name - meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - - reshaped_meta_array = np.ma.masked_all((len(time_data), len(latitude_data), len(longitude_data))) - idx = np.arange(len(latitude_data)) - reshaped_meta_array[:, idx, idx] = meta_array - - meta_data[name] = reshaped_meta_array - - return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var - elif 
self._get_nexus_tile().HasField('swath_multi_variable_tile'): - swath_tile = self._get_nexus_tile().swath_multi_variable_tile - is_multi_var = True - - latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) - longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) - time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) - - # Simplify the tile if the time dimension is the same value repeated - if np.all(time_data == np.min(time_data)): - time_data = np.array([np.min(time_data)]) - - swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) - - desired_shape = ( - len(time_data), - len(latitude_data), - len(longitude_data), - ) - tile_data = self._to_standard_index(swath_tile_data, desired_shape, is_multi_var=True) - - # Extract the meta data - meta_data = {} - for meta_data_obj in swath_tile.meta_data: - name = meta_data_obj.name - actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) - meta_data[name] = reshaped_meta_array - - return latitude_data, longitude_data, time_data, tile_data, meta_data, is_multi_var - elif self._get_nexus_tile().HasField('grid_multi_variable_tile'): - grid_multi_variable_tile = self._get_nexus_tile().grid_multi_variable_tile - is_multi_var = True - - grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.variable_data)) - latitude_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.latitude)) - longitude_data = np.ma.masked_invalid(from_shaped_array(grid_multi_variable_tile.longitude)) - - # If there are 3 dimensions, that means the time dimension - # was squeezed. Add back in - if len(grid_tile_data.shape) == 3: - grid_tile_data = np.expand_dims(grid_tile_data, axis=1) - # If there are 4 dimensions, that means the time dimension - # is present. Move the multivar dimension. 
- if len(grid_tile_data.shape) == 4: - grid_tile_data = np.moveaxis(grid_tile_data, -1, 0) - - # Extract the meta data - meta_data = {} - for meta_data_obj in grid_multi_variable_tile.meta_data: - name = meta_data_obj.name - meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - if len(meta_array.shape) == 2: - meta_array = meta_array[np.newaxis, :] - meta_data[name] = meta_array - - return latitude_data, longitude_data, np.array([grid_multi_variable_tile.time]), grid_tile_data, meta_data, is_multi_var - else: - raise NotImplementedError("Only supports grid_tile, swath_tile, swath_multi_variable_tile, and time_series_tile") - - @staticmethod - def _to_standard_index(data_array, desired_shape, is_multi_var=False): - """ - Transform swath data to a standard format where data runs along - diagonal of ND matrix and the non-diagonal data points are - masked - - :param data_array: The data array to be transformed - :param desired_shape: The desired shape of the resulting array - :param is_multi_var: True if this is a multi-variable tile - :type data_array: np.array - :type desired_shape: tuple - :type is_multi_var: bool - :return: Reshaped array - :rtype: np.array - """ - - reshaped_array = [] - if is_multi_var: - reshaped_data_array = np.moveaxis(data_array, -1, 0) - else: - reshaped_data_array = [data_array] - - for variable_data_array in reshaped_data_array: - if desired_shape[0] == 1: - variable_reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) - else: - variable_reshaped_array = np.ma.masked_all(desired_shape) - - row, col = np.indices(variable_data_array.shape) - - variable_reshaped_array[ - np.diag_indices(desired_shape[1], len(variable_reshaped_array.shape))] = \ - variable_data_array[ - row.flat, col.flat] - variable_reshaped_array.mask[ - np.diag_indices(desired_shape[1], len(variable_reshaped_array.shape))] = \ - variable_data_array.mask[ - row.flat, col.flat] - - if desired_shape[0] == 1: - reshaped_array.append(variable_reshaped_array[np.newaxis, :]) - else: - reshaped_array.append(variable_reshaped_array) - - if not is_multi_var: - # If single var, squeeze extra dim out of array - reshaped_array = reshaped_array[0] - - return reshaped_array - - -class CassandraProxy(object): - def __init__(self, config): - self.config = config - self.__cass_url = config.get("cassandra", "host") - self.__cass_username = config.get("cassandra", "username") - self.__cass_password = config.get("cassandra", "password") - self.__cass_keyspace = config.get("cassandra", "keyspace") - self.__cass_local_DC = config.get("cassandra", "local_datacenter") - self.__cass_protocol_version = config.getint("cassandra", "protocol_version") - self.__cass_dc_policy = config.get("cassandra", "dc_policy") - - try: - self.__cass_port = config.getint("cassandra", "port") - except NoOptionError: - self.__cass_port = 9042 - - with INIT_LOCK: - try: - connection.get_cluster() - except CQLEngineException: - self.__open() - - def __open(self): - if self.__cass_dc_policy == 'DCAwareRoundRobinPolicy': - dc_policy = DCAwareRoundRobinPolicy(self.__cass_local_DC) - token_policy = TokenAwarePolicy(dc_policy) - elif self.__cass_dc_policy == 'WhiteListRoundRobinPolicy': - token_policy = WhiteListRoundRobinPolicy([self.__cass_url]) - - if self.__cass_username and self.__cass_password: - auth_provider = PlainTextAuthProvider(username=self.__cass_username, password=self.__cass_password) - else: - auth_provider = None - try: - connection.setup( - [host for host in self.__cass_url.split(',')], 
self.__cass_keyspace, - protocol_version=self.__cass_protocol_version, load_balancing_policy=token_policy, - port=self.__cass_port, - auth_provider=auth_provider - ) - except NoHostAvailable as e: - logger.error("Cassandra is not accessible, SDAP will not server local datasets", e) - - def fetch_nexus_tiles(self, *tile_ids): - tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if - (isinstance(tile_id, str) or isinstance(tile_id, str))] - - res = [] - for tile_id in tile_ids: - filterResults = NexusTileData.objects.filter(tile_id=tile_id) - if len(filterResults) > 0: - res.append(filterResults[0]) - - return res diff --git a/data-access/nexustiles/dao/DynamoProxy.py b/data-access/nexustiles/dao/DynamoProxy.py deleted file mode 100644 index 1ee70ac1..00000000 --- a/data-access/nexustiles/dao/DynamoProxy.py +++ /dev/null @@ -1,146 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import uuid -import nexusproto.DataTile_pb2 as nexusproto -from nexusproto.serialization import from_shaped_array -import numpy as np -import boto3 - -class NexusTileData(object): - __nexus_tile = None - __data = None - tile_id = None - - def __init__(self, data, _tile_id): - if self.__data is None: - self.__data = data - if self.tile_id is None: - self.tile_id = _tile_id - - def _get_nexus_tile(self): - if self.__nexus_tile is None: - self.__nexus_tile = nexusproto.TileData.FromString(self.__data) - - return self.__nexus_tile - - def get_raw_data_array(self): - - nexus_tile = self._get_nexus_tile() - the_tile_type = nexus_tile.tile.WhichOneof("tile_type") - - the_tile_data = getattr(nexus_tile.tile, the_tile_type) - - return from_shaped_array(the_tile_data.variable_data) - - def get_lat_lon_time_data_meta(self): - if self._get_nexus_tile().HasField('grid_tile'): - grid_tile = self._get_nexus_tile().grid_tile - - grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) - latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) - longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) - - if len(grid_tile_data.shape) == 2: - grid_tile_data = grid_tile_data[np.newaxis, :] - - # Extract the meta data - meta_data = {} - for meta_data_obj in grid_tile.meta_data: - name = meta_data_obj.name - meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - if len(meta_array.shape) == 2: - meta_array = meta_array[np.newaxis, :] - meta_data[name] = meta_array - - return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data - elif self._get_nexus_tile().HasField('swath_tile'): - swath_tile = self._get_nexus_tile().swath_tile - - latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) - longitude_data = 
np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) - time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) - - # Simplify the tile if the time dimension is the same value repeated - if np.all(time_data == np.min(time_data)): - time_data = np.array([np.min(time_data)]) - - swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) - - tile_data = self._to_standard_index(swath_tile_data, - (len(time_data), len(latitude_data), len(longitude_data))) - - # Extract the meta data - meta_data = {} - for meta_data_obj in swath_tile.meta_data: - name = meta_data_obj.name - actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) - meta_data[name] = reshaped_meta_array - - return latitude_data, longitude_data, time_data, tile_data, meta_data - else: - raise NotImplementedError("Only supports grid_tile and swath_tile") - - @staticmethod - def _to_standard_index(data_array, desired_shape): - - if desired_shape[0] == 1: - reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) - row, col = np.indices(data_array.shape) - - reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ - row.flat, col.flat] - reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ - row.flat, col.flat] - reshaped_array = reshaped_array[np.newaxis, :] - else: - reshaped_array = np.ma.masked_all(desired_shape) - row, col = np.indices(data_array.shape) - - reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ - row.flat, col.flat] - reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ - row.flat, col.flat] - - return reshaped_array - - -class DynamoProxy(object): - def __init__(self, config): - self.config = config - self.__dynamo_tablename = config.get("dynamo", "table") - self.__dynamo_region = config.get("dynamo", "region") - self.__dynamo = boto3.resource('dynamodb', region_name=self.__dynamo_region) - self.__dynamo_table = self.__dynamo.Table(self.__dynamo_tablename) - self.__nexus_tile = None - - def fetch_nexus_tiles(self, *tile_ids): - - tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if - (isinstance(tile_id, str) or isinstance(tile_id, str))] - res = [] - for tile_id in tile_ids: - response = self.__dynamo_table.get_item( - Key = { - 'tile_id': str(tile_id) - } - ) - item = response['Item'] - data = item['data'].__str__() - nexus_tile = NexusTileData(data, str(tile_id)) - res.append(nexus_tile) - - return res \ No newline at end of file diff --git a/data-access/nexustiles/dao/ElasticsearchProxy.py b/data-access/nexustiles/dao/ElasticsearchProxy.py deleted file mode 100644 index 157630f6..00000000 --- a/data-access/nexustiles/dao/ElasticsearchProxy.py +++ /dev/null @@ -1,1235 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging -import threading -import time -import re -from datetime import datetime -from pytz import timezone, UTC - -import requests -import pysolr -from shapely import wkt -from elasticsearch import Elasticsearch - -ELASTICSEARCH_CON_LOCK = threading.Lock() -thread_local = threading.local() - -EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) -ELASTICSEARCH_FORMAT = '%Y-%m-%dT%H:%M:%SZ' -ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' - - -class ElasticsearchProxy(object): - def __init__(self, config): - self.elasticsearchHosts = config.get("elasticsearch", "host").split(',') - self.elasticsearchIndex = config.get("elasticsearch", "index") - self.elasticsearchUsername = config.get("elasticsearch", "username") - self.elasticsearchPassword = config.get("elasticsearch", "password") - self.logger = logging.getLogger(__name__) - - with ELASTICSEARCH_CON_LOCK: - elasticsearchcon = getattr(thread_local, 'elasticsearchcon', None) - if elasticsearchcon is None: - elasticsearchcon = Elasticsearch(hosts=self.elasticsearchHosts, http_auth=(self.elasticsearchUsername, self.elasticsearchPassword)) - thread_local.elasticsearchcon = elasticsearchcon - - self.elasticsearchcon = elasticsearchcon - - def find_tile_by_id(self, tile_id): - - params = { - "size": 1, - "query": { - "term": { - "id": { - "value": tile_id - } - } - } - } - - results, _, hits = self.do_query(*(None, None, None, True, None), **params) - assert hits == 1, f"Found {hits} results, expected exactly 1" - return [results[0]["_source"]] - - def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): - - params = { - "query": { - "bool": { - "filter": [], - "should": [], - "minimum_should_match": 1 - } - } - } - - for tile_id in tile_ids: - params['query']['bool']['should'].append({"term": {"id": {"value": tile_id}}}) - - if ds is not None: - params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) - - self._merge_kwargs(params, **kwargs) - - results = self.do_query_all(*(None, None, None, False, None), **params) - assert len(results) == len(tile_ids), "Found %s results, expected exactly %s" % (len(results), len(tile_ids)) - return results - - def find_min_date_from_tiles(self, tile_ids, ds=None, **kwargs): - params = { - "size": 0, - "query": { - "bool": { - "filter": [], - "should": [] - } - }, - "aggs": { - "min_date_agg": { - "min": { - "field": "tile_min_time_dt" - } - } - } - } - - for tile_id in tile_ids: - params['query']['bool']['should'].append({"term": {"id": {"value": tile_id}}}) - if ds is not None: - params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) - - aggregations = self.do_aggregation(*(None, None, None, True, None), **params) - return self.convert_iso_to_datetime(aggregations['min_date_agg']["value_as_string"]) - - def find_max_date_from_tiles(self, tile_ids, ds=None, **kwargs): - - params = { - "size": 0, - "query": { - "bool": { - "filter": [], - "should": [] - } - }, - "aggs": { - "max_date_agg": { - "max": { - "field": "tile_max_time_dt" - } - } - } - } - - for tile_id in tile_ids: - params['query']['bool']['should'].append({"term": {"id": {"value": 
tile_id}}}) - if ds is not None: - params['query']['bool']['filter'].append({"term": {"dataset_s": {"value": ds}}}) - - aggregations = self.do_aggregation(*(None, None, None, True, None), **params) - return self.convert_iso_to_datetime(aggregations['max_date_agg']["value_as_string"]) - - - def find_min_max_date_from_granule(self, ds, granule_name, **kwargs): - - params = { - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "term": { - "granule_s": { - "value": granule_name - } - } - } - ] - } - }, - "aggs": { - "min_date_agg": { - "max": { - "field": "tile_min_time_dt" - } - }, - "max_date_agg": { - "max": { - "field": "tile_max_time_dt" - } - } - } - } - - self._merge_kwargs(params, **kwargs) - - aggregations = self.do_aggregation(*(None, None, None, False, None), **params) - start_time = self.convert_iso_to_datetime(aggregations['min_date_agg']["value_as_string"]) - end_time = self.convert_iso_to_datetime(aggregations['max_date_agg']["value_as_string"]) - - return start_time, end_time - - def get_data_series_list(self): - - datasets = self.get_data_series_list_simple() - - for dataset in datasets: - min_date = self.find_min_date_from_tiles([], ds=dataset['title']) - max_date = self.find_max_date_from_tiles([], ds=dataset['title']) - dataset['start'] = (min_date - EPOCH).total_seconds() - dataset['end'] = (max_date - EPOCH).total_seconds() - dataset['iso_start'] = min_date.strftime(ISO_8601) - dataset['iso_end'] = max_date.strftime(ISO_8601) - - return datasets - - def get_data_series_list_simple(self): - - params = { - 'size': 0, - "aggs": { - "dataset_list_agg": { - "composite": { - "size":100, - "sources": [ - { - "dataset_s": { - "terms": { - "field": "dataset_s" - } - } - } - ] - } - } - } - } - - aggregations = self.do_aggregation_all(params, 'dataset_list_agg') - l = [] - - for dataset in aggregations: - l.append({ - "shortName": dataset['key']['dataset_s'], - "title": dataset['key']['dataset_s'], - "tileCount": dataset["doc_count"] - }) - - l = sorted(l, key=lambda entry: entry["title"]) - return l - - def get_data_series_stats(self, ds): - - params = { - "size": 0, - "query": { - "term":{ - "dataset_s": { - "value": ds - } - } - }, - "aggs": { - "available_dates": { - "composite": { - "size": 100, - "sources": [ - {"terms_tile_max_time_dt": {"terms": {"field": "tile_max_time_dt"}}} - ] - } - } - } - } - - aggregations = self.do_aggregation_all(params, 'available_dates') - stats = {} - stats['available_dates'] = [] - - for dt in aggregations: - stats['available_dates'].append(dt['key']['terms_tile_max_time_dt'] / 1000) - - stats['available_dates'] = sorted(stats['available_dates']) - - params = { - "size": 0, - "query": { - "term":{ - "dataset_s": { - "value": ds - } - } - }, - "aggs": { - "min_tile_min_val_d": { - "min": { - "field": "tile_min_val_d" - } - }, - "min_tile_max_time_dt": { - "min": { - "field": "tile_max_time_dt" - } - }, - "max_tile_max_time_dt": { - "max": { - "field": "tile_max_time_dt" - } - }, - "max_tile_max_val_d": { - "max": { - "field": "tile_max_val_d" - } - } - } - } - - aggregations = self.do_aggregation(*(None, None, None, False, None), **params) - stats["start"] = int(aggregations["min_tile_max_time_dt"]["value"]) / 1000 - stats["end"] = int(aggregations["max_tile_max_time_dt"]["value"]) / 1000 - stats["minValue"] = aggregations["min_tile_min_val_d"]["value"] - stats["maxValue"] = aggregations["max_tile_max_val_d"]["value"] - - return stats - - # day_of_year_i added (SDAP-347) - def 
find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year): - - max_lat = bounding_polygon.bounds[3] - min_lon = bounding_polygon.bounds[0] - min_lat = bounding_polygon.bounds[1] - max_lon = bounding_polygon.bounds[2] - - params = { - "size": "1", - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] - }, - "relation": "intersects" - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - }, - { - "range": { - "day_of_year_i": { - "lte": day_of_year - } - } - } - ] - } - } - } - result, _, _ = self.do_query(*(None, None, None, True, 'day_of_year_i desc'), **params) - - return [result[0]] - - def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs): - - search_start_s = datetime.utcfromtimestamp(start_time).strftime(ELASTICSEARCH_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(ELASTICSEARCH_FORMAT) - - params = { - "size": "0", - "_source": "tile_min_time_dt", - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "range": { - "tile_min_time_dt": { - "gte": search_start_s, - "lte": search_end_s - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] - }, - "relation": "intersects" - } - } - } - ] - } - }, - "aggs": { - "days_range_agg": { - "composite": { - "size":100, - "sources": [ - { - "tile_min_time_dt": { - "terms": { - "field": "tile_min_time_dt" - } - } - } - ] - } - } - } - } - - aggregations = self.do_aggregation_all(params, 'days_range_agg') - results = [res['key']['tile_min_time_dt'] for res in aggregations] - daysinrangeasc = sorted([(res / 1000) for res in results]) - return daysinrangeasc - - def find_all_tiles_in_box_sorttimeasc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, - end_time=-1, **kwargs): - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] - }, - "relation": "intersects" - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - } - ] - } - } - } - - - if 0 < start_time <= end_time: - params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, 'tile_min_time_dt asc,tile_max_time_dt asc'), **params) - - def find_all_tiles_in_polygon_sorttimeasc(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - nums = re.findall(r'\d+(?:\.\d*)?', bounding_polygon.wkt.rpartition(',')[0]) - polygon_coordinates = list(zip(*[iter(nums)] * 2)) - - max_lat = bounding_polygon.bounds[3] - min_lon = bounding_polygon.bounds[0] - min_lat = bounding_polygon.bounds[1] - max_lon = bounding_polygon.bounds[2] - - params = { - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] - }, - "relation": "intersects" - } - } - } - ] - } - } - } - - try: - if 'fl' in list(kwargs.keys()): - params["_source"] = 
kwargs["fl"].split(',') - except KeyError: - pass - - if 0 < start_time <= end_time: - params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - return self.do_query_all(*(None, None, None, False, 'tile_min_time_dt asc,tile_max_time_dt asc'), **params) - - def find_all_tiles_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - nums = re.findall(r'\d+(?:\.\d*)?', bounding_polygon.wkt.rpartition(',')[0]) - polygon_coordinates = list(zip(*[iter(nums)] * 2)) - - max_lat = bounding_polygon.bounds[3] - min_lon = bounding_polygon.bounds[0] - min_lat = bounding_polygon.bounds[1] - max_lon = bounding_polygon.bounds[2] - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] - }, - "relation": "intersects" - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - } - ] - } - } - } - - try: - if 'fl' in list(kwargs.keys()): - params["_source"] = kwargs["fl"].split(',') - except KeyError: - pass - - if 0 < start_time <= end_time: - params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, None), **params) - - def find_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - tile_max_lat = bounding_polygon.bounds[3] - tile_min_lon = bounding_polygon.bounds[0] - tile_min_lat = bounding_polygon.bounds[1] - tile_max_lon = bounding_polygon.bounds[2] - - params = { - "size": 0, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[tile_min_lon, tile_max_lat], [tile_max_lon, tile_min_lat]] - }, - "relation": "intersects" - } - } - } - ] - } - }, - "aggs": { - "distinct_bounding_boxes": { - "composite": { - "size": 100, - "sources": [ - { - "bounding_box": { - "terms": { - "script": { - "source": "String.valueOf(doc['tile_min_lon'].value) + ', ' + String.valueOf(doc['tile_max_lon'].value) + ', ' + String.valueOf(doc['tile_min_lat'].value) + ', ' + String.valueOf(doc['tile_max_lat'].value)", - "lang": "painless" - } - } - } - } - ] - } - } - } - } - - if 0 < start_time <= end_time: - params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - self._merge_kwargs(params, **kwargs) - aggregations = self.do_aggregation_all(params, 'distinct_bounding_boxes') - distinct_bounds = [] - for agg in aggregations: - coords = agg['key']['bounding_box'].split(',') - min_lon = round(float(coords[0]), 2) - max_lon = round(float(coords[1]), 2) - min_lat = round(float(coords[2]), 2) - max_lat = round(float(coords[3]), 2) - polygon = 'POLYGON((%s %s, %s %s, %s %s, %s %s, %s %s))' % (min_lon, max_lat, min_lon, min_lat, max_lon, min_lat, max_lon, max_lat, min_lon, max_lat) - distinct_bounds.append(wkt.loads(polygon).bounds) - - return distinct_bounds - - def find_tiles_by_exact_bounds(self, minx, miny, maxx, maxy, ds, start_time=0, end_time=-1, **kwargs): - - params = { - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - 
{ - "term": { - "tile_min_lon": { - "value": minx - } - } - }, - { - "term": { - "tile_min_lat": { - "value": miny - } - } - }, - { - "term": { - "tile_max_lon": { - "value": maxx - } - } - }, - { - "term": { - "tile_max_lat": { - "value": maxy - } - } - } - ] - } - }} - - if 0 < start_time <= end_time: - params["query"]["bool"]["should"] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, None), **params) - - def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, search_time, **kwargs): - - the_time = datetime.utcfromtimestamp(search_time).strftime(ELASTICSEARCH_FORMAT) - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] - }, - "relation": "intersects" - } - } - }, - { - "range": { - "tile_min_time_dt": { - "lte": the_time - } - } - }, - { - "range": { - "tile_max_time_dt": { - "gte": the_time - } - } - } - ] - } - } - } - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, None), **params) - - def find_all_tiles_in_polygon_at_time(self, bounding_polygon, ds, search_time, **kwargs): - - the_time = datetime.utcfromtimestamp(search_time).strftime(ELASTICSEARCH_FORMAT) - - max_lat = bounding_polygon.bounds[3] - min_lon = bounding_polygon.bounds[0] - min_lat = bounding_polygon.bounds[1] - max_lon = bounding_polygon.bounds[2] - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] - }, - "relation": "intersects" - } - } - }, - { "range": { - "tile_min_time_dt": { - "lte": the_time - } - } }, - { "range": { - "tile_max_time_dt": { - "gte": the_time - } - } } - ] - } - } - } - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, None), **params) - - - def find_all_tiles_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): - - the_time = datetime.utcfromtimestamp(time).strftime(ELASTICSEARCH_FORMAT) - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat],[max_lon, min_lat]] - }, - "relation": "within" - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - }, - { - "range": { - "tile_min_time_dt": { - "lte": the_time - } - } - }, - { - "range": { - "tile_max_time_dt": { - "gte": the_time - } - } - } - ] - } - } - } - - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, "product(tile_avg_val_d, tile_count_i),*", None, False, None), **params) - - def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): - - the_time = datetime.utcfromtimestamp(time).strftime(ELASTICSEARCH_FORMAT) - - params = { - "size": 1000, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "geo_shape": { - "geo": { - "shape": { - "type": "multilinestring", - "coordinates": [[[min_lon, max_lat], [max_lon, max_lat], [min_lon, max_lat], 
[min_lon, min_lat], [max_lon, max_lat], [max_lon, min_lat], [min_lon, min_lat], [max_lon, min_lat]]] - }, - "relation": "intersects" - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - }, - { - "range": { - "tile_min_time_dt": { - "lte": the_time - } - } - }, - { - "range": { - "tile_max_time_dt": { - "gte": the_time - } - } - } - ], - "must_not" : { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] - }, - "relation": "within" - } - } - } - } - } - } - - self._merge_kwargs(params, **kwargs) - - return self.do_query_all(*(None, None, None, False, None), **params) - - def find_all_tiles_by_metadata(self, metadata, ds, start_time=0, end_time=-1, **kwargs): - """ - Get a list of tile metadata that matches the specified metadata, start_time, end_time. - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :param ds: The dataset name to search - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :return: A list of tile metadata - """ - - params = { - "query": { - "bool": { - "must": [ - { - "term": { - "dataset_s": {"value": ds} - } - } - ] - } - } - } - - if len(metadata) > 0: - for key_value in metadata: - key = key_value.split(':')[0] - value = key_value.split(':')[1] - params['query']['bool']['must'].append({"match": {key: value}}) - - if 0 < start_time <= end_time: - params['query']['bool']['should'] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - self._merge_kwargs(params, **kwargs) - return self.do_query_all(*(None, None, None, False, None), **params) - - def get_formatted_time_clause(self, start_time, end_time): - search_start_s = datetime.utcfromtimestamp(start_time).strftime(ELASTICSEARCH_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(ELASTICSEARCH_FORMAT) - - time_clause = [ - { - "range": { - "tile_min_time_dt": { - "lte": search_end_s, - "gte": search_start_s - } - } - }, - { - "range": { - "tile_max_time_dt": { - "lte": search_end_s, - "gte": search_start_s - } - } - }, - { - "bool": { - "must": [ - { - "range": { - "tile_min_time_dt": { - "gte": search_start_s - } - } - }, - { - "range": { - "tile_max_time_dt": { - "lte": search_end_s - } - } - } - ] - } - } - ] - - return time_clause - - def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): - """ - Return number of tiles that match search criteria. 
- :param ds: The dataset name to search - :param bounding_polygon: The polygon to search for tiles - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :return: number of tiles that match search criteria - """ - - params = { - "size": 0, - "query": { - "bool": { - "filter": [ - { - "term": { - "dataset_s": { - "value": ds - } - } - }, - { - "range": { - "tile_count_i": { - "gte": 1 - } - } - } - ] - } - } - } - - if bounding_polygon: - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - geo_clause = { - "geo_shape": { - "geo": { - "shape": { - "type": "envelope", - "coordinates": [[min_lon, max_lat], [max_lon, min_lat]] - } - } - } - } - - params['query']['bool']['filter'].append(geo_clause) - - if 0 < start_time <= end_time: - params['query']['bool']['should'] = self.get_formatted_time_clause(start_time, end_time) - params["query"]["bool"]["minimum_should_match"] = 1 - - if len(metadata) > 0: - for key_value in metadata: - key = key_value.split(':')[0] - value = key_value.split(':')[1] - params['query']['bool']['filter'].append({"term": {key: {"value": value}}}) - - self._merge_kwargs(params, **kwargs) - _, _, found = self.do_query(*(None, None, None, True, None), **params) - - return found - - def do_aggregation(self, *args, **params): - # Gets raw aggregations - - response = self.do_query_raw(*args, **params) - aggregations = response.get('aggregations', None) - return aggregations - - def do_aggregation_all(self, params, agg_name): - # Used for pagination when results can exceed ES max size (use of after_key) - - with ELASTICSEARCH_CON_LOCK: - response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) - all_buckets = [] - - try: - aggregations = response.get('aggregations', None) - current_buckets = aggregations.get(agg_name, None) - buckets = current_buckets.get('buckets', None) - all_buckets += buckets - after_bucket = current_buckets.get('after_key', None) - - while after_bucket is not None: - for agg in params['aggs']: - params['aggs'][agg]['composite']['after'] = {} - for source in params['aggs'][agg]['composite']['sources']: - key_name = next(iter(source)) - params['aggs'][agg]['composite']['after'][key_name] = after_bucket[key_name] - with ELASTICSEARCH_CON_LOCK: - response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) - - aggregations = response.get('aggregations', None) - current_buckets = aggregations.get(agg_name, None) - buckets = current_buckets.get('buckets', None) - all_buckets += buckets - after_bucket = current_buckets.get('after_key', None) - - except AttributeError as e: - self.logger.error('Error when accessing aggregation buckets - ' + str(e)) - - return all_buckets - - def do_query(self, *args, **params): - response = self.do_query_raw(*args, **params) - return response['hits']['hits'], None, response['hits']['total']['value'] - - def do_query_raw(self, *args, **params): - - if args[4]: - - sort_fields = args[4].split(",") - - if 'sort' not in list(params.keys()): - params["sort"] = [] - - for field in sort_fields: - field_order = field.split(' ') - sort_instruction = {field_order[0]: field_order[1]} - if sort_instruction not in params['sort']: - params["sort"].append(sort_instruction) - with ELASTICSEARCH_CON_LOCK: - response = self.elasticsearchcon.search(index=self.elasticsearchIndex, body=params) - - return response - - def 
do_query_all(self, *args, **params): - # Used to paginate with search_after. - # The method calling this might already have a sort clause, - # so we merge both sort clauses inside do_query_raw - - results = [] - - search = None - - # Add track option to not be blocked at 10000 hits per worker - if 'track_total_hits' not in params.keys(): - params['track_total_hits'] = True - - # Add sort instruction order to paginate the results : - params["sort"] = [ - { "tile_min_time_dt": "asc"}, - { "_id": "asc" } - ] - - response = self.do_query_raw(*args, **params) - results.extend([r["_source"] for r in response["hits"]["hits"]]) - - total_hits = response["hits"]["total"]["value"] - - try: - search_after = [] - for sort_param in response["hits"]["hits"][-1]["sort"]: - search_after.append(str(sort_param)) - except (KeyError, IndexError): - search_after = [] - - try: - while len(results) < total_hits: - params["search_after"] = search_after - response = self.do_query_raw(*args, **params) - results.extend([r["_source"] for r in response["hits"]["hits"]]) - - search_after = [] - for sort_param in response["hits"]["hits"][-1]["sort"]: - search_after.append(str(sort_param)) - - except (KeyError, IndexError): - pass - - return results - - def convert_iso_to_datetime(self, date): - return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=UTC) - - def convert_iso_to_timestamp(self, date): - return (self.convert_iso_to_datetime(date) - EPOCH).total_seconds() - - @staticmethod - def _merge_kwargs(params, **kwargs): - # Only Solr-specific kwargs are parsed - # And the special 'limit' - try: - params['limit'] = kwargs['limit'] - except KeyError: - pass - - try: - params['_route_'] = kwargs['_route_'] - except KeyError: - pass - - try: - params['size'] = kwargs['size'] - except KeyError: - pass - - try: - params['start'] = kwargs['start'] - except KeyError: - pass - - try: - s = kwargs['sort'] if isinstance(kwargs['sort'], list) else [kwargs['sort']] - except KeyError: - s = None - - try: - params['sort'].extend(s) - except KeyError: - if s is not None: - params['sort'] = s diff --git a/data-access/nexustiles/dao/S3Proxy.py b/data-access/nexustiles/dao/S3Proxy.py deleted file mode 100644 index c8d3adfe..00000000 --- a/data-access/nexustiles/dao/S3Proxy.py +++ /dev/null @@ -1,141 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
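The do_query_all helper in the removed ElasticsearchProxy above pages through large result sets with search_after: it forces a deterministic sort on tile_min_time_dt and _id, then feeds the last hit's sort values into the next request until every hit has been collected. A minimal, self-contained sketch of that pattern follows; the host, index name, and query here are placeholders, not the proxy's actual configuration.

    # Sketch of search_after pagination (placeholder host/index/query).
    from elasticsearch import Elasticsearch

    es = Elasticsearch(hosts=['http://localhost:9200'])  # placeholder host

    def fetch_all_hits(index, query, page_size=1000):
        body = {
            'query': query,
            'size': page_size,
            'track_total_hits': True,  # keep counting past the 10,000-hit default
            'sort': [{'tile_min_time_dt': 'asc'}, {'_id': 'asc'}],  # deterministic order
        }
        docs = []
        while True:
            response = es.search(index=index, body=body)
            hits = response['hits']['hits']
            if not hits:
                break
            docs.extend(hit['_source'] for hit in hits)
            # Resume the next page after the last hit's sort values.
            body['search_after'] = hits[-1]['sort']
        return docs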
- -import uuid - -import boto3 -import nexusproto.DataTile_pb2 as nexusproto -import numpy as np -from nexusproto.serialization import from_shaped_array - - -class NexusTileData(object): - __nexus_tile = None - __data = None - tile_id = None - - def __init__(self, data, _tile_id): - if self.__data is None: - self.__data = data - if self.tile_id is None: - self.tile_id = _tile_id - - def _get_nexus_tile(self): - if self.__nexus_tile is None: - self.__nexus_tile = nexusproto.TileData.FromString(self.__data) - - return self.__nexus_tile - - def get_raw_data_array(self): - - nexus_tile = self._get_nexus_tile() - the_tile_type = nexus_tile.tile.WhichOneof("tile_type") - - the_tile_data = getattr(nexus_tile.tile, the_tile_type) - - return from_shaped_array(the_tile_data.variable_data) - - def get_lat_lon_time_data_meta(self): - if self._get_nexus_tile().HasField('grid_tile'): - grid_tile = self._get_nexus_tile().grid_tile - - grid_tile_data = np.ma.masked_invalid(from_shaped_array(grid_tile.variable_data)) - latitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.latitude)) - longitude_data = np.ma.masked_invalid(from_shaped_array(grid_tile.longitude)) - - if len(grid_tile_data.shape) == 2: - grid_tile_data = grid_tile_data[np.newaxis, :] - - # Extract the meta data - meta_data = {} - for meta_data_obj in grid_tile.meta_data: - name = meta_data_obj.name - meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - if len(meta_array.shape) == 2: - meta_array = meta_array[np.newaxis, :] - meta_data[name] = meta_array - - return latitude_data, longitude_data, np.array([grid_tile.time]), grid_tile_data, meta_data - elif self._get_nexus_tile().HasField('swath_tile'): - swath_tile = self._get_nexus_tile().swath_tile - - latitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.latitude)).reshape(-1) - longitude_data = np.ma.masked_invalid(from_shaped_array(swath_tile.longitude)).reshape(-1) - time_data = np.ma.masked_invalid(from_shaped_array(swath_tile.time)).reshape(-1) - - # Simplify the tile if the time dimension is the same value repeated - if np.all(time_data == np.min(time_data)): - time_data = np.array([np.min(time_data)]) - - swath_tile_data = np.ma.masked_invalid(from_shaped_array(swath_tile.variable_data)) - - tile_data = self._to_standard_index(swath_tile_data, - (len(time_data), len(latitude_data), len(longitude_data))) - - # Extract the meta data - meta_data = {} - for meta_data_obj in swath_tile.meta_data: - name = meta_data_obj.name - actual_meta_array = np.ma.masked_invalid(from_shaped_array(meta_data_obj.meta_data)) - reshaped_meta_array = self._to_standard_index(actual_meta_array, tile_data.shape) - meta_data[name] = reshaped_meta_array - - return latitude_data, longitude_data, time_data, tile_data, meta_data - else: - raise NotImplementedError("Only supports grid_tile and swath_tile") - - @staticmethod - def _to_standard_index(data_array, desired_shape): - - if desired_shape[0] == 1: - reshaped_array = np.ma.masked_all((desired_shape[1], desired_shape[2])) - row, col = np.indices(data_array.shape) - - reshaped_array[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array[ - row.flat, col.flat] - reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ - row.flat, col.flat] - reshaped_array = reshaped_array[np.newaxis, :] - else: - reshaped_array = np.ma.masked_all(desired_shape) - row, col = np.indices(data_array.shape) - - reshaped_array[np.diag_indices(desired_shape[1], 
len(reshaped_array.shape))] = data_array[ - row.flat, col.flat] - reshaped_array.mask[np.diag_indices(desired_shape[1], len(reshaped_array.shape))] = data_array.mask[ - row.flat, col.flat] - - return reshaped_array - - -class S3Proxy(object): - def __init__(self, config): - self.config = config - self.__s3_bucketname = config.get("s3", "bucket") - self.__s3_region = config.get("s3", "region") - self.__s3 = boto3.resource('s3') - self.__nexus_tile = None - - def fetch_nexus_tiles(self, *tile_ids): - tile_ids = [uuid.UUID(str(tile_id)) for tile_id in tile_ids if - (isinstance(tile_id, str) or isinstance(tile_id, str))] - res = [] - for tile_id in tile_ids: - obj = self.__s3.Object(self.__s3_bucketname, str(tile_id)) - data = obj.get()['Body'].read() - nexus_tile = NexusTileData(data, str(tile_id)) - res.append(nexus_tile) - - return res diff --git a/data-access/nexustiles/dao/SolrProxy.py b/data-access/nexustiles/dao/SolrProxy.py deleted file mode 100644 index 9b16533d..00000000 --- a/data-access/nexustiles/dao/SolrProxy.py +++ /dev/null @@ -1,731 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
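The removed S3Proxy above stores each tile as an S3 object whose key is the tile's UUID; fetch_nexus_tiles reads the object body back and hands the raw bytes to NexusTileData for protobuf decoding. A short sketch of that fetch step with boto3 follows; the bucket name is a placeholder (the real proxy reads it from its configuration) and error handling is omitted.

    # Sketch of fetching one serialized tile from S3 by its UUID key (placeholder bucket).
    import uuid
    import boto3

    s3 = boto3.resource('s3')
    bucket_name = 'my-nexus-tile-bucket'  # placeholder

    def fetch_tile_bytes(tile_id):
        key = str(uuid.UUID(str(tile_id)))  # normalize to the canonical UUID string
        obj = s3.Object(bucket_name, key)
        return obj.get()['Body'].read()     # raw bytes, decoded elsewhere by NexusTileData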
- -import json -import logging -import threading -import time -from datetime import datetime -from pytz import timezone, UTC - -import requests -import pysolr -from shapely import wkt - -SOLR_CON_LOCK = threading.Lock() -thread_local = threading.local() - -EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) -SOLR_FORMAT = '%Y-%m-%dT%H:%M:%SZ' -ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' - - -class SolrProxy(object): - def __init__(self, config): - self.solrUrl = config.get("solr", "host") - self.solrCore = config.get("solr", "core") - solr_kargs = {} - if config.has_option("solr", "time_out"): - solr_kargs["timeout"] = config.get("solr", "time_out") - self.logger = logging.getLogger('nexus') - - with SOLR_CON_LOCK: - solrcon = getattr(thread_local, 'solrcon', None) - if solrcon is None: - solr_url = '%s/solr/%s' % (self.solrUrl, self.solrCore) - self.logger.info("connect to solr, url {} with option(s) = {}".format(solr_url, solr_kargs)) - solrcon = pysolr.Solr(solr_url, **solr_kargs) - thread_local.solrcon = solrcon - - self.solrcon = solrcon - - def find_tile_by_id(self, tile_id): - - search = 'id:%s' % tile_id - - params = { - 'rows': 1 - } - - results, start, found = self.do_query(*(search, None, None, True, None), **params) - - assert len(results) == 1, "Found %s results, expected exactly 1" % len(results) - return [results[0]] - - def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): - - if ds is not None: - search = 'dataset_s:%s' % ds - else: - search = '*:*' - - additionalparams = { - 'fq': [ - "{!terms f=id}%s" % ','.join(tile_ids) - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - results = self.do_query_all(*(search, None, None, False, None), **additionalparams) - - assert len(results) == len(tile_ids), "Found %s results, expected exactly %s" % (len(results), len(tile_ids)) - return results - - def find_min_date_from_tiles(self, tile_ids, ds=None, **kwargs): - - if ds is not None: - search = 'dataset_s:%s' % ds - else: - search = '*:*' - - kwargs['rows'] = 1 - kwargs['fl'] = 'tile_min_time_dt' - kwargs['sort'] = ['tile_min_time_dt asc'] - additionalparams = { - 'fq': [ - "{!terms f=id}%s" % ','.join(tile_ids) if len(tile_ids) > 0 else '' - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) - - return self.convert_iso_to_datetime(results[0]['tile_min_time_dt']) - - def find_max_date_from_tiles(self, tile_ids, ds=None, **kwargs): - - if ds is not None: - search = 'dataset_s:%s' % ds - else: - search = '*:*' - - kwargs['rows'] = 1 - kwargs['fl'] = 'tile_max_time_dt' - kwargs['sort'] = ['tile_max_time_dt desc'] - additionalparams = { - 'fq': [ - "{!terms f=id}%s" % ','.join(tile_ids) if len(tile_ids) > 0 else '' - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) - - return self.convert_iso_to_datetime(results[0]['tile_max_time_dt']) - - def find_min_max_date_from_granule(self, ds, granule_name, **kwargs): - search = 'dataset_s:%s' % ds - - kwargs['rows'] = 1 - kwargs['fl'] = 'tile_min_time_dt' - kwargs['sort'] = ['tile_min_time_dt asc'] - additionalparams = { - 'fq': [ - "granule_s:%s" % granule_name - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - results, start, found = self.do_query(*(search, None, None, False, None), **additionalparams) - start_time = self.convert_iso_to_datetime(results[0]['tile_min_time_dt']) - - kwargs['fl'] = 'tile_max_time_dt' - 
kwargs['sort'] = ['tile_max_time_dt desc'] - additionalparams = { - 'fq': [ - "granule_s:%s" % granule_name - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - results, start, found = self.do_query(*(search, None, None, False, None), **additionalparams) - end_time = self.convert_iso_to_datetime(results[0]['tile_max_time_dt']) - - return start_time, end_time - - def get_data_series_list(self): - - datasets = self.get_data_series_list_simple() - - for dataset in datasets: - min_date = self.find_min_date_from_tiles([], ds=dataset['title']) - max_date = self.find_max_date_from_tiles([], ds=dataset['title']) - dataset['start'] = (min_date - EPOCH).total_seconds() - dataset['end'] = (max_date - EPOCH).total_seconds() - dataset['iso_start'] = min_date.strftime(ISO_8601) - dataset['iso_end'] = max_date.strftime(ISO_8601) - - return datasets - - def get_data_series_list_simple(self): - search = "*:*" - params = { - 'rows': 0, - "facet": "true", - "facet.field": "dataset_s", - "facet.mincount": "1", - "facet.limit": "-1" - } - - - response = self.do_query_raw(*(search, None, None, False, None), **params) - l = [] - for g, v in zip(*[iter(response.facets["facet_fields"]["dataset_s"])]*2): - l.append({ - "shortName": g, - "title": g, - "tileCount": v - }) - l = sorted(l, key=lambda entry: entry["title"]) - return l - - def get_data_series_stats(self, ds): - search = "dataset_s:%s" % ds - params = { - "facet": "true", - "facet.field": ["dataset_s", "tile_max_time_dt"], - "facet.limit": "-1", - "facet.mincount": "1", - "facet.pivot": "{!stats=piv1}dataset_s", - "stats": "on", - "stats.field": ["{!tag=piv1 min=true max=true sum=false}tile_max_time_dt","{!tag=piv1 min=true max=false sum=false}tile_min_val_d","{!tag=piv1 min=false max=true sum=false}tile_max_val_d"] - } - - response = self.do_query_raw(*(search, None, None, False, None), **params) - - stats = {} - - for g in response.facets["facet_pivot"]["dataset_s"]: - if g["value"] == ds: - stats["start"] = self.convert_iso_to_timestamp(g["stats"]["stats_fields"]["tile_max_time_dt"]["min"]) - stats["end"] = self.convert_iso_to_timestamp(g["stats"]["stats_fields"]["tile_max_time_dt"]["max"]) - stats["minValue"] = g["stats"]["stats_fields"]["tile_min_val_d"]["min"] - stats["maxValue"] = g["stats"]["stats_fields"]["tile_max_val_d"]["max"] - - - stats["availableDates"] = [] - for dt in response.facets["facet_fields"]["tile_max_time_dt"][::2]: - stats["availableDates"].append(self.convert_iso_to_timestamp(dt)) - - stats["availableDates"] = sorted(stats["availableDates"]) - - return stats - - def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year): - - search = 'dataset_s:%s' % ds - - params = { - 'fq': [ - "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, - "tile_count_i:[1 TO *]", - "day_of_year_i:[* TO %s]" % day_of_year - ], - 'rows': 1 - } - - results, start, found = self.do_query( - *(search, None, None, True, ('day_of_year_i desc',)), **params) - - return [results[0]] - - def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs): - - search = 'dataset_s:%s' % ds - - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - additionalparams = { - 'fq': [ - "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), - "{!frange l=0 u=0}ms(tile_min_time_dt,tile_max_time_dt)", - "tile_count_i:[1 TO *]", - "tile_min_time_dt:[%s TO %s] " % (search_start_s, 
search_end_s) - ], - 'rows': 0, - 'facet': 'true', - 'facet.field': 'tile_min_time_dt', - 'facet.mincount': '1', - 'facet.limit': '-1' - } - - self._merge_kwargs(additionalparams, **kwargs) - - response = self.do_query_raw(*(search, None, None, False, None), **additionalparams) - - daysinrangeasc = sorted( - [(datetime.strptime(a_date, SOLR_FORMAT) - datetime.utcfromtimestamp(0)).total_seconds() for a_date - in response.facets['facet_fields']['tile_min_time_dt'][::2]]) - - return daysinrangeasc - - def find_all_tiles_in_box_sorttimeasc(self, min_lat, max_lat, min_lon, max_lon, ds, start_time=0, - end_time=-1, **kwargs): - - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), - "tile_count_i:[1 TO *]" - ] - } - - if 0 <= start_time <= end_time: - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - additionalparams['fq'].append(time_clause) - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all( - *(search, None, None, False, 'tile_min_time_dt asc, tile_max_time_dt asc'), - **additionalparams) - - def find_all_tiles_in_polygon_sorttimeasc(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, - "tile_count_i:[1 TO *]" - ] - } - - if 0 <= start_time <= end_time: - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - additionalparams['fq'].append(time_clause) - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all( - *(search, None, None, False, 'tile_min_time_dt asc, tile_max_time_dt asc'), - **additionalparams) - - def find_all_tiles_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, - "tile_count_i:[1 TO *]" - ] - } - - if 0 <= start_time <= end_time: - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - additionalparams['fq'].append(time_clause) - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all( - *(search, None, None, False, None), - **additionalparams) - - def find_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time=0, end_time=-1, **kwargs): - - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "{!field 
f=geo}Intersects(%s)" % bounding_polygon.wkt, - "tile_count_i:[1 TO *]" - ], - 'rows': 0, - 'facet': 'true', - 'facet.field': 'geo_s', - 'facet.limit': -1, - 'facet.mincount': 1 - } - - if 0 <= start_time <= end_time: - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - additionalparams['fq'].append(time_clause) - - self._merge_kwargs(additionalparams, **kwargs) - - response = self.do_query_raw(*(search, None, None, False, None), **additionalparams) - - distinct_bounds = [wkt.loads(key).bounds for key in response.facets["facet_fields"]["geo_s"][::2]] - - return distinct_bounds - - def find_tiles_by_exact_bounds(self, minx, miny, maxx, maxy, ds, start_time=0, end_time=-1, **kwargs): - - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "tile_min_lon:\"%s\"" % minx, - "tile_min_lat:\"%s\"" % miny, - "tile_max_lon:\"%s\"" % maxx, - "tile_max_lat:\"%s\"" % maxy, - "tile_count_i:[1 TO *]" - ] - } - - if 0 <= start_time <= end_time: - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - additionalparams['fq'].append(time_clause) - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all( - *(search, None, None, False, None), - **additionalparams) - - def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, search_time, **kwargs): - search = 'dataset_s:%s' % ds - - the_time = datetime.utcfromtimestamp(search_time).strftime(SOLR_FORMAT) - time_clause = "(" \ - "tile_min_time_dt:[* TO %s] " \ - "AND tile_max_time_dt:[%s TO *] " \ - ")" % ( - the_time, the_time - ) - - additionalparams = { - 'fq': [ - "geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon), - "tile_count_i:[1 TO *]", - time_clause - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all(*(search, None, None, False, None), **additionalparams) - - def find_all_tiles_in_polygon_at_time(self, bounding_polygon, ds, search_time, **kwargs): - search = 'dataset_s:%s' % ds - - the_time = datetime.utcfromtimestamp(search_time).strftime(SOLR_FORMAT) - time_clause = "(" \ - "tile_min_time_dt:[* TO %s] " \ - "AND tile_max_time_dt:[%s TO *] " \ - ")" % ( - the_time, the_time - ) - - additionalparams = { - 'fq': [ - "{!field f=geo}Intersects(%s)" % bounding_polygon.wkt, - "tile_count_i:[1 TO *]", - time_clause - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all(*(search, None, None, False, None), **additionalparams) - - def find_all_tiles_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): - search = 'dataset_s:%s' % ds - - the_time = datetime.utcfromtimestamp(time).strftime(SOLR_FORMAT) - time_clause = "(" \ - "tile_min_time_dt:[* TO %s] " \ - "AND tile_max_time_dt:[%s TO *] " \ - ")" % ( - the_time, the_time - ) - - additionalparams = { - 'fq': [ 
- "geo:\"Within(ENVELOPE(%s,%s,%s,%s))\"" % (min_lon, max_lon, max_lat, min_lat), - "tile_count_i:[1 TO *]", - time_clause - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all(*(search, "product(tile_avg_val_d, tile_count_i),*", None, False, None), - **additionalparams) - - def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, ds, time, **kwargs): - search = 'dataset_s:%s' % ds - - the_time = datetime.utcfromtimestamp(time).strftime(SOLR_FORMAT) - time_clause = "(" \ - "tile_min_time_dt:[* TO %s] " \ - "AND tile_max_time_dt:[%s TO *] " \ - ")" % ( - the_time, the_time - ) - - additionalparams = { - 'fq': [ - "geo:\"Intersects(MultiLineString((%s %s, %s %s),(%s %s, %s %s),(%s %s, %s %s),(%s %s, %s %s)))\"" % ( - min_lon, max_lat, max_lon, max_lat, min_lon, max_lat, min_lon, min_lat, max_lon, max_lat, max_lon, - min_lat, min_lon, min_lat, max_lon, min_lat), - "-geo:\"Within(ENVELOPE(%s,%s,%s,%s))\"" % (min_lon, max_lon, max_lat, min_lat), - "tile_count_i:[1 TO *]", - time_clause - ] - } - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all(*(search, None, None, False, None), **additionalparams) - - def find_all_tiles_by_metadata(self, metadata, ds, start_time=0, end_time=-1, **kwargs): - """ - Get a list of tile metadata that matches the specified metadata, start_time, end_time. - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :param ds: The dataset name to search - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :return: A list of tile metadata - """ - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': metadata - } - - if 0 <= start_time <= end_time: - additionalparams['fq'].append(self.get_formatted_time_clause(start_time, end_time)) - - self._merge_kwargs(additionalparams, **kwargs) - - return self.do_query_all( - *(search, None, None, False, None), - **additionalparams) - - def get_formatted_time_clause(self, start_time, end_time): - search_start_s = datetime.utcfromtimestamp(start_time).strftime(SOLR_FORMAT) - search_end_s = datetime.utcfromtimestamp(end_time).strftime(SOLR_FORMAT) - - time_clause = "(" \ - "tile_min_time_dt:[%s TO %s] " \ - "OR tile_max_time_dt:[%s TO %s] " \ - "OR (tile_min_time_dt:[* TO %s] AND tile_max_time_dt:[%s TO *])" \ - ")" % ( - search_start_s, search_end_s, - search_start_s, search_end_s, - search_start_s, search_end_s - ) - return time_clause - - def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): - """ - Return number of tiles that match search criteria. 
- :param ds: The dataset name to search - :param bounding_polygon: The polygon to search for tiles - :param start_time: The start time to search for tiles - :param end_time: The end time to search for tiles - :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] - :return: number of tiles that match search criteria - """ - search = 'dataset_s:%s' % ds - - additionalparams = { - 'fq': [ - "tile_count_i:[1 TO *]" - ], - 'rows': 0 - } - - if bounding_polygon: - min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds - additionalparams['fq'].append("geo:[%s,%s TO %s,%s]" % (min_lat, min_lon, max_lat, max_lon)) - - if 0 <= start_time <= end_time: - additionalparams['fq'].append(self.get_formatted_time_clause(start_time, end_time)) - - if metadata: - additionalparams['fq'].extend(metadata) - - self._merge_kwargs(additionalparams, **kwargs) - - results, start, found = self.do_query(*(search, None, None, True, None), **additionalparams) - - return found - - def do_query(self, *args, **params): - - response = self.do_query_raw(*args, **params) - - return response.docs, response.raw_response['response']['start'], response.hits - - def do_query_raw(self, *args, **params): - - if 'fl' not in list(params.keys()) and args[1]: - params['fl'] = args[1] - - if 'sort' not in list(params.keys()) and args[4]: - params['sort'] = args[4] - - # If dataset_s is specified as the search term, - # add the _route_ parameter to limit the search to the correct shard - if 'dataset_s:' in args[0]: - ds = args[0].split(':')[-1] - params['shard_keys'] = ds + '!' - - with SOLR_CON_LOCK: - response = self.solrcon.search(args[0], **params) - - return response - - - def do_query_all(self, *args, **params): - - results = [] - - response = self.do_query_raw(*args, **params) - results.extend(response.docs) - - limit = min(params.get('limit', float('inf')), response.hits) - - while len(results) < limit: - params['start'] = len(results) - response = self.do_query_raw(*args, **params) - results.extend(response.docs) - - assert len(results) == limit - - return results - - def convert_iso_to_datetime(self, date): - return datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=UTC) - - def convert_iso_to_timestamp(self, date): - return (self.convert_iso_to_datetime(date) - EPOCH).total_seconds() - - def ping(self): - solrAdminPing = '%s/solr/%s/admin/ping' % (self.solrUrl, self.solrCore) - try: - r = requests.get(solrAdminPing, params={'wt': 'json'}) - results = json.loads(r.text) - return results - except: - return None - - @staticmethod - def _merge_kwargs(additionalparams, **kwargs): - # Only Solr-specific kwargs are parsed - # And the special 'limit' - try: - additionalparams['limit'] = kwargs['limit'] - except KeyError: - pass - - try: - additionalparams['_route_'] = kwargs['_route_'] - except KeyError: - pass - - try: - additionalparams['rows'] = kwargs['rows'] - except KeyError: - pass - - try: - additionalparams['start'] = kwargs['start'] - except KeyError: - pass - - try: - kwfq = kwargs['fq'] if isinstance(kwargs['fq'], list) else list(kwargs['fq']) - except KeyError: - kwfq = [] - - try: - additionalparams['fq'].extend(kwfq) - except KeyError: - additionalparams['fq'] = kwfq - - try: - kwfl = kwargs['fl'] if isinstance(kwargs['fl'], list) else [kwargs['fl']] - except KeyError: - kwfl = [] - - try: - additionalparams['fl'].extend(kwfl) - except KeyError: - additionalparams['fl'] = kwfl - - try: - s = kwargs['sort'] if isinstance(kwargs['sort'], list) else 
[kwargs['sort']] - except KeyError: - s = None - - try: - additionalparams['sort'].extend(s) - except KeyError: - if s is not None: - additionalparams['sort'] = s diff --git a/data-access/nexustiles/dao/__init__.py b/data-access/nexustiles/dao/__init__.py deleted file mode 100644 index 6acb5d12..00000000 --- a/data-access/nexustiles/dao/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/data-access/nexustiles/exception.py b/data-access/nexustiles/exception.py new file mode 100644 index 00000000..33ab5296 --- /dev/null +++ b/data-access/nexustiles/exception.py @@ -0,0 +1,2 @@ +class NexusTileServiceException(Exception): + pass diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index fde0a5f3..d09c3aa6 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -14,34 +14,27 @@ # limitations under the License. import configparser +import json import logging import sys -import json +import threading from datetime import datetime -from functools import wraps, reduce, partial +from functools import reduce, wraps +from time import sleep +from typing import Dict, Union import numpy as np import numpy.ma as ma import pkg_resources -from pytz import timezone, UTC -from shapely.geometry import MultiPolygon, box import pysolr +from pytz import timezone, UTC +from shapely.geometry import box +from webservice.webmodel import DatasetNotFoundException, NexusProcessingException -import threading -from time import sleep - +from .AbstractTileService import AbstractTileService from .backends.nexusproto.backend import NexusprotoTileService from .backends.zarr.backend import ZarrBackend - - -from abc import ABC, abstractmethod - -from .AbstractTileService import AbstractTileService - from .model.nexusmodel import Tile, BBox, TileStats, TileVariable -from typing import Dict, Union - -from webservice.webmodel import DatasetNotFoundException, NexusProcessingException EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) @@ -49,7 +42,7 @@ level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) -logger = logging.getLogger("testing") +logger = logging.getLogger("nexus-tile-svc") def tile_data(default_fetch=True): @@ -83,19 +76,25 @@ def fetch_data_for_func(*args, **kwargs): return tile_data_decorator -class NexusTileServiceException(Exception): - pass - - SOLR_LOCK = threading.Lock() DS_LOCK = threading.Lock() thread_local = threading.local() - -class NexusTileService(AbstractTileService): +class NexusTileService: backends: Dict[Union[None, str], Dict[str, Union[AbstractTileService, bool]]] = {} + ds_config = None + + __update_thread = None + + @staticmethod + def __update_datasets(): + 
while True: + with DS_LOCK: + NexusTileService._update_datasets() + sleep(3600) + def __init__(self, config=None): self._config = configparser.RawConfigParser() self._config.read(NexusTileService._get_config_files('config/datasets.ini')) @@ -105,43 +104,54 @@ def __init__(self, config=None): if config: self.override_config(config) - NexusTileService.backends[None] = {"backend": NexusprotoTileService(False, False, config), 'up': True} - NexusTileService.backends['__nexusproto__'] = NexusTileService.backends[None] + if not NexusTileService.backends: + NexusTileService.ds_config = configparser.RawConfigParser() + NexusTileService.ds_config.read(NexusTileService._get_config_files('config/datasets.ini')) - def __update_datasets(): - while True: - with DS_LOCK: - self._update_datasets() - sleep(3600) + default_backend = {"backend": NexusprotoTileService(False, False, config), 'up': True} + + NexusTileService.backends[None] = default_backend + NexusTileService.backends['__nexusproto__'] = default_backend - threading.Thread(target=__update_datasets, name='dataset_update', daemon=False).start() + if not NexusTileService.__update_thread: + NexusTileService.__update_thread = threading.Thread( + target=NexusTileService.__update_datasets, + name='dataset_update', + daemon=False + ) + logger.info('Starting dataset refresh thread') + NexusTileService.__update_thread.start() @staticmethod def __get_backend(dataset_s) -> AbstractTileService: - if dataset_s not in NexusTileService.backends: - raise DatasetNotFoundException(reason=f'Dataset {dataset_s} is not currently loaded/ingested') + with DS_LOCK: + if dataset_s not in NexusTileService.backends: + raise DatasetNotFoundException(reason=f'Dataset {dataset_s} is not currently loaded/ingested') - b = NexusTileService.backends[dataset_s] + b = NexusTileService.backends[dataset_s] - if not b['up']: - success = b['backend'].try_connect() + if not b['up']: + success = b['backend'].try_connect() - if not success: - raise NexusProcessingException(reason=f'Dataset {dataset_s} is currently unavailable') - else: - NexusTileService.backends[dataset_s]['up'] = True + if not success: + raise NexusProcessingException(reason=f'Dataset {dataset_s} is currently unavailable') + else: + NexusTileService.backends[dataset_s]['up'] = True - return b['backend'] + return b['backend'] - def _update_datasets(self): - solr_url = self._config.get("solr", "host") - solr_core = self._config.get("solr", "core") + @staticmethod + def _update_datasets(): + solr_url = NexusTileService.ds_config.get("solr", "host") + solr_core = NexusTileService.ds_config.get("solr", "core") solr_kwargs = {} - if self._config.has_option("solr", "time_out"): - solr_kwargs["timeout"] = self._config.get("solr", "time_out") + update_logger = logging.getLogger("nexus-tile-svc.backends") + + if NexusTileService.ds_config.has_option("solr", "time_out"): + solr_kwargs["timeout"] = NexusTileService.ds_config.get("solr", "time_out") with SOLR_LOCK: solrcon = getattr(thread_local, 'solrcon', None) @@ -152,33 +162,53 @@ def _update_datasets(self): solrcon = solrcon - response = solrcon.search('*:*') + update_logger.info('Executing update query to check for new datasets') - present_datasets = set() + present_datasets = {None, '__nexusproto__'} + next_cursor_mark = '*' - for dataset in response.docs: - d_id = dataset['dataset_s'] - store_type = dataset.get('store_type_s', 'nexusproto') + while True: + response = solrcon.search('*:*', cursorMark=next_cursor_mark, sort='id asc') - present_datasets.add(d_id) + try: + 
response_cursor_mark = response.nextCursorMark + except AttributeError: + break - if d_id in NexusTileService.backends: - continue - # is_up = NexusTileService.backends[d_id]['backend'].try_connect() + if response_cursor_mark == next_cursor_mark: + break + else: + next_cursor_mark = response_cursor_mark - if store_type == 'nexus_proto' or store_type == 'nexusproto': - NexusTileService.backends[d_id] = NexusTileService.backends[None] - elif store_type == 'zarr': - ds_config = json.loads(dataset['config'][0]) - NexusTileService.backends[d_id] = { - 'backend': ZarrBackend(ds_config), - 'up': True - } - else: - logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') + for dataset in response.docs: + d_id = dataset['dataset_s'] + store_type = dataset.get('store_type_s', 'nexusproto') + + present_datasets.add(d_id) + + if d_id in NexusTileService.backends: + continue + # is_up = NexusTileService.backends[d_id]['backend'].try_connect() + + if store_type == 'nexus_proto' or store_type == 'nexusproto': + update_logger.info(f"Detected new nexusproto dataset {d_id}, using default nexusproto backend") + NexusTileService.backends[d_id] = NexusTileService.backends[None] + elif store_type == 'zarr': + update_logger.info(f"Detected new zarr dataset {d_id}, opening new zarr backend") + + ds_config = json.loads(dataset['config'][0]) + NexusTileService.backends[d_id] = { + 'backend': ZarrBackend(ds_config), + 'up': True + } + else: + logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') removed_datasets = set(NexusTileService.backends.keys()).difference(present_datasets) + if len(removed_datasets) > 0: + logger.info(f'{len(removed_datasets)} marked for removal') + for dataset in removed_datasets: logger.info(f"Removing dataset {dataset}") del NexusTileService.backends[dataset] @@ -336,23 +366,17 @@ def get_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset return tiles def get_stats_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - tiles = self._metadatastore.find_all_tiles_within_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, - **kwargs) - - return tiles + return NexusTileService.get_stats_within_box_at_time( + min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs + ) - def get_bounding_box(self, tile_ids): + def get_bounding_box(self, tile_ids, ds=None): """ Retrieve a bounding box that encompasses all of the tiles represented by the given tile ids. :param tile_ids: List of tile ids :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles """ - tiles = self.find_tiles_by_id(tile_ids, fl=['tile_min_lat', 'tile_max_lat', 'tile_min_lon', 'tile_max_lon'], - fetch_data=False, rows=len(tile_ids)) - polys = [] - for tile in tiles: - polys.append(box(tile.bbox.min_lon, tile.bbox.min_lat, tile.bbox.max_lon, tile.bbox.max_lat)) - return box(*MultiPolygon(polys).bounds) + return NexusTileService.__get_backend(ds).get_bounding_box(tile_ids, ds) def get_min_time(self, tile_ids, ds=None): """ @@ -361,8 +385,7 @@ def get_min_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. 
Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - min_time = self._metadatastore.find_min_date_from_tiles(tile_ids, ds=ds) - return int((min_time - EPOCH).total_seconds()) + return NexusTileService.__get_backend(ds).get_min_time(tile_ids, ds) def get_max_time(self, tile_ids, ds=None): """ @@ -371,8 +394,7 @@ def get_max_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - max_time = self._metadatastore.find_max_date_from_tiles(tile_ids, ds=ds) - return int((max_time - EPOCH).total_seconds()) + return int(NexusTileService.__get_backend(ds).get_max_time(tile_ids)) def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): """ @@ -398,33 +420,95 @@ def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, m """ return self._metadatastore.get_tile_count(ds, bounding_polygon, start_time, end_time, metadata, **kwargs) - def fetch_data_for_tiles(self, *tiles): + def mask_tiles_to_bbox(self, min_lat, max_lat, min_lon, max_lon, tiles): + for tile in tiles: + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. + if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) - nexus_tile_ids = set([tile.tile_id for tile in tiles]) - matched_tile_data = self._datastore.fetch_nexus_tiles(*nexus_tile_ids) + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] - tile_data_by_id = {str(a_tile_data.tile_id): a_tile_data for a_tile_data in matched_tile_data} + return tiles - missing_data = nexus_tile_ids.difference(list(tile_data_by_id.keys())) - if len(missing_data) > 0: - raise Exception("Missing data for tile_id(s) %s." 
% missing_data) + def mask_tiles_to_bbox_and_time(self, min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles): + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + tile.latitudes = ma.masked_outside(tile.latitudes, min_lat, max_lat) + tile.longitudes = ma.masked_outside(tile.longitudes, min_lon, max_lon) - for a_tile in tiles: - lats, lons, times, data, meta, is_multi_var = tile_data_by_id[a_tile.tile_id].get_lat_lon_time_data_meta() + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] - a_tile.latitudes = lats - a_tile.longitudes = lons - a_tile.times = times - a_tile.data = data - a_tile.meta_data = meta - a_tile.is_multi = is_multi_var + tile.data = ma.masked_where(data_mask, tile.data) - del (tile_data_by_id[a_tile.tile_id]) + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] return tiles - def _metadata_store_docs_to_tiles(self, *store_docs): + def mask_tiles_to_polygon(self, bounding_polygon, tiles): + + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox(min_lat, max_lat, min_lon, max_lon, tiles) + def mask_tiles_to_polygon_and_time(self, bounding_polygon, start_time, end_time, tiles): + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.mask_tiles_to_bbox_and_time(min_lat, max_lat, min_lon, max_lon, start_time, end_time, tiles) + + def mask_tiles_to_time_range(self, start_time, end_time, tiles): + """ + Masks data in tiles to specified time range. + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param tiles: List of tiles + :return: A list tiles with data masked to specified time range + """ + if 0 <= start_time <= end_time: + for tile in tiles: + tile.times = ma.masked_outside(tile.times, start_time, end_time) + + # Or together the masks of the individual arrays to create the new mask + data_mask = ma.getmaskarray(tile.times)[:, np.newaxis, np.newaxis] \ + | ma.getmaskarray(tile.latitudes)[np.newaxis, :, np.newaxis] \ + | ma.getmaskarray(tile.longitudes)[np.newaxis, np.newaxis, :] + + # If this is multi-var, need to mask each variable separately. 
+ if tile.is_multi: + # Combine space/time mask with existing mask on data + data_mask = reduce(np.logical_or, [tile.data[0].mask, data_mask]) + + num_vars = len(tile.data) + multi_data_mask = np.repeat(data_mask[np.newaxis, ...], num_vars, axis=0) + tile.data = ma.masked_where(multi_data_mask, tile.data) + else: + tile.data = ma.masked_where(data_mask, tile.data) + + tiles[:] = [tile for tile in tiles if not tile.data.mask.all()] + + return tiles + + def fetch_data_for_tiles(self, *tiles, dataset=None): + return NexusTileService.__get_backend(dataset).fetch_data_for_tiles(*tiles) + + def _metadata_store_docs_to_tiles(self, *store_docs): tiles = [] for store_doc in store_docs: tile = Tile() @@ -521,7 +605,6 @@ def _metadata_store_docs_to_tiles(self, *store_docs): except KeyError: pass - if 'tile_var_name_ss' in store_doc: tile.variables = [] for var_name in store_doc['tile_var_name_ss']: @@ -536,13 +619,6 @@ def _metadata_store_docs_to_tiles(self, *store_docs): return tiles - def pingSolr(self): - status = self._metadatastore.ping() - if status and status["status"] == "OK": - return True - else: - return False - @staticmethod def _get_config_files(filename): log = logging.getLogger(__name__) diff --git a/data-access/setup.py b/data-access/setup.py index ab0248f0..e539e1e0 100644 --- a/data-access/setup.py +++ b/data-access/setup.py @@ -12,11 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import setuptools from setuptools import setup -with open('../VERSION.txt', 'r') as f: - __version__ = f.read() +try: + with open('../VERSION.txt', 'r') as f: + __version__ = f.read() +except: + __version__ = None with open('requirements.txt') as f: @@ -32,8 +35,13 @@ description="NEXUS API.", long_description=open('README.md').read(), - packages=['nexustiles', 'nexustiles.model', 'nexustiles.dao'], - package_data={'nexustiles': ['config/datastores.ini.default', 'config/datastores.ini']}, + packages=setuptools.find_packages(), # ['nexustiles', 'nexustiles.model', 'nexustiles.dao'], + package_data={ + 'nexustiles': + ['config/datasets.ini.default', 'config/datasets.ini'], + 'nexustiles.backends.nexusproto': + ['config/datastores.ini.default', 'config/datastores.ini'] + }, platforms='any', python_requires='~=3.8', install_requires=pip_requirements, From b77aa11f65a8eb486509b4e0e7e5c5a149fdbcc0 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 10 Jul 2023 16:33:52 -0700 Subject: [PATCH 05/70] Working(?) 
np backend --- .../nexustiles/backends/zarr/backend.py | 150 +++++++++++++++++- data-access/nexustiles/nexustiles.py | 39 ++++- data-access/requirements.txt | 2 + 3 files changed, 182 insertions(+), 9 deletions(-) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 93963166..13622453 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -40,6 +40,154 @@ class ZarrBackend(AbstractTileService): - def __init__(self, config): + def __init__(self, path, config): AbstractTileService.__init__(self) self.__config = config + + def get_dataseries_list(self, simple=False): + raise NotImplementedError() + + def find_tile_by_id(self, tile_id, **kwargs): + raise NotImplementedError() + + def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + raise NotImplementedError() + + def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, + metrics_callback=None, **kwargs): + raise NotImplementedError() + + def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): + """ + Given a bounding polygon, dataset, and day of year, find tiles in that dataset with the same bounding + polygon and the closest day of year. + + For example: + given a polygon minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; and day of year=32 + search for first tile in MY_DS with identical bbox and day_of_year <= 32 (sorted by day_of_year desc) + + Valid matches: + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 + + Invalid matches: + minx=1, miny=0, maxx=2, maxy=1; dataset=MY_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_OTHER_DS; day of year = 32 + minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 30 if minx=0, miny=0, maxx=1, maxy=1; dataset=MY_DS; day of year = 32 also exists + + :param bounding_polygon: The exact bounding polygon of tiles to search for + :param ds: The dataset name being searched + :param day_of_year: Tile day of year to search for, tile nearest to this day (without going over) will be returned + :return: List of one tile from ds with bounding_polygon on or before day_of_year or raise NexusTileServiceException if no tile found + """ + raise NotImplementedError() + + def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): + raise NotImplementedError() + + def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall in the given box in the Solr index + raise NotImplementedError() + + def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): + # Find tiles that fall within the polygon in the Solr index + raise NotImplementedError() + + def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): + """ + Return list of tiles whose metadata matches the specified metadata, start_time, end_time. 
+ :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of tiles + """ + raise NotImplementedError() + + def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs): + """ + The method will return tiles with the exact given bounds within the time range. It differs from + find_tiles_in_polygon in that only tiles with exactly the given bounds will be returned as opposed to + doing a polygon intersection with the given bounds. + + :param bounds: (minx, miny, maxx, maxy) bounds to search for + :param ds: Dataset name to search + :param start_time: Start time to search (seconds since epoch) + :param end_time: End time to search (seconds since epoch) + :param kwargs: fetch_data: True/False = whether or not to retrieve tile data + :return: + """ + raise NotImplementedError() + + def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + def get_min_max_time_by_granule(self, ds, granule_name): + raise NotImplementedError() + + def get_dataset_overall_stats(self, ds): + raise NotImplementedError() + + def get_stats_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): + raise NotImplementedError() + + def get_bounding_box(self, tile_ids): + """ + Retrieve a bounding box that encompasses all of the tiles represented by the given tile ids. + :param tile_ids: List of tile ids + :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles + """ + raise NotImplementedError() + + def get_min_time(self, tile_ids, ds=None): + """ + Get the minimum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + raise NotImplementedError() + + def get_max_time(self, tile_ids, ds=None): + """ + Get the maximum tile date from the list of tile ids + :param tile_ids: List of tile ids + :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) + :return: long time in seconds since epoch + """ + raise NotImplementedError() + + def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): + """ + Get a list of distinct tile bounding boxes from all tiles within the given polygon and time range. + :param bounding_polygon: The bounding polygon of tiles to search for + :param ds: The dataset name to search + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :return: A list of distinct bounding boxes (as shapely polygons) for tiles in the search polygon + """ + raise NotImplementedError() + + def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, metadata=None, **kwargs): + """ + Return number of tiles that match search criteria. 
+ :param ds: The dataset name to search + :param bounding_polygon: The polygon to search for tiles + :param start_time: The start time to search for tiles + :param end_time: The end time to search for tiles + :param metadata: List of metadata values to search for tiles e.g ["river_id_i:1", "granule_s:granule_name"] + :return: number of tiles that match search criteria + """ + raise NotImplementedError() + + def fetch_data_for_tiles(self, *tiles): + raise NotImplementedError() + + def _metadata_store_docs_to_tiles(self, *store_docs): + raise NotImplementedError() + + + diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index d09c3aa6..405b5b70 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -30,6 +30,7 @@ from pytz import timezone, UTC from shapely.geometry import box from webservice.webmodel import DatasetNotFoundException, NexusProcessingException +from webservice.NexusHandler import nexus_initializer from .AbstractTileService import AbstractTileService from .backends.nexusproto.backend import NexusprotoTileService @@ -81,6 +82,16 @@ def fetch_data_for_func(*args, **kwargs): thread_local = threading.local() +@nexus_initializer +class NTSInitializer: + def __init__(self): + self._log = logger.getChild('init') + + def init(self, config): + self._log.info('*** RUNNING NTS INITIALIZATION ***') + NexusTileService(config) + + class NexusTileService: backends: Dict[Union[None, str], Dict[str, Union[AbstractTileService, bool]]] = {} @@ -89,7 +100,7 @@ class NexusTileService: __update_thread = None @staticmethod - def __update_datasets(): + def __update_datasets_loop(): while True: with DS_LOCK: NexusTileService._update_datasets() @@ -115,7 +126,7 @@ def __init__(self, config=None): if not NexusTileService.__update_thread: NexusTileService.__update_thread = threading.Thread( - target=NexusTileService.__update_datasets, + target=NexusTileService.__update_datasets_loop, name='dataset_update', daemon=False ) @@ -128,7 +139,11 @@ def __init__(self, config=None): def __get_backend(dataset_s) -> AbstractTileService: with DS_LOCK: if dataset_s not in NexusTileService.backends: - raise DatasetNotFoundException(reason=f'Dataset {dataset_s} is not currently loaded/ingested') + logger.warning(f'Dataset {dataset_s} not currently loaded. 
Checking to see if it was recently' + f'added') + NexusTileService._update_datasets() + if dataset_s not in NexusTileService.backends: + raise DatasetNotFoundException(reason=f'Dataset {dataset_s} is not currently loaded/ingested') b = NexusTileService.backends[dataset_s] @@ -162,11 +177,13 @@ def _update_datasets(): solrcon = solrcon - update_logger.info('Executing update query to check for new datasets') + update_logger.info('Executing Solr query to check for new datasets') present_datasets = {None, '__nexusproto__'} next_cursor_mark = '*' + added_datasets = 0 + while True: response = solrcon.search('*:*', cursorMark=next_cursor_mark, sort='id asc') @@ -190,6 +207,8 @@ def _update_datasets(): continue # is_up = NexusTileService.backends[d_id]['backend'].try_connect() + added_datasets += 1 + if store_type == 'nexus_proto' or store_type == 'nexusproto': update_logger.info(f"Detected new nexusproto dataset {d_id}, using default nexusproto backend") NexusTileService.backends[d_id] = NexusTileService.backends[None] @@ -198,21 +217,25 @@ def _update_datasets(): ds_config = json.loads(dataset['config'][0]) NexusTileService.backends[d_id] = { - 'backend': ZarrBackend(ds_config), + 'backend': ZarrBackend(**ds_config), 'up': True } else: - logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') + update_logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') + added_datasets -= 1 removed_datasets = set(NexusTileService.backends.keys()).difference(present_datasets) if len(removed_datasets) > 0: - logger.info(f'{len(removed_datasets)} marked for removal') + update_logger.info(f'{len(removed_datasets)} old datasets marked for removal') for dataset in removed_datasets: - logger.info(f"Removing dataset {dataset}") + update_logger.info(f"Removing dataset {dataset}") del NexusTileService.backends[dataset] + update_logger.info(f'Finished dataset update: {added_datasets} added, {len(removed_datasets)} removed, ' + f'{len(NexusTileService.backends) - 2} total') + def override_config(self, config): for section in config.sections(): if self._config.has_section(section): # only override preexisting section, ignores the other diff --git a/data-access/requirements.txt b/data-access/requirements.txt index 51270182..7d33cced 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -20,3 +20,5 @@ urllib3==1.26.2 requests nexusproto Shapely +s3fs +fsspec \ No newline at end of file From 4ccec2e5bc4fae53feca1426127fe801236ff067 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 10 Jul 2023 16:41:13 -0700 Subject: [PATCH 06/70] gitignore ini --- .gitignore | 3 +- .../nexustiles/config/datastores.ini.default | 39 ------------------- 2 files changed, 2 insertions(+), 40 deletions(-) delete mode 100644 data-access/nexustiles/config/datastores.ini.default diff --git a/.gitignore b/.gitignore index 12ab2d61..23f84355 100644 --- a/.gitignore +++ b/.gitignore @@ -4,5 +4,6 @@ *.idea *.DS_Store analysis/webservice/algorithms/doms/domsconfig.ini -data-access/nexustiles/config/datastores.ini +data-access/nexustiles/backends/nexusproto/config/datastores.ini +data-access/nexustiles/config/datasets.ini venv/ diff --git a/data-access/nexustiles/config/datastores.ini.default b/data-access/nexustiles/config/datastores.ini.default deleted file mode 100644 index d8db1902..00000000 --- a/data-access/nexustiles/config/datastores.ini.default +++ /dev/null @@ -1,39 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. 
See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -[cassandra] -host=localhost -port=9042 -keyspace=nexustiles -local_datacenter=datacenter1 -protocol_version=3 -dc_policy=DCAwareRoundRobinPolicy -username= -password= - -[s3] -bucket=nexus-jpl -region=us-west-2 - -[dynamo] -table=nexus-jpl-table -region=us-west-2 - -[solr] -host=http://localhost:8983 -core=nexustiles - -[datastore] -store=cassandra From 736a44e8740f601eddda87e4bc23eb92c270a32a Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 10 Jul 2023 16:43:47 -0700 Subject: [PATCH 07/70] ASF headers --- data-access/nexustiles/backends/__init__.py | 15 +++++++++++++++ .../nexustiles/backends/nexusproto/__init__.py | 15 +++++++++++++++ data-access/nexustiles/backends/zarr/__init__.py | 15 +++++++++++++++ data-access/nexustiles/exception.py | 15 +++++++++++++++ 4 files changed, 60 insertions(+) diff --git a/data-access/nexustiles/backends/__init__.py b/data-access/nexustiles/backends/__init__.py index e69de29b..8afd240a 100644 --- a/data-access/nexustiles/backends/__init__.py +++ b/data-access/nexustiles/backends/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/data-access/nexustiles/backends/nexusproto/__init__.py b/data-access/nexustiles/backends/nexusproto/__init__.py index e69de29b..8afd240a 100644 --- a/data-access/nexustiles/backends/nexusproto/__init__.py +++ b/data-access/nexustiles/backends/nexusproto/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/data-access/nexustiles/backends/zarr/__init__.py b/data-access/nexustiles/backends/zarr/__init__.py index e69de29b..8afd240a 100644 --- a/data-access/nexustiles/backends/zarr/__init__.py +++ b/data-access/nexustiles/backends/zarr/__init__.py @@ -0,0 +1,15 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/data-access/nexustiles/exception.py b/data-access/nexustiles/exception.py index 33ab5296..77850a2f 100644 --- a/data-access/nexustiles/exception.py +++ b/data-access/nexustiles/exception.py @@ -1,2 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ class NexusTileServiceException(Exception): pass From 70bdab12f4dfd80a59f572376e496ddff8145d37 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 11 Jul 2023 15:17:50 -0700 Subject: [PATCH 08/70] First functioning test of 2 simultaneous backends --- data-access/nexustiles/AbstractTileService.py | 3 + .../nexustiles/backends/nexusproto/backend.py | 8 +- .../nexustiles/backends/zarr/backend.py | 195 +++++++++++++++++- data-access/nexustiles/nexustiles.py | 73 ++++--- data-access/requirements.txt | 3 +- 5 files changed, 238 insertions(+), 44 deletions(-) diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py index 6e5b4640..20467784 100644 --- a/data-access/nexustiles/AbstractTileService.py +++ b/data-access/nexustiles/AbstractTileService.py @@ -30,6 +30,9 @@ class AbstractTileService(ABC): # def try_connect(self) -> bool: # raise NotImplementedError() + def __init__(self, dataset_name): + self._name = dataset_name + @abstractmethod def get_dataseries_list(self, simple=False): raise NotImplementedError() diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py index 6aa63644..8cca5813 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -38,16 +38,12 @@ EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) -logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) -logger = logging.getLogger("testing") +logger = logging.getLogger(__name__) class NexusprotoTileService(AbstractTileService): def __init__(self, skipDatastore=False, skipMetadatastore=False, config=None): - AbstractTileService.__init__(self) + AbstractTileService.__init__(self, None) self._datastore = None self._metadatastore = None diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 13622453..fe5a49dd 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -30,20 +30,72 @@ from nexustiles.exception import NexusTileServiceException from nexustiles.AbstractTileService import AbstractTileService +from yarl import URL + +import xarray as xr +import s3fs +from urllib.parse import urlparse + EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) -logger = logging.getLogger("testing") +logger = logging.getLogger() class ZarrBackend(AbstractTileService): - def __init__(self, path, config): - AbstractTileService.__init__(self) - self.__config = config - + def __init__(self, dataset_name, path, config=None): + AbstractTileService.__init__(self, dataset_name) + self.__config = config if config is not None else {} + + logger.info(f'Opening zarr backend at {path} for dataset {self._name}') + + url = urlparse(path) + + self.__url = path + + self.__store_type = url.scheme + self.__host = url.netloc + self.__path = url.path + + if 'variable' in config: + data_vars = config['variable'] + elif 'variables' in config: + data_vars = config['variables'] + else: + raise KeyError('Data variables not provided in config') + + if isinstance(data_vars, str): + self.__variables = [data_vars] + elif isinstance(data_vars, list): + self.__variables = data_vars + else: + raise TypeError(f'Improper 
type for variables config: {type(data_vars)}') + + self.__longitude = config['coords']['longitude'] + self.__latitude = config['coords']['latitude'] + self.__time = config['coords']['time'] + + self.__depth = config['coords'].get('depth') + + if self.__store_type in ['', 'file']: + store = self.__path + elif self.__store_type == 's3': + aws_cfg = self.__config['aws'] + + if aws_cfg['public']: + region = aws_cfg.get('region', 'us-west-2') + store = f'https://{self.__host}.s3.{region}.amazonaws.com{self.__path}' + else: + s3 = s3fs.S3FileSystem(False, key=aws_cfg['accessKeyID'], secret=aws_cfg['secretAccessKey']) + store = s3fs.S3Map(root=path, s3=s3, check=False) + else: + raise ValueError(self.__store_type) + + self.__ds: xr.Dataset = xr.open_zarr(store, consolidated=True) + def get_dataseries_list(self, simple=False): raise NotImplementedError() @@ -89,10 +141,31 @@ def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **k raise NotImplementedError() def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): - # Find tiles that fall in the given box in the Solr index - raise NotImplementedError() - - def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): + if type(start_time) is datetime: + start_time = (start_time - EPOCH).total_seconds() + if type(end_time) is datetime: + end_time = (end_time - EPOCH).total_seconds() + + params = { + 'min_lat': min_lat, + 'max_lat': max_lat, + 'min_lon': min_lon, + 'max_lon': max_lon + } + + if 0 <= start_time <= end_time: + params['min_time'] = start_time + params['max_time'] = end_time + + if 'depth' in kwargs: + params['depth'] = kwargs['depth'] + elif 'min_depth' in kwargs or 'max_depth' in kwargs: + params['min_depth'] = kwargs.get('min_depth') + params['max_depth'] = kwargs.get('max_depth') + + return [ZarrBackend.__to_url(self._name, **params)] + + def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=None, end_time=None, **kwargs): # Find tiles that fall within the polygon in the Solr index raise NotImplementedError() @@ -184,10 +257,110 @@ def get_tile_count(self, ds, bounding_polygon=None, start_time=0, end_time=-1, m raise NotImplementedError() def fetch_data_for_tiles(self, *tiles): - raise NotImplementedError() + for tile in tiles: + self.__fetch_data_for_tile(tile) + + return tiles + + def __fetch_data_for_tile(self, tile: Tile): + bbox: BBox = tile.bbox + + min_lat = None + min_lon = None + max_lat = None + max_lon = None + + min_time = float(tile.min_time) + max_time = float(tile.max_time) + + if min_time: + min_time = datetime.fromtimestamp(min_time) + + if max_time: + max_time = datetime.fromtimestamp(max_time) + + if bbox: + min_lat = bbox.min_lat + min_lon = bbox.min_lon + max_lat = bbox.max_lat + max_lon = bbox.max_lon + + sel = { + self.__latitude: slice(min_lat, max_lat), + self.__longitude: slice(min_lon, max_lon), + self.__time: slice(min_time, max_time) + } + + tile.variables = [ + TileVariable(v, v) for v in self.__variables + ] + + matched = self.__ds.sel(sel) + + tile.latitudes = ma.masked_invalid(matched[self.__latitude].to_numpy()) + tile.longitudes = ma.masked_invalid(matched[self.__longitude].to_numpy()) + + times = matched[self.__time].to_numpy() + + if np.issubdtype(times.dtype, np.datetime64): + times = ((times - np.datetime64(EPOCH)) / 1e9).astype(int) + + tile.times = ma.masked_invalid(times) + + tile.data = ma.masked_invalid( + [matched[var].to_numpy() for var in self.__variables] + ) 
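+        # tile.data stacks one array per configured variable along a new leading axis,
+        # so the tile is always flagged as multi-variable below and the NexusTileService
+        # masking helpers apply the shared space/time mask to each variable's slice.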
+ + tile.is_multi = True def _metadata_store_docs_to_tiles(self, *store_docs): - raise NotImplementedError() + return [ZarrBackend.__nts_url_to_tile(d) for d in store_docs] + + @staticmethod + def __nts_url_to_tile(nts_url): + tile = Tile() + + url = URL(nts_url) + + tile.tile_id = nts_url + + try: + min_lat = float(url.query['min_lat']) + min_lon = float(url.query['min_lon']) + max_lat = float(url.query['max_lat']) + max_lon = float(url.query['max_lon']) + + tile.bbox = BBox(min_lat, max_lat, min_lon, max_lon) + except KeyError: + pass + + tile.dataset = url.host + + try: + tile.min_time = int(url.query['min_time']) + except KeyError: + pass + + try: + tile.max_time = int(url.query['max_time']) + except KeyError: + pass + + return tile + + @staticmethod + def __to_url(dataset, **kwargs): + if 'dataset' in kwargs: + del kwargs['dataset'] + + if 'ds' in kwargs: + del kwargs['ds'] + return str(URL.build( + scheme='nts', + host=dataset, + path='/', + query=kwargs + )) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 405b5b70..78fe23d4 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -37,6 +37,8 @@ from .backends.zarr.backend import ZarrBackend from .model.nexusmodel import Tile, BBox, TileStats, TileVariable +from requests.structures import CaseInsensitiveDict + EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) logging.basicConfig( @@ -53,13 +55,27 @@ def fetch_data_for_func(*args, **kwargs): metadatastore_start = datetime.now() metadatastore_docs = func(*args, **kwargs) metadatastore_duration = (datetime.now() - metadatastore_start).total_seconds() - tiles = args[0]._metadata_store_docs_to_tiles(*metadatastore_docs) + + # Try to determine source dataset to route calls to proper backend + guessed_dataset = None + + if 'ds' in kwargs: + guessed_dataset = kwargs['ds'] + elif 'dataset' in kwargs: + guessed_dataset = kwargs['dataset'] + else: + for arg in args: + if arg is not None and arg in NexusTileService.backends: + guessed_dataset = arg + break + + tiles = NexusTileService._get_backend(guessed_dataset)._metadata_store_docs_to_tiles(*metadatastore_docs) cassandra_duration = 0 if ('fetch_data' in kwargs and kwargs['fetch_data']) or ('fetch_data' not in kwargs and default_fetch): if len(tiles) > 0: cassandra_start = datetime.now() - args[0].fetch_data_for_tiles(*tiles) + NexusTileService._get_backend(guessed_dataset).fetch_data_for_tiles(*tiles) cassandra_duration += (datetime.now() - cassandra_start).total_seconds() if 'metrics_callback' in kwargs and kwargs['metrics_callback'] is not None: @@ -128,7 +144,7 @@ def __init__(self, config=None): NexusTileService.__update_thread = threading.Thread( target=NexusTileService.__update_datasets_loop, name='dataset_update', - daemon=False + daemon=True ) logger.info('Starting dataset refresh thread') @@ -136,7 +152,10 @@ def __init__(self, config=None): NexusTileService.__update_thread.start() @staticmethod - def __get_backend(dataset_s) -> AbstractTileService: + def _get_backend(dataset_s) -> AbstractTileService: + if dataset_s is not None: + dataset_s = dataset_s.lower() + with DS_LOCK: if dataset_s not in NexusTileService.backends: logger.warning(f'Dataset {dataset_s} not currently loaded. 
Checking to see if it was recently' @@ -198,7 +217,7 @@ def _update_datasets(): next_cursor_mark = response_cursor_mark for dataset in response.docs: - d_id = dataset['dataset_s'] + d_id = dataset['dataset_s'].lower() store_type = dataset.get('store_type_s', 'nexusproto') present_datasets.add(d_id) @@ -217,7 +236,7 @@ def _update_datasets(): ds_config = json.loads(dataset['config'][0]) NexusTileService.backends[d_id] = { - 'backend': ZarrBackend(**ds_config), + 'backend': ZarrBackend(dataset_name=dataset['dataset_s'], **ds_config), 'up': True } else: @@ -251,33 +270,33 @@ def get_dataseries_list(self, simple=False): @tile_data() def find_tile_by_id(self, tile_id, **kwargs): - return NexusTileService.__get_backend('__nexusproto__').find_tile_by_id(tile_id) + return NexusTileService._get_backend('__nexusproto__').find_tile_by_id(tile_id) @tile_data() def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): - return NexusTileService.__get_backend('__nexusproto__').find_tiles_by_id(tile_ids, ds=ds, **kwargs) + return NexusTileService._get_backend('__nexusproto__').find_tiles_by_id(tile_ids, ds=ds, **kwargs) def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, metrics_callback=None, **kwargs): - return NexusTileService.__get_backend(dataset).find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, - dataset, start_time, end_time, - metrics_callback, **kwargs) + return NexusTileService._get_backend(dataset).find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, + dataset, start_time, end_time, + metrics_callback, **kwargs) @tile_data() def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): - return NexusTileService.__get_backend(ds).find_tile_by_polygon_and_most_recent_day_of_year( + return NexusTileService._get_backend(ds).find_tile_by_polygon_and_most_recent_day_of_year( bounding_polygon, ds, day_of_year, **kwargs ) @tile_data() def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - return NexusTileService.__get_backend(dataset).find_all_tiles_in_box_at_time( + return NexusTileService._get_backend(dataset).find_all_tiles_in_box_at_time( min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs ) @tile_data() def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): - return NexusTileService.__get_backend(dataset).find_all_tiles_in_polygon_at_time( + return NexusTileService._get_backend(dataset).find_all_tiles_in_polygon_at_time( bounding_polygon, dataset, time, **kwargs ) @@ -289,19 +308,19 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t if type(end_time) is datetime: end_time = (end_time - EPOCH).total_seconds() - return NexusTileService.__get_backend(ds).find_tiles_in_box( + return NexusTileService._get_backend(ds).find_tiles_in_box( min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs ) @tile_data() def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): - return NexusTileService.__get_backend(ds).find_tiles_in_polygon( + return NexusTileService._get_backend(ds).find_tiles_in_polygon( bounding_polygon, ds, start_time, end_time, **kwargs ) @tile_data() def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): - return NexusTileService.__get_backend(ds).find_tiles_by_metadata( + return NexusTileService._get_backend(ds).find_tiles_by_metadata( metadata, ds, start_time, end_time, **kwargs 
) @@ -334,13 +353,13 @@ def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs) :param kwargs: fetch_data: True/False = whether or not to retrieve tile data :return: """ - return NexusTileService.__get_backend(ds).find_tiles_by_exact_bounds( + return NexusTileService._get_backend(ds).find_tiles_by_exact_bounds( bounds, ds, start_time, end_time, **kwargs ) @tile_data() def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - return NexusTileService.__get_backend(dataset).find_all_boundary_tiles_at_time( + return NexusTileService._get_backend(dataset).find_all_boundary_tiles_at_time( min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs ) @@ -363,12 +382,12 @@ def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time= return tiles def get_min_max_time_by_granule(self, ds, granule_name): - return NexusTileService.__get_backend(ds).get_min_max_time_by_granule( + return NexusTileService._get_backend(ds).get_min_max_time_by_granule( ds, granule_name ) def get_dataset_overall_stats(self, ds): - return NexusTileService.__get_backend(ds).get_dataset_overall_stats(ds) + return NexusTileService._get_backend(ds).get_dataset_overall_stats(ds) def get_tiles_bounded_by_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): tiles = self.find_all_tiles_in_box_at_time(min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs) @@ -399,7 +418,7 @@ def get_bounding_box(self, tile_ids, ds=None): :param tile_ids: List of tile ids :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles """ - return NexusTileService.__get_backend(ds).get_bounding_box(tile_ids, ds) + return NexusTileService._get_backend(ds).get_bounding_box(tile_ids, ds) def get_min_time(self, tile_ids, ds=None): """ @@ -408,7 +427,7 @@ def get_min_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - return NexusTileService.__get_backend(ds).get_min_time(tile_ids, ds) + return NexusTileService._get_backend(ds).get_min_time(tile_ids, ds) def get_max_time(self, tile_ids, ds=None): """ @@ -417,7 +436,7 @@ def get_max_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. 
Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - return int(NexusTileService.__get_backend(ds).get_max_time(tile_ids)) + return int(NexusTileService._get_backend(ds).get_max_time(tile_ids)) def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): """ @@ -528,8 +547,10 @@ def mask_tiles_to_time_range(self, start_time, end_time, tiles): return tiles - def fetch_data_for_tiles(self, *tiles, dataset=None): - return NexusTileService.__get_backend(dataset).fetch_data_for_tiles(*tiles) + def fetch_data_for_tiles(self, *tiles): + dataset = tiles[0].dataset + + return NexusTileService._get_backend(dataset).fetch_data_for_tiles(*tiles) def _metadata_store_docs_to_tiles(self, *store_docs): tiles = [] diff --git a/data-access/requirements.txt b/data-access/requirements.txt index 7d33cced..ab96e2af 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -21,4 +21,5 @@ requests nexusproto Shapely s3fs -fsspec \ No newline at end of file +fsspec +xarray~=2022.3.0 \ No newline at end of file From f3981cd8735b146206fe87955cc26cd4d25ad034 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 13:38:12 -0700 Subject: [PATCH 09/70] Removed accidentally committed ini files --- .../backends/nexusproto/config/datastores.ini | 36 ------------------- data-access/nexustiles/config/datasets.ini | 18 ---------- 2 files changed, 54 deletions(-) delete mode 100644 data-access/nexustiles/backends/nexusproto/config/datastores.ini delete mode 100644 data-access/nexustiles/config/datasets.ini diff --git a/data-access/nexustiles/backends/nexusproto/config/datastores.ini b/data-access/nexustiles/backends/nexusproto/config/datastores.ini deleted file mode 100644 index f3facb95..00000000 --- a/data-access/nexustiles/backends/nexusproto/config/datastores.ini +++ /dev/null @@ -1,36 +0,0 @@ -[cassandra] -host=localhost -port=9042 -keyspace=nexustiles -local_datacenter=datacenter1 -protocol_version=3 -dc_policy=WhiteListRoundRobinPolicy -username=cassandra -password=cassandra - -[dynamo] -table=nexus-jpl-table -region=us-west-2 - -[solr] -host=http://localhost:8983 -core=nexustiles - -[s3] -bucket=cdms-dev-zarr -#key=MUR_aggregate/ -#key=MUR_1wk_7_100_100/ -#key=MUR_1wk_7_1500_2500/ -#key=MUR_2017_9dy_7_1500_2500/ -#key=MUR_2017_9dy_7_120_240/ -key=MUR_2017_2yr_30_120_240/ -#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_7_120_240.zarr/ -#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_1_240_240.zarr/ -#key=SMAP_JPL_L3_SSS_CAP_8DAY-RUNNINGMEAN_V5_90_120_240.zarr/ -public=false -region=us-west-2 -profile=saml-pub - -[datastore] -store=cassandra -#store=zarrS3 diff --git a/data-access/nexustiles/config/datasets.ini b/data-access/nexustiles/config/datasets.ini deleted file mode 100644 index 9f586cf2..00000000 --- a/data-access/nexustiles/config/datasets.ini +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -[solr] -host=http://localhost:8983 -core=nexusdatasets From 26f6220f2f6a8aaa3787ac6b80effc0b4e236837 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 14:14:31 -0700 Subject: [PATCH 10/70] Working zarr backend ds list + datasets are no longer case sensitive + handling for failed zarr ds opens (bad path, bad creds, &c) --- .../backends/nexusproto/dao/SolrProxy.py | 3 +- .../nexustiles/backends/zarr/backend.py | 75 +++++++++++++++---- data-access/nexustiles/exception.py | 3 +- data-access/nexustiles/nexustiles.py | 37 ++++++--- 4 files changed, 90 insertions(+), 28 deletions(-) diff --git a/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py b/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py index 9b16533d..c9435a2b 100644 --- a/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py +++ b/data-access/nexustiles/backends/nexusproto/dao/SolrProxy.py @@ -189,7 +189,8 @@ def get_data_series_list_simple(self): l.append({ "shortName": g, "title": g, - "tileCount": v + "tileCount": v, + "type": 'nexusproto' }) l = sorted(l, key=lambda entry: entry["title"]) return l diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index fe5a49dd..de1d86ba 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -37,12 +37,13 @@ from urllib.parse import urlparse EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) +ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt="%Y-%m-%dT%H:%M:%S", stream=sys.stdout) -logger = logging.getLogger() +logger = logging.getLogger(__name__) class ZarrBackend(AbstractTileService): @@ -83,21 +84,43 @@ def __init__(self, dataset_name, path, config=None): if self.__store_type in ['', 'file']: store = self.__path elif self.__store_type == 's3': - aws_cfg = self.__config['aws'] - - if aws_cfg['public']: - region = aws_cfg.get('region', 'us-west-2') - store = f'https://{self.__host}.s3.{region}.amazonaws.com{self.__path}' - else: - s3 = s3fs.S3FileSystem(False, key=aws_cfg['accessKeyID'], secret=aws_cfg['secretAccessKey']) - store = s3fs.S3Map(root=path, s3=s3, check=False) + try: + aws_cfg = self.__config['aws'] + + if aws_cfg['public']: + region = aws_cfg.get('region', 'us-west-2') + store = f'https://{self.__host}.s3.{region}.amazonaws.com{self.__path}' + else: + s3 = s3fs.S3FileSystem(False, key=aws_cfg['accessKeyID'], secret=aws_cfg['secretAccessKey']) + store = s3fs.S3Map(root=path, s3=s3, check=False) + except Exception as e: + logger.error(f'Failed to open zarr dataset at {self.__path}, ignoring it. Cause: {e}') + raise NexusTileServiceException(f'Cannot open S3 dataset ({e})') else: raise ValueError(self.__store_type) - self.__ds: xr.Dataset = xr.open_zarr(store, consolidated=True) + try: + self.__ds: xr.Dataset = xr.open_zarr(store, consolidated=True) + except Exception as e: + logger.error(f'Failed to open zarr dataset at {self.__path}, ignoring it. 
Cause: {e}') + raise NexusTileServiceException(f'Cannot open dataset ({e})') def get_dataseries_list(self, simple=False): - raise NotImplementedError() + ds = { + "shortName": self._name, + "title": self._name, + "type": "zarr" + } + + if not simple: + min_date = self.get_min_time([]) + max_date = self.get_max_time([]) + ds['start'] = min_date + ds['end'] = max_date + ds['iso_start'] = datetime.fromtimestamp(min_date).strftime(ISO_8601) + ds['iso_end'] = datetime.fromtimestamp(max_date).strftime(ISO_8601) + + return [ds] def find_tile_by_id(self, tile_id, **kwargs): raise NotImplementedError() @@ -215,6 +238,18 @@ def get_bounding_box(self, tile_ids): """ raise NotImplementedError() + def __get_ds_min_max_date(self): + min_date = self.__ds[self.__time].min().to_numpy() + max_date = self.__ds[self.__time].max().to_numpy() + + if np.issubdtype(min_date.dtype, np.datetime64): + min_date = ((min_date - np.datetime64(EPOCH)) / 1e9).astype(int).item() + + if np.issubdtype(max_date.dtype, np.datetime64): + max_date = ((max_date - np.datetime64(EPOCH)) / 1e9).astype(int).item() + + return min_date, max_date + def get_min_time(self, tile_ids, ds=None): """ Get the minimum tile date from the list of tile ids @@ -222,7 +257,11 @@ def get_min_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - raise NotImplementedError() + if len(tile_ids) == 0: + min_date, max_date = self.__get_ds_min_max_date() + return min_date + else: + raise NotImplementedError() def get_max_time(self, tile_ids, ds=None): """ @@ -231,7 +270,11 @@ def get_max_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - raise NotImplementedError() + if len(tile_ids) == 0: + min_date, max_date = self.__get_ds_min_max_date() + return max_date + else: + raise NotImplementedError() def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): """ @@ -334,7 +377,7 @@ def __nts_url_to_tile(nts_url): except KeyError: pass - tile.dataset = url.host + tile.dataset = url.path try: tile.min_time = int(url.query['min_time']) @@ -358,8 +401,8 @@ def __to_url(dataset, **kwargs): return str(URL.build( scheme='nts', - host=dataset, - path='/', + host='', + path=dataset, query=kwargs )) diff --git a/data-access/nexustiles/exception.py b/data-access/nexustiles/exception.py index 77850a2f..d6ed2c64 100644 --- a/data-access/nexustiles/exception.py +++ b/data-access/nexustiles/exception.py @@ -14,4 +14,5 @@ # limitations under the License. 
class NexusTileServiceException(Exception): - pass + def __init__(self, reason): + Exception.__init__(self, reason) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 78fe23d4..1b58f156 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -37,6 +37,8 @@ from .backends.zarr.backend import ZarrBackend from .model.nexusmodel import Tile, BBox, TileStats, TileVariable +from .exception import NexusTileServiceException + from requests.structures import CaseInsensitiveDict EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) @@ -93,6 +95,16 @@ def fetch_data_for_func(*args, **kwargs): return tile_data_decorator +def catch_not_implemented(func): + def wrapper(*args, **kwargs): + try: + return func(*args, **kwargs) + except NotImplementedError: + raise NexusTileServiceException('Action unsupported by backend') + + return wrapper + + SOLR_LOCK = threading.Lock() DS_LOCK = threading.Lock() thread_local = threading.local() @@ -154,7 +166,7 @@ def __init__(self, config=None): @staticmethod def _get_backend(dataset_s) -> AbstractTileService: if dataset_s is not None: - dataset_s = dataset_s.lower() + dataset_s = dataset_s with DS_LOCK: if dataset_s not in NexusTileService.backends: @@ -217,7 +229,7 @@ def _update_datasets(): next_cursor_mark = response_cursor_mark for dataset in response.docs: - d_id = dataset['dataset_s'].lower() + d_id = dataset['dataset_s'] store_type = dataset.get('store_type_s', 'nexusproto') present_datasets.add(d_id) @@ -235,10 +247,13 @@ def _update_datasets(): update_logger.info(f"Detected new zarr dataset {d_id}, opening new zarr backend") ds_config = json.loads(dataset['config'][0]) - NexusTileService.backends[d_id] = { - 'backend': ZarrBackend(dataset_name=dataset['dataset_s'], **ds_config), - 'up': True - } + try: + NexusTileService.backends[d_id] = { + 'backend': ZarrBackend(dataset_name=dataset['dataset_s'], **ds_config), + 'up': True + } + except NexusTileServiceException: + added_datasets -= 1 else: update_logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') added_datasets -= 1 @@ -263,10 +278,12 @@ def override_config(self, config): self._config.set(section, option, config.get(section, option)) def get_dataseries_list(self, simple=False): - if simple: - return self._metadatastore.get_data_series_list_simple() - else: - return self._metadatastore.get_data_series_list() + datasets = [] + for backend in set([b['backend'] for b in NexusTileService.backends.values() if b['up']]): + datasets.extend(backend.get_dataseries_list(simple)) + + return datasets + @tile_data() def find_tile_by_id(self, tile_id, **kwargs): From 91de6efef7b15480bde3993f9ae47aee8401e5ed Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 16:06:18 -0700 Subject: [PATCH 11/70] Capture and handle NTS requests routed to backend that doesn't (yet) support them --- data-access/nexustiles/nexustiles.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 1b58f156..b8165a1d 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -286,13 +286,16 @@ def get_dataseries_list(self, simple=False): @tile_data() + @catch_not_implemented def find_tile_by_id(self, tile_id, **kwargs): return NexusTileService._get_backend('__nexusproto__').find_tile_by_id(tile_id) @tile_data() + @catch_not_implemented def find_tiles_by_id(self, tile_ids, ds=None, 
**kwargs): - return NexusTileService._get_backend('__nexusproto__').find_tiles_by_id(tile_ids, ds=ds, **kwargs) + return NexusTileService._get_backend(ds).find_tiles_by_id(tile_ids, ds=ds, **kwargs) + @catch_not_implemented def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, metrics_callback=None, **kwargs): return NexusTileService._get_backend(dataset).find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, @@ -300,24 +303,28 @@ def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, st metrics_callback, **kwargs) @tile_data() + @catch_not_implemented def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): return NexusTileService._get_backend(ds).find_tile_by_polygon_and_most_recent_day_of_year( bounding_polygon, ds, day_of_year, **kwargs ) @tile_data() + @catch_not_implemented def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): return NexusTileService._get_backend(dataset).find_all_tiles_in_box_at_time( min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs ) @tile_data() + @catch_not_implemented def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): return NexusTileService._get_backend(dataset).find_all_tiles_in_polygon_at_time( bounding_polygon, dataset, time, **kwargs ) @tile_data() + @catch_not_implemented def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): # Find tiles that fall in the given box in the Solr index if type(start_time) is datetime: @@ -330,12 +337,14 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t ) @tile_data() + @catch_not_implemented def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=0, end_time=-1, **kwargs): return NexusTileService._get_backend(ds).find_tiles_in_polygon( bounding_polygon, ds, start_time, end_time, **kwargs ) @tile_data() + @catch_not_implemented def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): return NexusTileService._get_backend(ds).find_tiles_by_metadata( metadata, ds, start_time, end_time, **kwargs @@ -357,6 +366,7 @@ def get_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, ** return tiles @tile_data() + @catch_not_implemented def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs): """ The method will return tiles with the exact given bounds within the time range. 
It differs from @@ -375,6 +385,7 @@ def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs) ) @tile_data() + @catch_not_implemented def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): return NexusTileService._get_backend(dataset).find_all_boundary_tiles_at_time( min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs @@ -398,11 +409,13 @@ def get_tiles_bounded_by_polygon(self, polygon, ds=None, start_time=0, end_time= return tiles + @catch_not_implemented def get_min_max_time_by_granule(self, ds, granule_name): return NexusTileService._get_backend(ds).get_min_max_time_by_granule( ds, granule_name ) + @catch_not_implemented def get_dataset_overall_stats(self, ds): return NexusTileService._get_backend(ds).get_dataset_overall_stats(ds) @@ -424,6 +437,7 @@ def get_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset return tiles + @catch_not_implemented def get_stats_within_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): return NexusTileService.get_stats_within_box_at_time( min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs @@ -435,7 +449,7 @@ def get_bounding_box(self, tile_ids, ds=None): :param tile_ids: List of tile ids :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles """ - return NexusTileService._get_backend(ds).get_bounding_box(tile_ids, ds) + return NexusTileService._get_backend(ds).get_bounding_box(tile_ids) def get_min_time(self, tile_ids, ds=None): """ From df23919bc7466d9df12fda9893b581af61d10f80 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 16:06:59 -0700 Subject: [PATCH 12/70] analysis setup fails to find VERSION.txt when building locally --- analysis/setup.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/analysis/setup.py b/analysis/setup.py index 99cd707c..6472621d 100644 --- a/analysis/setup.py +++ b/analysis/setup.py @@ -17,8 +17,11 @@ import setuptools from subprocess import check_call, CalledProcessError -with open('../VERSION.txt', 'r') as f: - __version__ = f.read() +try: + with open('../VERSION.txt', 'r') as f: + __version__ = f.read() +except: + __version__ = None try: From 07404f063dc9f2b0ae9c2941caef445c7aae26c2 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 16:07:35 -0700 Subject: [PATCH 13/70] Implemented more NTS functions in zarr backend --- .../nexustiles/backends/zarr/backend.py | 89 ++++++++++++++++--- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index de1d86ba..1f46a95e 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -123,14 +123,38 @@ def get_dataseries_list(self, simple=False): return [ds] def find_tile_by_id(self, tile_id, **kwargs): - raise NotImplementedError() + return tile_id def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): - raise NotImplementedError() + return tile_ids def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, start_time, end_time, metrics_callback=None, **kwargs): - raise NotImplementedError() + start = datetime.now() + + if not isinstance(start_time, datetime): + start_time = datetime.fromtimestamp(start_time) + + if not isinstance(end_time, datetime): + end_time = datetime.fromtimestamp(end_time) + + sel = { + self.__latitude: slice(min_lat, max_lat), + 
self.__longitude: slice(min_lon, max_lon), + self.__time: slice(start_time, end_time) + } + + times = self.__ds.sel(sel)[self.__time].to_numpy() + + if np.issubdtype(times.dtype, np.datetime64): + times = ((times - np.datetime64(EPOCH)) / 1e9).astype(int) + + times = sorted(list(times)) + + if metrics_callback: + metrics_callback(backend=(datetime.now() - start).total_seconds()) + + return times def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, day_of_year, **kwargs): """ @@ -158,10 +182,10 @@ def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, raise NotImplementedError() def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - raise NotImplementedError() + return self.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, dataset, time, time, **kwargs) def find_all_tiles_in_polygon_at_time(self, bounding_polygon, dataset, time, **kwargs): - raise NotImplementedError() + return self.find_tiles_in_polygon(bounding_polygon, dataset, time, time, **kwargs) def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_time=0, end_time=-1, **kwargs): if type(start_time) is datetime: @@ -190,7 +214,14 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=None, end_time=None, **kwargs): # Find tiles that fall within the polygon in the Solr index - raise NotImplementedError() + bounds = bounding_polygon.bounds + + min_lon = bounds[0] + min_lat = bounds[1] + max_lon = bounds[2] + max_lat = bounds[3] + + return self.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs) def find_tiles_by_metadata(self, metadata, ds=None, start_time=0, end_time=-1, **kwargs): """ @@ -216,10 +247,17 @@ def find_tiles_by_exact_bounds(self, bounds, ds, start_time, end_time, **kwargs) :param kwargs: fetch_data: True/False = whether or not to retrieve tile data :return: """ - raise NotImplementedError() + min_lon = bounds[0] + min_lat = bounds[1] + max_lon = bounds[2] + max_lat = bounds[3] + + return self.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, **kwargs) def find_all_boundary_tiles_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): - raise NotImplementedError() + # Due to the precise nature of gridded Zarr's subsetting, it doesn't make sense to have a boundary region like + # this + return [] def get_min_max_time_by_granule(self, ds, granule_name): raise NotImplementedError() @@ -236,7 +274,20 @@ def get_bounding_box(self, tile_ids): :param tile_ids: List of tile ids :return: shapely.geometry.Polygon that represents the smallest bounding box that encompasses all of the tiles """ - raise NotImplementedError() + + bounds = [ + ( + float(URL(u).query['min_lon']), + float(URL(u).query['min_lat']), + float(URL(u).query['max_lon']), + float(URL(u).query['max_lat']) + ) + for u in tile_ids + ] + + poly = MultiPolygon([box(*b) for b in bounds]) + + return box(*poly.bounds) def __get_ds_min_max_date(self): min_date = self.__ds[self.__time].min().to_numpy() @@ -257,11 +308,13 @@ def get_min_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. 
Defaults to None (queries all datasets) :return: long time in seconds since epoch """ - if len(tile_ids) == 0: + times = list(filter(lambda x: x is not None, [int(URL(tid).query['min_time']) for tid in tile_ids])) + + if len(times) == 0: min_date, max_date = self.__get_ds_min_max_date() return min_date else: - raise NotImplementedError() + return min(times) def get_max_time(self, tile_ids, ds=None): """ @@ -270,11 +323,13 @@ def get_max_time(self, tile_ids, ds=None): :param ds: Filter by a specific dataset. Defaults to None (queries all datasets) :return: long time in seconds since epoch """ + times = list(filter(lambda x: x is not None, [int(URL(tid).query['max_time']) for tid in tile_ids])) + if len(tile_ids) == 0: min_date, max_date = self.__get_ds_min_max_date() return max_date else: - raise NotImplementedError() + max(times) def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): """ @@ -331,14 +386,20 @@ def __fetch_data_for_tile(self, tile: Tile): sel = { self.__latitude: slice(min_lat, max_lat), self.__longitude: slice(min_lon, max_lon), - self.__time: slice(min_time, max_time) } + if min_time == max_time: + sel[self.__time] = min_time + method = 'nearest' + else: + sel[self.__time] = slice(min_time, max_time) + method = None + tile.variables = [ TileVariable(v, v) for v in self.__variables ] - matched = self.__ds.sel(sel) + matched = self.__ds.sel(sel, method=method) tile.latitudes = ma.masked_invalid(matched[self.__latitude].to_numpy()) tile.longitudes = ma.masked_invalid(matched[self.__longitude].to_numpy()) From 72888aa9f83f846dd9535835101e21aef93a8410 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 12 Jul 2023 16:11:22 -0700 Subject: [PATCH 14/70] Added misc backend time metrics record field in NCSH --- analysis/webservice/algorithms_spark/NexusCalcSparkHandler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/analysis/webservice/algorithms_spark/NexusCalcSparkHandler.py b/analysis/webservice/algorithms_spark/NexusCalcSparkHandler.py index 4499773a..e0334676 100644 --- a/analysis/webservice/algorithms_spark/NexusCalcSparkHandler.py +++ b/analysis/webservice/algorithms_spark/NexusCalcSparkHandler.py @@ -362,6 +362,9 @@ def _create_metrics_record(self): SparkAccumulatorMetricsField(key='solr', description='Cumulative time to fetch data from Solr', accumulator=self._sc.accumulator(0)), + SparkAccumulatorMetricsField(key='backend', + description='Cumulative time to fetch data from external backend(s)', + accumulator=self._sc.accumulator(0)), SparkAccumulatorMetricsField(key='calculation', description='Cumulative time to do calculations', accumulator=self._sc.accumulator(0)), From 1c4a0e492485be2650c5756541cbfb9376b0a2bf Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 13 Jul 2023 13:55:38 -0700 Subject: [PATCH 15/70] fixes --- .../nexustiles/backends/nexusproto/backend.py | 3 +++ .../nexustiles/backends/zarr/backend.py | 21 +++++++------------ data-access/nexustiles/nexustiles.py | 16 +++++++------- 3 files changed, 18 insertions(+), 22 deletions(-) diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py index 8cca5813..690b109c 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -269,6 +269,9 @@ def get_bounding_box(self, tile_ids): """ tiles = self.find_tiles_by_id(tile_ids, fl=['tile_min_lat', 'tile_max_lat', 'tile_min_lon', 'tile_max_lon'], fetch_data=False, rows=len(tile_ids)) + + tiles 
= self._metadata_store_docs_to_tiles(*tiles) + polys = [] for tile in tiles: polys.append(box(tile.bbox.min_lon, tile.bbox.min_lat, tile.bbox.max_lon, tile.bbox.max_lat)) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 1f46a95e..f4f92c56 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -13,29 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import configparser import logging import sys -import json from datetime import datetime -from functools import reduce +from urllib.parse import urlparse import numpy as np import numpy.ma as ma -import pkg_resources -from pytz import timezone, UTC -from shapely.geometry import MultiPolygon, box - -from nexustiles.model.nexusmodel import Tile, BBox, TileStats, TileVariable -from nexustiles.exception import NexusTileServiceException +import s3fs +import xarray as xr from nexustiles.AbstractTileService import AbstractTileService - +from nexustiles.exception import NexusTileServiceException +from nexustiles.model.nexusmodel import Tile, BBox, TileVariable +from pytz import timezone +from shapely.geometry import MultiPolygon, box from yarl import URL -import xarray as xr -import s3fs -from urllib.parse import urlparse - EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index b8165a1d..fb8c0f33 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -67,7 +67,7 @@ def fetch_data_for_func(*args, **kwargs): guessed_dataset = kwargs['dataset'] else: for arg in args: - if arg is not None and arg in NexusTileService.backends: + if isinstance(arg, str) and arg in NexusTileService.backends: guessed_dataset = arg break @@ -178,13 +178,13 @@ def _get_backend(dataset_s) -> AbstractTileService: b = NexusTileService.backends[dataset_s] - if not b['up']: - success = b['backend'].try_connect() - - if not success: - raise NexusProcessingException(reason=f'Dataset {dataset_s} is currently unavailable') - else: - NexusTileService.backends[dataset_s]['up'] = True + # if not b['up']: + # success = b['backend'].try_connect() + # + # if not success: + # raise NexusProcessingException(reason=f'Dataset {dataset_s} is currently unavailable') + # else: + # NexusTileService.backends[dataset_s]['up'] = True return b['backend'] From 0a7cd7f3f55340107ff1d0e7f924f1dfd1cdfe26 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 17 Jul 2023 16:28:08 -0700 Subject: [PATCH 16/70] Dynamic dataset management --- analysis/webservice/config/web.ini | 2 +- analysis/webservice/management/Datasets.py | 78 ++++++++++++ analysis/webservice/management/__init__.py | 16 +++ .../request/handlers/NexusRequestHandler.py | 29 +++++ .../webservice/webmodel/NexusRequestObject.py | 6 + data-access/nexustiles/nexustiles.py | 113 +++++++++++------- 6 files changed, 197 insertions(+), 47 deletions(-) create mode 100644 analysis/webservice/management/Datasets.py create mode 100644 analysis/webservice/management/__init__.py diff --git a/analysis/webservice/config/web.ini b/analysis/webservice/config/web.ini index 85849758..a9e3dda8 100644 --- a/analysis/webservice/config/web.ini +++ b/analysis/webservice/config/web.ini @@ -29,4 +29,4 @@ static_enabled=true static_dir=static [modules] 
-module_dirs=webservice.algorithms,webservice.algorithms_spark,webservice.algorithms.doms \ No newline at end of file +module_dirs=webservice.algorithms,webservice.algorithms_spark,webservice.algorithms.doms,webservice.management \ No newline at end of file diff --git a/analysis/webservice/management/Datasets.py b/analysis/webservice/management/Datasets.py new file mode 100644 index 00000000..195ca38e --- /dev/null +++ b/analysis/webservice/management/Datasets.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from yaml import load +import json +from webservice.NexusHandler import nexus_handler +from nexustiles.nexustiles import NexusTileService +from webservice.webmodel import NexusRequestObject, NexusProcessingException +try: + from yaml import CLoader as Loader +except ImportError: + from yaml import Loader + + +class DatasetManagement: + @classmethod + def validate(cls): + pass + + @staticmethod + def parse_config(request: NexusRequestObject): + content_type = request.get_headers()['Content-Type'] + + if content_type in ['application/json', 'application/x-json']: + return json.loads(request.get_request_body()) + elif content_type == 'application/yaml': + return load(request.get_request_body(), Loader=Loader) + else: + raise NexusProcessingException(reason='Invalid Content-Type header', code=400) + + +@nexus_handler +class DatasetAdd(DatasetManagement): + name = 'Add dataset' + path = '/datasets/add' + description = "Add new dataset to running SDAP instance" + + def __init__(self, **args): + pass + + def calc(self, request: NexusRequestObject, **args): + # print('CALC') + try: + config = DatasetManagement.parse_config(request) + except Exception as e: + raise NexusProcessingException( + reason=repr(e), + code=400 + ) + + name = request.get_argument('name') + + if name is None: + raise NexusProcessingException( + reason='Name argument must be provided', + code=400 + ) + + try: + NexusTileService.user_ds_add(name, config) + except Exception as e: + raise NexusProcessingException( + reason=repr(e), + code=500 + ) + diff --git a/analysis/webservice/management/__init__.py b/analysis/webservice/management/__init__.py new file mode 100644 index 00000000..7c9f5ef4 --- /dev/null +++ b/analysis/webservice/management/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from webservice.management.Datasets import DatasetAdd \ No newline at end of file diff --git a/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py b/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py index 26455746..1c7e936c 100644 --- a/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py +++ b/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py @@ -63,6 +63,35 @@ def get(self): except Exception as e: self.async_onerror_callback(str(e), 500) + @tornado.gen.coroutine + def post(self): + self.logger.info("Received POST %s" % self._request_summary()) + + request = NexusRequestObject(self) + + # create NexusCalcHandler which will process the request + instance = self.__clazz(**self._clazz_init_args) + + try: + # process the request asynchronously on a different thread, + # the current tornado handler is still available to get other user requests + results = yield tornado.ioloop.IOLoop.current().run_in_executor(self.executor, instance.calc, request) + + if results: + try: + self.set_status(results.status_code) + except AttributeError: + pass + + renderer = NexusRendererFactory.get_renderer("JSON") + renderer.render(self, results) + + except NexusProcessingException as e: + self.async_onerror_callback(e.reason, e.code) + + except Exception as e: + self.async_onerror_callback(str(e), 500) + def async_onerror_callback(self, reason, code=500): self.logger.error("Error processing request", exc_info=True) diff --git a/analysis/webservice/webmodel/NexusRequestObject.py b/analysis/webservice/webmodel/NexusRequestObject.py index bbd28280..18962364 100644 --- a/analysis/webservice/webmodel/NexusRequestObject.py +++ b/analysis/webservice/webmodel/NexusRequestObject.py @@ -35,6 +35,12 @@ def __init__(self, reqHandler): self.requestHandler = reqHandler StatsComputeOptions.__init__(self) + def get_headers(self): + return self.requestHandler.request.headers + + def get_request_body(self): + return self.requestHandler.request.body + def get_argument(self, name, default=None): return self.requestHandler.get_argument(name, default=default) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index fb8c0f33..eaecf941 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -188,14 +188,13 @@ def _get_backend(dataset_s) -> AbstractTileService: return b['backend'] + @staticmethod - def _update_datasets(): + def _get_datasets_store(): solr_url = NexusTileService.ds_config.get("solr", "host") solr_core = NexusTileService.ds_config.get("solr", "core") solr_kwargs = {} - update_logger = logging.getLogger("nexus-tile-svc.backends") - if NexusTileService.ds_config.has_option("solr", "time_out"): solr_kwargs["timeout"] = NexusTileService.ds_config.get("solr", "time_out") @@ -208,55 +207,62 @@ def _update_datasets(): solrcon = solrcon - update_logger.info('Executing Solr query to check for new datasets') + return solrcon + + @staticmethod + def _update_datasets(): + update_logger = logging.getLogger("nexus-tile-svc.backends") + solrcon = 
NexusTileService._get_datasets_store() + + update_logger.info('Executing Solr query to check for new datasets') - present_datasets = {None, '__nexusproto__'} - next_cursor_mark = '*' + present_datasets = {None, '__nexusproto__'} + next_cursor_mark = '*' - added_datasets = 0 + added_datasets = 0 - while True: - response = solrcon.search('*:*', cursorMark=next_cursor_mark, sort='id asc') + while True: + response = solrcon.search('*:*', cursorMark=next_cursor_mark, sort='id asc') - try: - response_cursor_mark = response.nextCursorMark - except AttributeError: - break + try: + response_cursor_mark = response.nextCursorMark + except AttributeError: + break - if response_cursor_mark == next_cursor_mark: - break - else: - next_cursor_mark = response_cursor_mark - - for dataset in response.docs: - d_id = dataset['dataset_s'] - store_type = dataset.get('store_type_s', 'nexusproto') - - present_datasets.add(d_id) - - if d_id in NexusTileService.backends: - continue - # is_up = NexusTileService.backends[d_id]['backend'].try_connect() - - added_datasets += 1 - - if store_type == 'nexus_proto' or store_type == 'nexusproto': - update_logger.info(f"Detected new nexusproto dataset {d_id}, using default nexusproto backend") - NexusTileService.backends[d_id] = NexusTileService.backends[None] - elif store_type == 'zarr': - update_logger.info(f"Detected new zarr dataset {d_id}, opening new zarr backend") - - ds_config = json.loads(dataset['config'][0]) - try: - NexusTileService.backends[d_id] = { - 'backend': ZarrBackend(dataset_name=dataset['dataset_s'], **ds_config), - 'up': True - } - except NexusTileServiceException: - added_datasets -= 1 - else: - update_logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') + if response_cursor_mark == next_cursor_mark: + break + else: + next_cursor_mark = response_cursor_mark + + for dataset in response.docs: + d_id = dataset['dataset_s'] + store_type = dataset.get('store_type_s', 'nexusproto') + + present_datasets.add(d_id) + + if d_id in NexusTileService.backends: + continue + # is_up = NexusTileService.backends[d_id]['backend'].try_connect() + + added_datasets += 1 + + if store_type == 'nexus_proto' or store_type == 'nexusproto': + update_logger.info(f"Detected new nexusproto dataset {d_id}, using default nexusproto backend") + NexusTileService.backends[d_id] = NexusTileService.backends[None] + elif store_type == 'zarr': + update_logger.info(f"Detected new zarr dataset {d_id}, opening new zarr backend") + + ds_config = json.loads(dataset['config'][0]) + try: + NexusTileService.backends[d_id] = { + 'backend': ZarrBackend(dataset_name=dataset['dataset_s'], **ds_config), + 'up': True + } + except NexusTileServiceException: added_datasets -= 1 + else: + update_logger.warning(f'Unsupported backend {store_type} for dataset {d_id}') + added_datasets -= 1 removed_datasets = set(NexusTileService.backends.keys()).difference(present_datasets) @@ -270,6 +276,21 @@ def _update_datasets(): update_logger.info(f'Finished dataset update: {added_datasets} added, {len(removed_datasets)} removed, ' f'{len(NexusTileService.backends) - 2} total') + # Update cfg (ie, creds) of dataset + @staticmethod + def user_ds_update(): + pass + + # Add dataset + backend + @staticmethod + def user_ds_add(name, config): + pass + + # Delete dataset backend (error if it's a hardcoded one) + @staticmethod + def user_ds_delete(): + pass + def override_config(self, config): for section in config.sections(): if self._config.has_section(section): # only override preexisting section, ignores 
the other From c8e7dbb5e178bf9d88928659dea1792132760179 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 18 Jul 2023 16:12:29 -0700 Subject: [PATCH 17/70] Dynamic dataset management --- analysis/conda-requirements.txt | 2 +- analysis/webservice/management/Datasets.py | 58 ++++++++++++++++++++-- data-access/nexustiles/nexustiles.py | 2 +- data-access/requirements.txt | 3 +- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/analysis/conda-requirements.txt b/analysis/conda-requirements.txt index e27bdeae..902d5114 100644 --- a/analysis/conda-requirements.txt +++ b/analysis/conda-requirements.txt @@ -33,4 +33,4 @@ gdal==3.2.1 mock==4.0.3 importlib_metadata==4.11.4 #singledispatch==3.4.0.3 - +schema diff --git a/analysis/webservice/management/Datasets.py b/analysis/webservice/management/Datasets.py index 195ca38e..0f8df06d 100644 --- a/analysis/webservice/management/Datasets.py +++ b/analysis/webservice/management/Datasets.py @@ -18,12 +18,34 @@ from webservice.NexusHandler import nexus_handler from nexustiles.nexustiles import NexusTileService from webservice.webmodel import NexusRequestObject, NexusProcessingException + +from schema import Schema, Or, SchemaError +from schema import Optional as Opt + +from urllib.parse import urlparse try: from yaml import CLoader as Loader except ImportError: from yaml import Loader +CONFIG_SCHEMA = Schema({ + Or('variable', 'variables'): Or(str, [str]), + 'coords': { + 'latitude': str, + 'longitude': str, + 'time': str, + Opt('depth'): str + }, + Opt('aws'): { + 'accessKeyID': Or(str, None), + 'secretAccessKey': Or(str, None), + 'public': bool, + Opt('region'): str + } +}) + + class DatasetManagement: @classmethod def validate(cls): @@ -34,12 +56,22 @@ def parse_config(request: NexusRequestObject): content_type = request.get_headers()['Content-Type'] if content_type in ['application/json', 'application/x-json']: - return json.loads(request.get_request_body()) + config_dict = json.loads(request.get_request_body()) elif content_type == 'application/yaml': - return load(request.get_request_body(), Loader=Loader) + config_dict = load(request.get_request_body(), Loader=Loader) else: raise NexusProcessingException(reason='Invalid Content-Type header', code=400) + try: + CONFIG_SCHEMA.validate(config_dict) + except SchemaError as e: + raise NexusProcessingException( + reason=str(e), + code=400 + ) + + return config_dict + @nexus_handler class DatasetAdd(DatasetManagement): @@ -51,7 +83,6 @@ def __init__(self, **args): pass def calc(self, request: NexusRequestObject, **args): - # print('CALC') try: config = DatasetManagement.parse_config(request) except Exception as e: @@ -68,8 +99,27 @@ def calc(self, request: NexusRequestObject, **args): code=400 ) + path = request.get_argument('path') + + if path is None: + raise NexusProcessingException( + reason='Path argument must be provided', + code=400 + ) + + try: + if urlparse(path).scheme not in ['file','','s3']: + raise NexusProcessingException( + reason='Dataset URL must be for a local file or S3 URL', + code=400 + ) + except ValueError: + raise NexusProcessingException( + reason='Could not parse path URL', code=400 + ) + try: - NexusTileService.user_ds_add(name, config) + NexusTileService.user_ds_add(name, path, config) except Exception as e: raise NexusProcessingException( reason=repr(e), diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index eaecf941..68a2a584 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -283,7 
+283,7 @@ def user_ds_update(): # Add dataset + backend @staticmethod - def user_ds_add(name, config): + def user_ds_add(name, path, config): pass # Delete dataset backend (error if it's a hardcoded one) diff --git a/data-access/requirements.txt b/data-access/requirements.txt index ab96e2af..c732bede 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -22,4 +22,5 @@ nexusproto Shapely s3fs fsspec -xarray~=2022.3.0 \ No newline at end of file +xarray~=2022.3.0 +numpy==1.24.3 \ No newline at end of file From e78f7ade3422c97008ff6532593d39c2f863e475 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 20 Jul 2023 12:29:27 -0700 Subject: [PATCH 18/70] Dataset management --- analysis/webservice/management/Datasets.py | 70 +++++++++++++++++ .../request/handlers/NexusRequestHandler.py | 4 +- data-access/nexustiles/nexustiles.py | 76 +++++++++++++++++-- 3 files changed, 142 insertions(+), 8 deletions(-) diff --git a/analysis/webservice/management/Datasets.py b/analysis/webservice/management/Datasets.py index 0f8df06d..48071f7c 100644 --- a/analysis/webservice/management/Datasets.py +++ b/analysis/webservice/management/Datasets.py @@ -73,6 +73,14 @@ def parse_config(request: NexusRequestObject): return config_dict +class Response: + def __init__(self, response): + self.response = response if response is not None else {} + + def toJson(self): + return json.dumps(self.response) + + @nexus_handler class DatasetAdd(DatasetManagement): name = 'Add dataset' @@ -126,3 +134,65 @@ def calc(self, request: NexusRequestObject, **args): code=500 ) + +@nexus_handler +class DatasetUpdate(DatasetManagement): + name = 'Update dynamically added dataset' + path = '/datasets/update' + description = "Update dataset in running SDAP instance" + + def __init__(self, **args): + pass + + def calc(self, request: NexusRequestObject, **args): + try: + config = DatasetManagement.parse_config(request) + except Exception as e: + raise NexusProcessingException( + reason=repr(e), + code=400 + ) + + name = request.get_argument('name') + + if name is None: + raise NexusProcessingException( + reason='Name argument must be provided', + code=400 + ) + + try: + return Response(NexusTileService.user_ds_update(name, config)) + except Exception as e: + raise NexusProcessingException( + reason=repr(e), + code=500 + ) + + +@nexus_handler +class DatasetDelete(DatasetManagement): + name = 'Remove dataset' + path = '/datasets/remove' + description = "Remove dataset from running SDAP instance" + + def __init__(self, **args): + pass + + def calc(self, request: NexusRequestObject, **args): + name = request.get_argument('name') + + if name is None: + raise NexusProcessingException( + reason='Name argument must be provided', + code=400 + ) + + try: + return Response(NexusTileService.user_ds_delete(name)) + except Exception as e: + raise NexusProcessingException( + reason=repr(e), + code=500 + ) + diff --git a/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py b/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py index 1c7e936c..6392f105 100644 --- a/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py +++ b/analysis/webservice/nexus_tornado/request/handlers/NexusRequestHandler.py @@ -65,7 +65,7 @@ def get(self): @tornado.gen.coroutine def post(self): - self.logger.info("Received POST %s" % self._request_summary()) + self.logger.info("Received %s" % self._request_summary()) request = NexusRequestObject(self) @@ -83,7 +83,7 @@ def post(self): except 
AttributeError: pass - renderer = NexusRendererFactory.get_renderer("JSON") + renderer = NexusRendererFactory.get_renderer(request) renderer.render(self, results) except NexusProcessingException as e: diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 68a2a584..a5abd241 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -278,18 +278,82 @@ def _update_datasets(): # Update cfg (ie, creds) of dataset @staticmethod - def user_ds_update(): - pass + def user_ds_update(name, config): + solr = NexusTileService._get_datasets_store() + + docs = solr.search(f'dataset_s:{name}').docs + + if len(docs) != 1: + raise ValueError(f'Given name must match exactly one existing dataset; matched {len(docs)}') + + ds = docs[0] + + if 'source_s' not in ds or ds['source_s'] == 'collection_config': + raise ValueError('Provided dataset is source_s in collection config and cannot be deleted') + + config_dict = json.loads(ds['config'][0]) + + config_dict['config'] = config + + solr.delete(id=ds['id']) + solr.add([{ + 'id': name, + 'dataset_s': name, + 'latest_update_l': int(datetime.now().timestamp()), + 'store_type_s': ds['store_type_s'], + 'config': json.dumps(config_dict), + 'source_s': 'user_added' + }]) + solr.commit() + + return {'success': True} # Add dataset + backend @staticmethod - def user_ds_add(name, path, config): - pass + def user_ds_add(name, path, config, type='zarr'): + solr = NexusTileService._get_datasets_store() + + docs = solr.search(f'dataset_s:{name}').docs + + if len(docs) > 0: + raise ValueError(f'Dataset {name} already exists') + + config_dict = { + 'path': path, + 'config': config + } + + solr.add([{ + 'id': name, + 'dataset_s': name, + 'latest_update_l': int(datetime.now().timestamp()), + 'store_type_s': type, + 'config': json.dumps(config_dict), + 'source_s': 'user_added' + }]) + solr.commit() + + return {'success': True} # Delete dataset backend (error if it's a hardcoded one) @staticmethod - def user_ds_delete(): - pass + def user_ds_delete(name): + solr = NexusTileService._get_datasets_store() + + docs = solr.search(f'dataset_s:{name}').docs + + if len(docs) != 1: + raise ValueError(f'Given name must match exactly one existing dataset; matched {len(docs)}') + + ds = docs[0] + + if 'source_s' not in ds or ds['source_s'] == 'collection_config': + raise ValueError('Provided dataset is source_s in collection config and cannot be deleted') + + solr.delete(id=ds['id']) + solr.commit() + + return {'success': True} def override_config(self, config): for section in config.sections(): From a84d77e569fcf224597c73c17d1fa109f36a2a5b Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 27 Jul 2023 10:05:20 -0700 Subject: [PATCH 19/70] Timeseriesspark support --- .../algorithms_spark/TimeSeriesSpark.py | 5 +- .../nexustiles/backends/zarr/backend.py | 47 +++++++++++++------ 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/analysis/webservice/algorithms_spark/TimeSeriesSpark.py b/analysis/webservice/algorithms_spark/TimeSeriesSpark.py index faeaa0b1..6a353cf4 100644 --- a/analysis/webservice/algorithms_spark/TimeSeriesSpark.py +++ b/analysis/webservice/algorithms_spark/TimeSeriesSpark.py @@ -488,8 +488,9 @@ def calc_average_on_day(tile_service_factory, metrics_callback, normalize_dates, timestamps[0], timestamps[-1], rows=5000, - metrics_callback=metrics_callback) - + metrics_callback=metrics_callback, + distinct=True) + calculation_start = datetime.now() tile_dict = {} diff --git 
a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index f4f92c56..9aab3cff 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -110,8 +110,8 @@ def get_dataseries_list(self, simple=False): max_date = self.get_max_time([]) ds['start'] = min_date ds['end'] = max_date - ds['iso_start'] = datetime.fromtimestamp(min_date).strftime(ISO_8601) - ds['iso_end'] = datetime.fromtimestamp(max_date).strftime(ISO_8601) + ds['iso_start'] = datetime.utcfromtimestamp(min_date).strftime(ISO_8601) + ds['iso_end'] = datetime.utcfromtimestamp(max_date).strftime(ISO_8601) return [ds] @@ -126,10 +126,10 @@ def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, st start = datetime.now() if not isinstance(start_time, datetime): - start_time = datetime.fromtimestamp(start_time) + start_time = datetime.utcfromtimestamp(start_time) if not isinstance(end_time, datetime): - end_time = datetime.fromtimestamp(end_time) + end_time = datetime.utcfromtimestamp(end_time) sel = { self.__latitude: slice(min_lat, max_lat), @@ -142,7 +142,7 @@ def find_days_in_range_asc(self, min_lat, max_lat, min_lon, max_lon, dataset, st if np.issubdtype(times.dtype, np.datetime64): times = ((times - np.datetime64(EPOCH)) / 1e9).astype(int) - times = sorted(list(times)) + times = sorted(times.tolist()) if metrics_callback: metrics_callback(backend=(datetime.now() - start).total_seconds()) @@ -193,9 +193,14 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t 'max_lon': max_lon } + times = None + if 0 <= start_time <= end_time: - params['min_time'] = start_time - params['max_time'] = end_time + if kwargs.get('distinct', False): + times_asc = self.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time) + times = [(t, t) for t in times_asc] + else: + times = [(start_time, end_time)] if 'depth' in kwargs: params['depth'] = kwargs['depth'] @@ -203,7 +208,10 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t params['min_depth'] = kwargs.get('min_depth') params['max_depth'] = kwargs.get('max_depth') - return [ZarrBackend.__to_url(self._name, **params)] + if times: + return [ZarrBackend.__to_url(self._name, min_time=t[0], max_time=t[1], **params) for t in times] + else: + return [ZarrBackend.__to_url(self._name, **params)] def find_tiles_in_polygon(self, bounding_polygon, ds=None, start_time=None, end_time=None, **kwargs): # Find tiles that fall within the polygon in the Solr index @@ -365,10 +373,10 @@ def __fetch_data_for_tile(self, tile: Tile): max_time = float(tile.max_time) if min_time: - min_time = datetime.fromtimestamp(min_time) + min_time = datetime.utcfromtimestamp(min_time) if max_time: - max_time = datetime.fromtimestamp(max_time) + max_time = datetime.utcfromtimestamp(max_time) if bbox: min_lat = bbox.min_lat @@ -376,23 +384,25 @@ def __fetch_data_for_tile(self, tile: Tile): max_lat = bbox.max_lat max_lon = bbox.max_lon - sel = { + sel_g = { self.__latitude: slice(min_lat, max_lat), self.__longitude: slice(min_lon, max_lon), } + sel_t = {} + if min_time == max_time: - sel[self.__time] = min_time + sel_t[self.__time] = [min_time] # List, otherwise self.__time dim will be dropped method = 'nearest' else: - sel[self.__time] = slice(min_time, max_time) + sel_t[self.__time] = slice(min_time, max_time) method = None tile.variables = [ TileVariable(v, v) for v in self.__variables ] - matched = self.__ds.sel(sel, 
method=method) + matched = self.__ds.sel(sel_g).sel(sel_t, method=method) tile.latitudes = ma.masked_invalid(matched[self.__latitude].to_numpy()) tile.longitudes = ma.masked_invalid(matched[self.__longitude].to_numpy()) @@ -453,6 +463,15 @@ def __to_url(dataset, **kwargs): if 'ds' in kwargs: del kwargs['ds'] + # If any params are numpy dtypes, extract them to base python types + for kw in kwargs: + v = kwargs[kw] + + if isinstance(v, np.generic): + v = v.item() + + kwargs[kw] = v + return str(URL.build( scheme='nts', host='', From 53190e2834e47b547bffefb3f5ffc407efa5165c Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 31 Jul 2023 07:54:13 -0700 Subject: [PATCH 20/70] Update backend dict on dataset mgmt query --- data-access/nexustiles/nexustiles.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index a5abd241..772f6f4f 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -306,6 +306,11 @@ def user_ds_update(name, config): }]) solr.commit() + logger.info(f'Updated dataset {name} in Solr. Updating backends') + + with DS_LOCK: + NexusTileService._update_datasets() + return {'success': True} # Add dataset + backend @@ -333,6 +338,11 @@ def user_ds_add(name, path, config, type='zarr'): }]) solr.commit() + logger.info(f'Added dataset {name} to Solr. Updating backends') + + with DS_LOCK: + NexusTileService._update_datasets() + return {'success': True} # Delete dataset backend (error if it's a hardcoded one) @@ -353,6 +363,11 @@ def user_ds_delete(name): solr.delete(id=ds['id']) solr.commit() + logger.info(f'Removed dataset {name} from Solr. Updating backends') + + with DS_LOCK: + NexusTileService._update_datasets() + return {'success': True} def override_config(self, config): From 2e7a0dcc280d7f447e9c09e723ddb3a5215d4460 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 31 Jul 2023 12:36:13 -0700 Subject: [PATCH 21/70] Fixes and improvements --- analysis/webservice/management/Datasets.py | 12 ++++++++-- .../nexustiles/backends/zarr/backend.py | 22 ++++++++++++++----- data-access/nexustiles/nexustiles.py | 10 ++++++++- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/analysis/webservice/management/Datasets.py b/analysis/webservice/management/Datasets.py index 48071f7c..ded1e8a2 100644 --- a/analysis/webservice/management/Datasets.py +++ b/analysis/webservice/management/Datasets.py @@ -38,8 +38,8 @@ Opt('depth'): str }, Opt('aws'): { - 'accessKeyID': Or(str, None), - 'secretAccessKey': Or(str, None), + Opt('accessKeyID'): str, + Opt('secretAccessKey'): str, 'public': bool, Opt('region'): str } @@ -64,6 +64,14 @@ def parse_config(request: NexusRequestObject): try: CONFIG_SCHEMA.validate(config_dict) + + if 'aws' in config_dict: + if not config_dict['aws']['public']: + if 'accessKeyID' not in config_dict['aws'] or 'secretAccessKey' not in config_dict['aws']: + raise NexusProcessingException( + reason='Must provide AWS creds for non-public bucket', + code=400 + ) except SchemaError as e: raise NexusProcessingException( reason=str(e), diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 9aab3cff..214a991b 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -81,8 +81,10 @@ def __init__(self, dataset_name, path, config=None): aws_cfg = self.__config['aws'] if aws_cfg['public']: - region = aws_cfg.get('region', 'us-west-2') - 
store = f'https://{self.__host}.s3.{region}.amazonaws.com{self.__path}' + # region = aws_cfg.get('region', 'us-west-2') + # store = f'https://{self.__host}.s3.{region}.amazonaws.com{self.__path}' + s3 = s3fs.S3FileSystem(True) + store = s3fs.S3Map(root=path, s3=s3, check=False) else: s3 = s3fs.S3FileSystem(False, key=aws_cfg['accessKeyID'], secret=aws_cfg['secretAccessKey']) store = s3fs.S3Map(root=path, s3=s3, check=False) @@ -116,7 +118,7 @@ def get_dataseries_list(self, simple=False): return [ds] def find_tile_by_id(self, tile_id, **kwargs): - return tile_id + return [tile_id] def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): return tile_ids @@ -330,7 +332,7 @@ def get_max_time(self, tile_ids, ds=None): min_date, max_date = self.__get_ds_min_max_date() return max_date else: - max(times) + return max(times) def get_distinct_bounding_boxes_in_polygon(self, bounding_polygon, ds, start_time, end_time): """ @@ -442,6 +444,7 @@ def __nts_url_to_tile(nts_url): pass tile.dataset = url.path + tile.dataset_id = url.path try: tile.min_time = int(url.query['min_time']) @@ -453,6 +456,8 @@ def __nts_url_to_tile(nts_url): except KeyError: pass + tile.meta_data = {} + return tile @staticmethod @@ -463,20 +468,25 @@ def __to_url(dataset, **kwargs): if 'ds' in kwargs: del kwargs['ds'] + params = {} + # If any params are numpy dtypes, extract them to base python types for kw in kwargs: v = kwargs[kw] + if v is None: + continue + if isinstance(v, np.generic): v = v.item() - kwargs[kw] = v + params[kw] = v return str(URL.build( scheme='nts', host='', path=dataset, - query=kwargs + query=params )) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index 772f6f4f..ed526c55 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -31,6 +31,7 @@ from shapely.geometry import box from webservice.webmodel import DatasetNotFoundException, NexusProcessingException from webservice.NexusHandler import nexus_initializer +from yarl import URL from .AbstractTileService import AbstractTileService from .backends.nexusproto.backend import NexusprotoTileService @@ -388,11 +389,18 @@ def get_dataseries_list(self, simple=False): @tile_data() @catch_not_implemented def find_tile_by_id(self, tile_id, **kwargs): - return NexusTileService._get_backend('__nexusproto__').find_tile_by_id(tile_id) + tile = URL(tile_id) + + if tile.scheme == 'nts': + return NexusTileService._get_backend(tile.path).find_tile_by_id(tile_id) + else: + return NexusTileService._get_backend('__nexusproto__').find_tile_by_id(tile_id) @tile_data() @catch_not_implemented def find_tiles_by_id(self, tile_ids, ds=None, **kwargs): + if ds is None: + return [self.find_tile_by_id(tid, **kwargs, fetch_data=False) for tid in tile_ids] return NexusTileService._get_backend(ds).find_tiles_by_id(tile_ids, ds=ds, **kwargs) @catch_not_implemented From 08693754a542d655069c19c0503adbd41a401a7c Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 31 Jul 2023 12:38:01 -0700 Subject: [PATCH 22/70] Adapted matchup to work with zarr backends --- analysis/webservice/algorithms_spark/Matchup.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index f27612a5..7f84063e 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -777,9 +777,9 @@ def match_satellite_to_insitu(tile_ids, primary_b, secondary_b, 
parameter_b, tt_ tile_service = tile_service_factory() # Determine the spatial temporal extents of this partition of tiles - tiles_bbox = tile_service.get_bounding_box(tile_ids) - tiles_min_time = tile_service.get_min_time(tile_ids) - tiles_max_time = tile_service.get_max_time(tile_ids) + tiles_bbox = tile_service.get_bounding_box(tile_ids, ds=primary_b.value) + tiles_min_time = tile_service.get_min_time(tile_ids, ds=primary_b.value) + tiles_max_time = tile_service.get_max_time(tile_ids, ds=primary_b.value) # Increase spatial extents by the radius tolerance matchup_min_lon, matchup_min_lat = add_meters_to_lon_lat(tiles_bbox.bounds[0], tiles_bbox.bounds[1], @@ -858,7 +858,7 @@ def match_satellite_to_insitu(tile_ids, primary_b, secondary_b, parameter_b, tt_ edge_results = [] for tile in matchup_tiles: # Retrieve tile data and convert to lat/lon projection - tiles = tile_service.find_tile_by_id(tile.tile_id, fetch_data=True) + tiles = tile_service.find_tile_by_id(tile.tile_id, fetch_data=True, ds=secondary_b.value) tile = tiles[0] valid_indices = tile.get_indices() @@ -884,14 +884,14 @@ def match_satellite_to_insitu(tile_ids, primary_b, secondary_b, parameter_b, tt_ # The actual matching happens in the generator. This is so that we only load 1 tile into memory at a time match_generators = [match_tile_to_point_generator(tile_service, tile_id, m_tree, edge_results, bounding_wkt_b.value, - parameter_b.value, rt_b.value, aeqd_proj) for tile_id - in tile_ids] + parameter_b.value, rt_b.value, aeqd_proj, primary_b.value) + for tile_id in tile_ids] return chain(*match_generators) def match_tile_to_point_generator(tile_service, tile_id, m_tree, edge_results, search_domain_bounding_wkt, - search_parameter, radius_tolerance, aeqd_proj): + search_parameter, radius_tolerance, aeqd_proj, primary_ds): from nexustiles.model.nexusmodel import NexusPoint from webservice.algorithms_spark.Matchup import DomsPoint # Must import DomsPoint or Spark complains @@ -899,7 +899,7 @@ def match_tile_to_point_generator(tile_service, tile_id, m_tree, edge_results, s try: the_time = datetime.now() tile = tile_service.mask_tiles_to_polygon(wkt.loads(search_domain_bounding_wkt), - tile_service.find_tile_by_id(tile_id))[0] + tile_service.find_tile_by_id(tile_id, ds=primary_ds))[0] print("%s Time to load tile %s" % (str(datetime.now() - the_time), tile_id)) except IndexError: # This should only happen if all measurements in a tile become masked after applying the bounding polygon From 1eb680bb794914f8a9e3d9b8fae3ebd3a0b970cb Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 1 Aug 2023 14:45:25 -0700 Subject: [PATCH 23/70] Zarr support - Distinct slices of time is now default - No longer assuming+shaping as multivar tiles unless needed --- .../webservice/algorithms_spark/HofMoellerSpark.py | 8 ++++---- data-access/nexustiles/backends/zarr/backend.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/analysis/webservice/algorithms_spark/HofMoellerSpark.py b/analysis/webservice/algorithms_spark/HofMoellerSpark.py index 6231bdb1..90ca87c0 100644 --- a/analysis/webservice/algorithms_spark/HofMoellerSpark.py +++ b/analysis/webservice/algorithms_spark/HofMoellerSpark.py @@ -44,12 +44,12 @@ class HofMoellerCalculator(object): def hofmoeller_stats(tile_service_factory, metrics_callback, tile_in_spark): (latlon, tile_id, index, - min_lat, max_lat, min_lon, max_lon) = tile_in_spark + min_lat, max_lat, min_lon, max_lon, dataset) = tile_in_spark tile_service = tile_service_factory() try: # Load the dataset 
tile - tile = tile_service.find_tile_by_id(tile_id, metrics_callback=metrics_callback)[0] + tile = tile_service.find_tile_by_id(tile_id, metrics_callback=metrics_callback, ds=dataset)[0] calculation_start = datetime.now() # Mask it to the search domain tile = tile_service.mask_tiles_to_bbox(min_lat, max_lat, @@ -352,7 +352,7 @@ def calc(self, compute_options, **args): min_lon, min_lat, max_lon, max_lat = bbox.bounds - nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in + nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon, tile.dataset) for x, tile in enumerate(self._get_tile_service().find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, metrics_callback=metrics_record.record_metrics, @@ -408,7 +408,7 @@ def calc(self, compute_options, **args): min_lon, min_lat, max_lon, max_lat = bbox.bounds - nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon) for x, tile in + nexus_tiles_spark = [(self._latlon, tile.tile_id, x, min_lat, max_lat, min_lon, max_lon, tile.dataset) for x, tile in enumerate(self._get_tile_service().find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time, metrics_callback=metrics_record.record_metrics, diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 214a991b..29099d28 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -198,7 +198,7 @@ def find_tiles_in_box(self, min_lat, max_lat, min_lon, max_lon, ds=None, start_t times = None if 0 <= start_time <= end_time: - if kwargs.get('distinct', False): + if kwargs.get('distinct', True): times_asc = self.find_days_in_range_asc(min_lat, max_lat, min_lon, max_lon, ds, start_time, end_time) times = [(t, t) for t in times_asc] else: @@ -416,11 +416,15 @@ def __fetch_data_for_tile(self, tile: Tile): tile.times = ma.masked_invalid(times) - tile.data = ma.masked_invalid( - [matched[var].to_numpy() for var in self.__variables] - ) + var_data = [matched[var].to_numpy() for var in self.__variables] + + if len(self.__variables) > 1: + tile.data = ma.masked_invalid(var_data) + tile.is_multi = True + else: + tile.data = ma.masked_invalid(var_data[0]) + tile.is_multi = False - tile.is_multi = True def _metadata_store_docs_to_tiles(self, *store_docs): return [ZarrBackend.__nts_url_to_tile(d) for d in store_docs] From 0aef0f13d0b2178724eb6aa9e198fb4260066920 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 2 Aug 2023 14:34:15 -0700 Subject: [PATCH 24/70] DDAS adjustments --- .../algorithms_spark/DailyDifferenceAverageSpark.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis/webservice/algorithms_spark/DailyDifferenceAverageSpark.py b/analysis/webservice/algorithms_spark/DailyDifferenceAverageSpark.py index b4245783..12f7deec 100644 --- a/analysis/webservice/algorithms_spark/DailyDifferenceAverageSpark.py +++ b/analysis/webservice/algorithms_spark/DailyDifferenceAverageSpark.py @@ -324,7 +324,7 @@ def calculate_diff(tile_service_factory, tile_ids, bounding_wkt, dataset, climat for tile_id in tile_ids: # Get the dataset tile try: - dataset_tile = get_dataset_tile(tile_service, wkt.loads(bounding_wkt.value), tile_id) + dataset_tile = get_dataset_tile(tile_service, wkt.loads(bounding_wkt.value), tile_id, dataset.value) except NoDatasetTile: # This should only happen if all measurements in a tile become masked after 
applying the bounding polygon continue @@ -348,12 +348,12 @@ def calculate_diff(tile_service_factory, tile_ids, bounding_wkt, dataset, climat return chain(*diff_generators) -def get_dataset_tile(tile_service, search_bounding_shape, tile_id): +def get_dataset_tile(tile_service, search_bounding_shape, tile_id, dataset): the_time = datetime.now() try: # Load the dataset tile - dataset_tile = tile_service.find_tile_by_id(tile_id)[0] + dataset_tile = tile_service.find_tile_by_id(tile_id, ds=dataset)[0] # Mask it to the search domain dataset_tile = tile_service.mask_tiles_to_polygon(search_bounding_shape, [dataset_tile])[0] except IndexError: From 42b912ebec6e445ef4d163f7774c50aa1c422339 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 3 Aug 2023 14:33:26 -0700 Subject: [PATCH 25/70] find_tile_by_polygon_and_most_recent_day_of_year impl --- .../nexustiles/backends/zarr/backend.py | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 29099d28..d592954c 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -174,7 +174,24 @@ def find_tile_by_polygon_and_most_recent_day_of_year(self, bounding_polygon, ds, :param day_of_year: Tile day of year to search for, tile nearest to this day (without going over) will be returned :return: List of one tile from ds with bounding_polygon on or before day_of_year or raise NexusTileServiceException if no tile found """ - raise NotImplementedError() + + times = self.__ds[self.__time].to_numpy() + + to_doy = lambda dt: datetime.utcfromtimestamp(int(dt)).timetuple().tm_yday + + vfunc = np.vectorize(to_doy) + days_of_year = vfunc(times.astype(datetime) / 1e9) + + try: + time = times[np.where(days_of_year <= day_of_year)[0][-1]].astype(datetime) / 1e9 + except IndexError: + raise NexusTileServiceException(reason='No tiles matched') + + min_lon, min_lat, max_lon, max_lat = bounding_polygon.bounds + + return self.find_tiles_in_box( + min_lat, max_lat, min_lon, max_lon, ds, time, time + ) def find_all_tiles_in_box_at_time(self, min_lat, max_lat, min_lon, max_lon, dataset, time, **kwargs): return self.find_tiles_in_box(min_lat, max_lat, min_lon, max_lon, dataset, time, time, **kwargs) @@ -371,14 +388,14 @@ def __fetch_data_for_tile(self, tile: Tile): max_lat = None max_lon = None - min_time = float(tile.min_time) - max_time = float(tile.max_time) - - if min_time: - min_time = datetime.utcfromtimestamp(min_time) + min_time = tile.min_time + max_time = tile.max_time - if max_time: - max_time = datetime.utcfromtimestamp(max_time) + # if min_time: + # min_time = datetime.utcfromtimestamp(min_time) + # + # if max_time: + # max_time = datetime.utcfromtimestamp(max_time) if bbox: min_lat = bbox.min_lat @@ -451,12 +468,14 @@ def __nts_url_to_tile(nts_url): tile.dataset_id = url.path try: - tile.min_time = int(url.query['min_time']) + # tile.min_time = int(url.query['min_time']) + tile.min_time = datetime.utcfromtimestamp(int(url.query['min_time'])) except KeyError: pass try: - tile.max_time = int(url.query['max_time']) + # tile.max_time = int(url.query['max_time']) + tile.max_time = datetime.utcfromtimestamp(int(url.query['max_time'])) except KeyError: pass From 1559fbafee08a73e0ab8c44e063e30870592a078 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 8 Aug 2023 15:35:05 -0700 Subject: [PATCH 26/70] Don't sel by time if neither max nor min time are given --- 
data-access/nexustiles/backends/zarr/backend.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index d592954c..c8fd0fe1 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -410,7 +410,10 @@ def __fetch_data_for_tile(self, tile: Tile): sel_t = {} - if min_time == max_time: + if min_time is None and max_time is None: + sel_t = None + method = None + elif min_time == max_time: sel_t[self.__time] = [min_time] # List, otherwise self.__time dim will be dropped method = 'nearest' else: @@ -421,7 +424,10 @@ def __fetch_data_for_tile(self, tile: Tile): TileVariable(v, v) for v in self.__variables ] - matched = self.__ds.sel(sel_g).sel(sel_t, method=method) + matched = self.__ds.sel(sel_g) #.sel(sel_t, method=method) + + if sel_t is not None: + matched = matched.sel(sel_t, method=method) tile.latitudes = ma.masked_invalid(matched[self.__latitude].to_numpy()) tile.longitudes = ma.masked_invalid(matched[self.__longitude].to_numpy()) From 2bb52afb0925e89921f3576138228defea966c47 Mon Sep 17 00:00:00 2001 From: rileykk Date: Tue, 15 Aug 2023 13:15:29 -0700 Subject: [PATCH 27/70] Fix not calling partial when needed --- analysis/webservice/algorithms_spark/CorrMapSpark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/webservice/algorithms_spark/CorrMapSpark.py b/analysis/webservice/algorithms_spark/CorrMapSpark.py index fe1954df..7336993a 100644 --- a/analysis/webservice/algorithms_spark/CorrMapSpark.py +++ b/analysis/webservice/algorithms_spark/CorrMapSpark.py @@ -57,7 +57,7 @@ def _map(tile_service_factory, tile_in): # print 'days_at_a_time = ', days_at_a_time t_incr = 86400 * days_at_a_time - tile_service = tile_service_factory + tile_service = tile_service_factory() # Compute the intermediate summations needed for the Pearson # Correlation Coefficient. 
We use a one-pass online algorithm From f9dc2aebd77c1739a24823eaf2f529bba220ee4e Mon Sep 17 00:00:00 2001 From: rileykk Date: Fri, 18 Aug 2023 09:32:18 -0700 Subject: [PATCH 28/70] Pinned s3fs and fsspec versions --- data-access/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data-access/requirements.txt b/data-access/requirements.txt index c732bede..db1bf2cf 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -20,7 +20,7 @@ urllib3==1.26.2 requests nexusproto Shapely -s3fs -fsspec +s3fs==2022.5.0 +fsspec==2022.5.0 xarray~=2022.3.0 numpy==1.24.3 \ No newline at end of file From a6f602d63705bb753ccfaaced202de366c0dd462 Mon Sep 17 00:00:00 2001 From: rileykk Date: Fri, 18 Aug 2023 11:44:10 -0700 Subject: [PATCH 29/70] Fixed some dependencies to ensure image builds properly + s3fs works --- analysis/conda-requirements.txt | 3 ++- data-access/requirements.txt | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/analysis/conda-requirements.txt b/analysis/conda-requirements.txt index 902d5114..22dff066 100644 --- a/analysis/conda-requirements.txt +++ b/analysis/conda-requirements.txt @@ -22,7 +22,8 @@ pytz==2021.1 utm==0.6.0 shapely==1.7.1 backports.functools_lru_cache==1.6.1 -boto3==1.16.63 +boto3>=1.16.63 +botocore==1.24.21 pillow==8.1.0 mpld3=0.5.1 tornado==6.1 diff --git a/data-access/requirements.txt b/data-access/requirements.txt index db1bf2cf..48a1fc6a 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -22,5 +22,9 @@ nexusproto Shapely s3fs==2022.5.0 fsspec==2022.5.0 +botocore==1.24.21 +aiohttp==3.8.1 xarray~=2022.3.0 -numpy==1.24.3 \ No newline at end of file +numpy==1.24.3 +pandas<2.1.0rc0 # Temporary restriction because 2.1.0rc0 fails to build + From 1a451eba314f17d9fabcbf152b4214ab9819da4a Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 21 Aug 2023 07:39:23 -0700 Subject: [PATCH 30/70] Config override for backends --- data-access/nexustiles/nexustiles.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index ed526c55..b4fd6bba 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -141,9 +141,6 @@ def __init__(self, config=None): self._alg_config = config - if config: - self.override_config(config) - if not NexusTileService.backends: NexusTileService.ds_config = configparser.RawConfigParser() NexusTileService.ds_config.read(NexusTileService._get_config_files('config/datasets.ini')) @@ -153,6 +150,9 @@ def __init__(self, config=None): NexusTileService.backends[None] = default_backend NexusTileService.backends['__nexusproto__'] = default_backend + if config: + self.override_config(config) + if not NexusTileService.__update_thread: NexusTileService.__update_thread = threading.Thread( target=NexusTileService.__update_datasets_loop, @@ -377,6 +377,10 @@ def override_config(self, config): for option in config.options(section): if config.get(section, option) is not None: self._config.set(section, option, config.get(section, option)) + if NexusTileService.ds_config.has_section(section): # only override preexisting section, ignores the other + for option in config.options(section): + if config.get(section, option) is not None: + NexusTileService.ds_config.set(section, option, config.get(section, option)) def get_dataseries_list(self, simple=False): datasets = [] From 6f8f7b10f60c316b1a8dd6ed39984b81a8e19294 Mon Sep 17 00:00:00 2001 
From: rileykk Date: Mon, 21 Aug 2023 07:40:51 -0700 Subject: [PATCH 31/70] Deps update --- data-access/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/data-access/requirements.txt b/data-access/requirements.txt index 48a1fc6a..9001ed34 100644 --- a/data-access/requirements.txt +++ b/data-access/requirements.txt @@ -25,6 +25,7 @@ fsspec==2022.5.0 botocore==1.24.21 aiohttp==3.8.1 xarray~=2022.3.0 +zarr>=2.11.3 numpy==1.24.3 pandas<2.1.0rc0 # Temporary restriction because 2.1.0rc0 fails to build From 483ad9f07d277cba13c18d946821a806995f1afd Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 31 Aug 2023 15:57:57 -0700 Subject: [PATCH 32/70] Add metadata from Zarr collection to /list --- data-access/nexustiles/backends/zarr/backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index c8fd0fe1..818d4b07 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -115,6 +115,8 @@ def get_dataseries_list(self, simple=False): ds['iso_start'] = datetime.utcfromtimestamp(min_date).strftime(ISO_8601) ds['iso_end'] = datetime.utcfromtimestamp(max_date).strftime(ISO_8601) + ds['metadata'] = dict(self.__ds.attrs) + return [ds] def find_tile_by_id(self, tile_id, **kwargs): From f5750c32eafaef0fb5f31795037c30cec1e1c325 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 14 Sep 2023 14:07:16 -0700 Subject: [PATCH 33/70] Zarr: Probe lat order and flip if necessary --- data-access/nexustiles/backends/zarr/backend.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 818d4b07..01559000 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -100,6 +100,13 @@ def __init__(self, dataset_name, path, config=None): logger.error(f'Failed to open zarr dataset at {self.__path}, ignoring it. Cause: {e}') raise NexusTileServiceException(f'Cannot open dataset ({e})') + lats = self.__ds[self.__latitude].to_numpy() + delta = lats[1] - lats[0] + + if delta < 0: + logger.warning(f'Latitude coordinate for {self._name} is in descending order. 
Flipping it to ascending') + self.__ds = self.__ds.isel({self.__latitude: slice(None, None, -1)}) + def get_dataseries_list(self, simple=False): ds = { "shortName": self._name, From 7fc260ae53b2b97f0140e12c1a18f9ba9d6e7b4e Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 20 Sep 2023 09:14:00 -0700 Subject: [PATCH 34/70] Strip quotes from variable names CM can sometimes publish with extra quotes resulting in KeyErrors --- data-access/nexustiles/backends/zarr/backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index 01559000..e1d0a0c1 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -68,6 +68,8 @@ def __init__(self, dataset_name, path, config=None): else: raise TypeError(f'Improper type for variables config: {type(data_vars)}') + self.__variables = [v.strip('\"\'') for v in self.__variables] + self.__longitude = config['coords']['longitude'] self.__latitude = config['coords']['latitude'] self.__time = config['coords']['time'] From b5df944ec6dcc3bb038c02f5849fd300c0a219c6 Mon Sep 17 00:00:00 2001 From: skorper Date: Mon, 25 Sep 2023 13:37:09 -0700 Subject: [PATCH 35/70] removed resultSizeLimit param from matchup --- analysis/webservice/algorithms_spark/Matchup.py | 17 +++-------------- analysis/webservice/apidocs/openapi.yml | 13 ------------- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index a55f61d1..77ecc346 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -137,14 +137,6 @@ class Matchup(NexusCalcSparkTornadoHandler): + "If true, only the nearest point will be returned for each primary point. " + "If false, all points within the tolerances will be returned for each primary point. Default: False" }, - "resultSizeLimit": { - "name": "Result Size Limit", - "type": "int", - "description": "Optional integer value that limits the number of results returned from the matchup. " - "If the number of primary matches is greater than this limit, the service will respond with " - "(HTTP 202: Accepted) and an empty response body. A value of 0 means return all results. 
" - "Default: 500" - }, "prioritizeDistance": { "name": "Prioritize distance", "type": "boolean", @@ -223,8 +215,6 @@ def parse_arguments(self, request): match_once = request.get_boolean_arg("matchOnce", default=False) - result_size_limit = request.get_int_arg("resultSizeLimit", default=500) - start_seconds_from_epoch = int((start_time - EPOCH).total_seconds()) end_seconds_from_epoch = int((end_time - EPOCH).total_seconds()) @@ -234,7 +224,7 @@ def parse_arguments(self, request): return bounding_polygon, primary_ds_name, secondary_ds_names, parameter_s, \ start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \ depth_min, depth_max, time_tolerance, radius_tolerance, \ - platforms, match_once, result_size_limit, prioritize_distance + platforms, match_once, prioritize_distance def get_job_pool(self, tile_ids): if len(tile_ids) > LARGE_JOB_THRESHOLD: @@ -244,7 +234,7 @@ def get_job_pool(self, tile_ids): def async_calc(self, execution_id, tile_ids, bounding_polygon, primary_ds_name, secondary_ds_names, parameter_s, start_time, end_time, depth_min, depth_max, time_tolerance, radius_tolerance, platforms, match_once, - result_size_limit, start, prioritize_distance): + start, prioritize_distance): # Call spark_matchup self.log.debug("Calling Spark Driver") @@ -310,7 +300,7 @@ def calc(self, request, tornado_io_loop, **args): bounding_polygon, primary_ds_name, secondary_ds_names, parameter_s, \ start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \ depth_min, depth_max, time_tolerance, radius_tolerance, \ - platforms, match_once, result_size_limit, prioritize_distance = self.parse_arguments(request) + platforms, match_once, prioritize_distance = self.parse_arguments(request) args = { "primary": primary_ds_name, @@ -380,7 +370,6 @@ def calc(self, request, tornado_io_loop, **args): radius_tolerance=radius_tolerance, platforms=platforms, match_once=match_once, - result_size_limit=result_size_limit, start=start, prioritize_distance=prioritize_distance )) diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index ea9b16ba..dc6fdb4a 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -154,19 +154,6 @@ paths: type: boolean default: false example: false - - in: query - name: resultSizeLimit - description: | - Optional integer value that limits the number of results - returned from the matchup. If the number of primary matches - is greater than this limit, the service will respond with - (HTTP 202 Accepted) and an empty response body. A value of - 0 means return all results. 
- required: false - schema: - type: integer - default: 500 - example: 500 - in: query name: prioritizeDistance description: | From 5e0fbb2521cc8ce2fa33281707aeb28950384a6b Mon Sep 17 00:00:00 2001 From: skorper Date: Mon, 25 Sep 2023 15:45:41 -0700 Subject: [PATCH 36/70] Add # of primaries/avergae secondaries to job output --- .../webservice/algorithms/doms/ExecutionStatus.py | 12 +++++++++++- .../webservice/algorithms/doms/ResultsStorage.py | 4 ++-- .../webservice/webmodel/NexusExecutionResults.py | 13 +++++++++++-- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/analysis/webservice/algorithms/doms/ExecutionStatus.py b/analysis/webservice/algorithms/doms/ExecutionStatus.py index 1bae4556..2add7b1f 100644 --- a/analysis/webservice/algorithms/doms/ExecutionStatus.py +++ b/analysis/webservice/algorithms/doms/ExecutionStatus.py @@ -53,6 +53,14 @@ def calc(self, request, **args): code=404 ) + # Get execution stats. This call will raise an exception if the + # execution is not done. + with ResultsRetrieval(self.config) as retrieval: + try: + execution_stats = retrieval.retrieveStats(execution_id) + except NexusProcessingException: + execution_stats = {} + job_status = NexusExecutionResults.ExecutionStatus(execution_details['status']) host = f'{request.requestHandler.request.protocol}://{request.requestHandler.request.host}' @@ -63,5 +71,7 @@ def calc(self, request, **args): execution_id=execution_id, message=execution_details['message'], params=execution_params, - host=host + host=host, + num_primary_matched=execution_stats.get('numPrimaryMatched'), + num_secondary_matched=execution_stats.get('numSecondaryMatched') ) diff --git a/analysis/webservice/algorithms/doms/ResultsStorage.py b/analysis/webservice/algorithms/doms/ResultsStorage.py index 39db27b3..99e3c6b7 100644 --- a/analysis/webservice/algorithms/doms/ResultsStorage.py +++ b/analysis/webservice/algorithms/doms/ResultsStorage.py @@ -286,7 +286,7 @@ def retrieveResults(self, execution_id, trim_data=False, page_num=1, page_size=1 execution_id = uuid.UUID(execution_id) params = self.retrieveParams(execution_id) - stats = self.__retrieveStats(execution_id) + stats = self.retrieveStats(execution_id) data = self.__retrieveData(execution_id, trim_data=trim_data, page_num=page_num, page_size=page_size) return params, stats, data @@ -357,7 +357,7 @@ def __rowToDataEntry(self, row, trim_data=False): return entry - def __retrieveStats(self, id): + def retrieveStats(self, id): cql = "SELECT num_gridded_matched, num_insitu_matched, time_to_complete FROM doms_execution_stats where execution_id = %s limit 1" rows = self._session.execute(cql, (id,)) for row in rows: diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index d5c12046..7cf9abb1 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -44,7 +44,8 @@ def construct_job_status(job_state, created, updated, execution_id, params, host } -def construct_done(status, created, completed, execution_id, params, host): +def construct_done(status, created, completed, execution_id, params, host, + num_primary_matched, num_secondary_matched): job_body = construct_job_status( status, created, @@ -53,6 +54,9 @@ def construct_done(status, created, completed, execution_id, params, host): params, host ) + # Add stats to body + job_body['totalPrimaryMatched'] = num_primary_matched + job_body['averageSecondaryMatched'] = 
round(num_secondary_matched/num_primary_matched) # Construct urls formats = [ @@ -112,7 +116,8 @@ def construct_cancelled(status, created, completed, execution_id, params, host): class NexusExecutionResults: def __init__(self, status=None, created=None, completed=None, execution_id=None, message='', - params=None, host=None, status_code=200): + params=None, host=None, status_code=200, num_primary_matched=None, + num_secondary_matched=None): self.status_code = status_code self.status = status self.created = created @@ -121,6 +126,8 @@ def __init__(self, status=None, created=None, completed=None, execution_id=None, self.message = message self.execution_params = params self.host = host + self.num_primary_matched = num_primary_matched + self.num_secondary_matched = num_secondary_matched def toJson(self): params = { @@ -132,6 +139,8 @@ def toJson(self): } if self.status == ExecutionStatus.SUCCESS: params['completed'] = self.completed + params['num_primary_matched'] = self.num_primary_matched + params['num_secondary_matched'] = self.num_secondary_matched construct = construct_done elif self.status == ExecutionStatus.RUNNING: construct = construct_running From fbad6b72bf649709cdb51152d49f65bf0a7c4cac Mon Sep 17 00:00:00 2001 From: skorper Date: Mon, 25 Sep 2023 15:48:01 -0700 Subject: [PATCH 37/70] rename to executionId --- analysis/webservice/apidocs/openapi.yml | 4 ++-- analysis/webservice/webmodel/NexusExecutionResults.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index dc6fdb4a..f5c57a3e 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -684,7 +684,7 @@ paths: - in: query name: id description: | - The job execution ID + The execution ID required: true schema: type: string @@ -702,7 +702,7 @@ paths: - in: query name: id description: | - The job execution ID + The execution ID required: true schema: type: string diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index 7cf9abb1..c80914dd 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -40,7 +40,7 @@ def construct_job_status(job_state, created, updated, execution_id, params, host 'rel': 'self' }], 'params': params, - 'jobID': execution_id + 'executionID': execution_id } From e0a5999792b466b502c65d7d042d911a5214a4ba Mon Sep 17 00:00:00 2001 From: skorper Date: Mon, 25 Sep 2023 15:50:20 -0700 Subject: [PATCH 38/70] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11789189..b8ed55b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,9 +24,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-482: Updated Saildrone in situ endpoint in config file - SDAP-485: Improved behavior for retrying failed Cassandra inserts when saving matchup results. - SDAP-487: Improved result fetch speed for large matchup results by tweaking `doms.doms_data` schema to support querying by primary value id. +- SDAP-493: + - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint + - Updated /job endpoint with details about number of primary and secondary tiles. ### Deprecated ### Removed - SDAP-465: Removed `climatology` directory. 
+- SDAP-493: + - Removed `resultSizeLimit` from /match_spark endpoint ### Fixed - SDAP-474: Fixed bug in CSV attributes where secondary dataset would be rendered as comma separated characters - SDAP-475: Bug fixes for `/timeSeriesSpark` and `/timeAvgMapSpark` From 8942afc55d7f438b35a2df7392b09496c19813c9 Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 29 Sep 2023 10:59:32 -0700 Subject: [PATCH 39/70] add totalSecondaryMatched field to /job output --- analysis/webservice/webmodel/NexusExecutionResults.py | 1 + 1 file changed, 1 insertion(+) diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index c80914dd..961fd198 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -56,6 +56,7 @@ def construct_done(status, created, completed, execution_id, params, host, ) # Add stats to body job_body['totalPrimaryMatched'] = num_primary_matched + job_body['totalSecondaryMatched'] = num_secondary_matched job_body['averageSecondaryMatched'] = round(num_secondary_matched/num_primary_matched) # Construct urls From dd73036307a313eaba57203f208196da2f6a3ab0 Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 29 Sep 2023 14:25:21 -0700 Subject: [PATCH 40/70] num unique secondaries addition --- .../algorithms/doms/DomsInitialization.py | 5 +++-- .../algorithms/doms/ExecutionStatus.py | 3 ++- .../algorithms/doms/ResultsStorage.py | 20 ++++++++++--------- .../webservice/algorithms_spark/Matchup.py | 8 +++++--- .../webmodel/NexusExecutionResults.py | 7 +++++-- 5 files changed, 26 insertions(+), 17 deletions(-) diff --git a/analysis/webservice/algorithms/doms/DomsInitialization.py b/analysis/webservice/algorithms/doms/DomsInitialization.py index 43627b14..a10a7e70 100644 --- a/analysis/webservice/algorithms/doms/DomsInitialization.py +++ b/analysis/webservice/algorithms/doms/DomsInitialization.py @@ -173,7 +173,7 @@ def createDomsDataTable(self, session): def createDomsExecutionStatsTable(self, session): log = logging.getLogger(__name__) - log.info("Verifying doms_execuction_stats table") + log.info("Verifying doms_execution_stats table") cql = """ CREATE TABLE IF NOT EXISTS doms_execution_stats ( execution_id uuid PRIMARY KEY, @@ -181,7 +181,8 @@ def createDomsExecutionStatsTable(self, session): num_gridded_checked int, num_insitu_matched int, num_insitu_checked int, - time_to_complete int + time_to_complete int, + num_unique_secondaries int ); """ session.execute(cql) diff --git a/analysis/webservice/algorithms/doms/ExecutionStatus.py b/analysis/webservice/algorithms/doms/ExecutionStatus.py index 2add7b1f..eafdbbbf 100644 --- a/analysis/webservice/algorithms/doms/ExecutionStatus.py +++ b/analysis/webservice/algorithms/doms/ExecutionStatus.py @@ -73,5 +73,6 @@ def calc(self, request, **args): params=execution_params, host=host, num_primary_matched=execution_stats.get('numPrimaryMatched'), - num_secondary_matched=execution_stats.get('numSecondaryMatched') + num_secondary_matched=execution_stats.get('numSecondaryMatched'), + num_unique_secondaries=execution_stats.get('numUniqueSecondaries') ) diff --git a/analysis/webservice/algorithms/doms/ResultsStorage.py b/analysis/webservice/algorithms/doms/ResultsStorage.py index 99e3c6b7..48b2122d 100644 --- a/analysis/webservice/algorithms/doms/ResultsStorage.py +++ b/analysis/webservice/algorithms/doms/ResultsStorage.py @@ -166,17 +166,18 @@ def __insertParams(self, execution_id, params): def __insertStats(self, execution_id, 
stats): cql = """ INSERT INTO doms_execution_stats - (execution_id, num_gridded_matched, num_gridded_checked, num_insitu_matched, num_insitu_checked, time_to_complete) + (execution_id, num_gridded_matched, num_gridded_checked, num_insitu_matched, num_insitu_checked, time_to_complete, num_unique_secondaries) VALUES - (%s, %s, %s, %s, %s, %s) + (%s, %s, %s, %s, %s, %s, %s) """ self._session.execute(cql, ( execution_id, - stats["numPrimaryMatched"], + stats['numPrimaryMatched'], None, - stats["numSecondaryMatched"], + stats['numSecondaryMatched'], None, - stats["timeToComplete"] + stats['timeToComplete'], + stats['numUniqueSecondaries'] )) def __insertResults(self, execution_id, results): @@ -358,13 +359,14 @@ def __rowToDataEntry(self, row, trim_data=False): return entry def retrieveStats(self, id): - cql = "SELECT num_gridded_matched, num_insitu_matched, time_to_complete FROM doms_execution_stats where execution_id = %s limit 1" + cql = "SELECT num_gridded_matched, num_insitu_matched, time_to_complete, num_unique_secondaries FROM doms_execution_stats where execution_id = %s limit 1" rows = self._session.execute(cql, (id,)) for row in rows: stats = { - "timeToComplete": row.time_to_complete, - "numSecondaryMatched": row.num_insitu_matched, - "numPrimaryMatched": row.num_gridded_matched, + 'timeToComplete': row.time_to_complete, + 'numSecondaryMatched': row.num_insitu_matched, + 'numPrimaryMatched': row.num_gridded_matched, + 'numUniqueSecondaries': row.num_unique_secondaries } return stats diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index 77ecc346..46d1d89d 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -276,10 +276,12 @@ def async_calc(self, execution_id, tile_ids, bounding_polygon, primary_ds_name, total_keys = len(list(spark_result.keys())) total_values = sum(len(v) for v in spark_result.values()) + unique_values = len(set([point.data_id for point in spark_result.values()])) details = { - "timeToComplete": int((end - start).total_seconds()), - "numSecondaryMatched": total_values, - "numPrimaryMatched": total_keys + 'timeToComplete': int((end - start).total_seconds()), + 'numSecondaryMatched': total_values, + 'numPrimaryMatched': total_keys, + 'numUniqueSecondaries': unique_values } matches = Matchup.convert_to_matches(spark_result) diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index 961fd198..2b0007ac 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -45,7 +45,7 @@ def construct_job_status(job_state, created, updated, execution_id, params, host def construct_done(status, created, completed, execution_id, params, host, - num_primary_matched, num_secondary_matched): + num_primary_matched, num_secondary_matched, num_unique_secondaries): job_body = construct_job_status( status, created, @@ -58,6 +58,7 @@ def construct_done(status, created, completed, execution_id, params, host, job_body['totalPrimaryMatched'] = num_primary_matched job_body['totalSecondaryMatched'] = num_secondary_matched job_body['averageSecondaryMatched'] = round(num_secondary_matched/num_primary_matched) + job_body['totalUniqueSecondaryMatched'] = num_unique_secondaries # Construct urls formats = [ @@ -118,7 +119,7 @@ def construct_cancelled(status, created, completed, execution_id, params, host): class NexusExecutionResults: def 
__init__(self, status=None, created=None, completed=None, execution_id=None, message='', params=None, host=None, status_code=200, num_primary_matched=None, - num_secondary_matched=None): + num_secondary_matched=None, num_unique_secondaries=None): self.status_code = status_code self.status = status self.created = created @@ -129,6 +130,7 @@ def __init__(self, status=None, created=None, completed=None, execution_id=None, self.host = host self.num_primary_matched = num_primary_matched self.num_secondary_matched = num_secondary_matched + self.num_unique_secondaries = num_unique_secondaries def toJson(self): params = { @@ -142,6 +144,7 @@ def toJson(self): params['completed'] = self.completed params['num_primary_matched'] = self.num_primary_matched params['num_secondary_matched'] = self.num_secondary_matched + params['num_unique_secondaries'] = self.num_unique_secondaries construct = construct_done elif self.status == ExecutionStatus.RUNNING: construct = construct_running From db68d4fa8f1feb4682500b9947c875d286d191ff Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 13 Oct 2023 10:12:01 -0700 Subject: [PATCH 41/70] updated docs to use correct sea_water_temperature param name --- analysis/webservice/apidocs/openapi.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index f5c57a3e..b719ad85 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -139,8 +139,7 @@ paths: required: false schema: type: string - default: sea_surface_temperature - example: sea_surface_temperature + example: sea_water_temperature - in: query name: matchOnce description: | From a8be9b8c599f949ebe00a595c879973132956b4f Mon Sep 17 00:00:00 2001 From: skorper Date: Wed, 1 Nov 2023 15:28:43 -0700 Subject: [PATCH 42/70] bugfix --- analysis/webservice/algorithms_spark/Matchup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index 46d1d89d..8955d95c 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -276,7 +276,7 @@ def async_calc(self, execution_id, tile_ids, bounding_polygon, primary_ds_name, total_keys = len(list(spark_result.keys())) total_values = sum(len(v) for v in spark_result.values()) - unique_values = len(set([point.data_id for point in spark_result.values()])) + unique_values = len(set([point.data_id for v in spark_result.values() for point in v])) details = { 'timeToComplete': int((end - start).total_seconds()), 'numSecondaryMatched': total_values, From 62de86772600c94110c748b3a9358712094cab8f Mon Sep 17 00:00:00 2001 From: skorper Date: Mon, 6 Nov 2023 13:39:02 -0800 Subject: [PATCH 43/70] fix division by zero bug --- analysis/webservice/webmodel/NexusExecutionResults.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index 2b0007ac..7dd7af99 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -57,7 +57,8 @@ def construct_done(status, created, completed, execution_id, params, host, # Add stats to body job_body['totalPrimaryMatched'] = num_primary_matched job_body['totalSecondaryMatched'] = num_secondary_matched - job_body['averageSecondaryMatched'] = 
round(num_secondary_matched/num_primary_matched) + job_body['averageSecondaryMatched'] = round(num_secondary_matched/num_primary_matched) \ + if num_primary_matched > 0 else 0 job_body['totalUniqueSecondaryMatched'] = num_unique_secondaries # Construct urls From 972f3ddf076af8c51311a62e8e23a84a9f926d91 Mon Sep 17 00:00:00 2001 From: rileykk Date: Wed, 8 Nov 2023 14:58:11 -0800 Subject: [PATCH 44/70] add params to dataset management handler classes --- analysis/webservice/management/Datasets.py | 42 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/analysis/webservice/management/Datasets.py b/analysis/webservice/management/Datasets.py index ded1e8a2..40b267fd 100644 --- a/analysis/webservice/management/Datasets.py +++ b/analysis/webservice/management/Datasets.py @@ -93,7 +93,24 @@ def toJson(self): class DatasetAdd(DatasetManagement): name = 'Add dataset' path = '/datasets/add' - description = "Add new dataset to running SDAP instance" + description = "Add new Zarr dataset to running SDAP instance" + params = { + "name": { + "name": "Dataset name", + "type": "string", + "description": "Name of new dataset to add" + }, + "path": { + "name": "Path or URL", + "type": "string", + "description": "Path/URL of Zarr group" + }, + "body": { + "name": "Request body", + "type": "application/json OR application/yaml", + "description": "POST request body. Config options for Zarr (variabe, coords, aws (if applicable))" + } + } def __init__(self, **args): pass @@ -147,7 +164,19 @@ def calc(self, request: NexusRequestObject, **args): class DatasetUpdate(DatasetManagement): name = 'Update dynamically added dataset' path = '/datasets/update' - description = "Update dataset in running SDAP instance" + description = "Update Zarr dataset in running SDAP instance" + params = { + "name": { + "name": "Dataset name", + "type": "string", + "description": "Name of dataset to update" + }, + "body": { + "name": "Request body", + "type": "application/json OR application/yaml", + "description": "POST request body. Config options for Zarr (variabe, coords, aws (if applicable))" + } + } def __init__(self, **args): pass @@ -182,7 +211,14 @@ def calc(self, request: NexusRequestObject, **args): class DatasetDelete(DatasetManagement): name = 'Remove dataset' path = '/datasets/remove' - description = "Remove dataset from running SDAP instance" + description = "Remove Zarr dataset from running SDAP instance" + params = { + "name": { + "name": "Dataset name", + "type": "string", + "description": "Name of dataset to remove" + } + } def __init__(self, **args): pass From 831ca37b4d4b524cfd6804edcc3aa5d874a39271 Mon Sep 17 00:00:00 2001 From: skorper Date: Thu, 16 Nov 2023 09:23:30 -0800 Subject: [PATCH 45/70] add page number to default filename for matchup output --- CHANGELOG.md | 1 + analysis/webservice/algorithms/doms/BaseDomsHandler.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e0a4e98..64e65c95 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-493: - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint - Updated /job endpoint with details about number of primary and secondary tiles. +- SDAP-499: Added page number to default filename for matchup output ### Deprecated ### Removed - SDAP-465: Removed `climatology` directory. 
diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 84c91633..d4dcd512 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -114,7 +114,7 @@ def toNetCDF(self): return DomsNetCDFFormatter.create(self.__executionId, self.results(), self.__args, self.__details) def filename(self): - return f'CDMS_{self.__executionId}' + return f'CDMS_{self.__executionId}_page{self.__details["pageNum"]}' class DomsCSVFormatter: From 4ab2f9b4a4f72a34b922cb496cedb87e684335b7 Mon Sep 17 00:00:00 2001 From: skorper Date: Thu, 16 Nov 2023 13:22:09 -0800 Subject: [PATCH 46/70] pagination improvements --- .../algorithms/doms/ExecutionStatus.py | 5 ++- .../algorithms/doms/ResultsRetrieval.py | 2 + .../webservice/algorithms_spark/Matchup.py | 10 ++--- analysis/webservice/apidocs/openapi.yml | 14 +++++++ .../webmodel/NexusExecutionResults.py | 40 ++++++++++++------- 5 files changed, 50 insertions(+), 21 deletions(-) diff --git a/analysis/webservice/algorithms/doms/ExecutionStatus.py b/analysis/webservice/algorithms/doms/ExecutionStatus.py index 17c6ca95..63cf423b 100644 --- a/analysis/webservice/algorithms/doms/ExecutionStatus.py +++ b/analysis/webservice/algorithms/doms/ExecutionStatus.py @@ -42,6 +42,8 @@ def calc(self, request, **args): except ValueError: raise NexusProcessingException(reason='"id" argument must be a valid uuid', code=400) + filename = request.get_argument('filename', None) + # Check if the job is done with ResultsRetrieval(self.config) as retrieval: try: @@ -74,5 +76,6 @@ def calc(self, request, **args): host=host, num_primary_matched=execution_stats.get('numPrimaryMatched'), num_secondary_matched=execution_stats.get('numSecondaryMatched'), - num_unique_secondaries=execution_stats.get('numUniqueSecondaries') + num_unique_secondaries=execution_stats.get('numUniqueSecondaries'), + filename=filename ) diff --git a/analysis/webservice/algorithms/doms/ResultsRetrieval.py b/analysis/webservice/algorithms/doms/ResultsRetrieval.py index f03c1caa..cdec9294 100644 --- a/analysis/webservice/algorithms/doms/ResultsRetrieval.py +++ b/analysis/webservice/algorithms/doms/ResultsRetrieval.py @@ -45,6 +45,8 @@ def calc(self, computeOptions, **args): simple_results = computeOptions.get_boolean_arg("simpleResults", default=False) + filename = computeOptions.get_argument("filename", default=None) + with ResultsStorage.ResultsRetrieval(self.config) as storage: params, stats, data = storage.retrieveResults(execution_id, trim_data=simple_results, page_num=page_num, page_size=page_size) diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index 8955d95c..7c7f551b 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -219,12 +219,13 @@ def parse_arguments(self, request): end_seconds_from_epoch = int((end_time - EPOCH).total_seconds()) prioritize_distance = request.get_boolean_arg("prioritizeDistance", default=True) + filename = request.get_argument('filename', default=None) return bounding_polygon, primary_ds_name, secondary_ds_names, parameter_s, \ start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \ depth_min, depth_max, time_tolerance, radius_tolerance, \ - platforms, match_once, prioritize_distance + platforms, match_once, prioritize_distance, filename def get_job_pool(self, tile_ids): if len(tile_ids) > 
LARGE_JOB_THRESHOLD: @@ -302,7 +303,7 @@ def calc(self, request, tornado_io_loop, **args): bounding_polygon, primary_ds_name, secondary_ds_names, parameter_s, \ start_time, start_seconds_from_epoch, end_time, end_seconds_from_epoch, \ depth_min, depth_max, time_tolerance, radius_tolerance, \ - platforms, match_once, prioritize_distance = self.parse_arguments(request) + platforms, match_once, prioritize_distance, filename = self.parse_arguments(request) args = { "primary": primary_ds_name, @@ -375,9 +376,8 @@ def calc(self, request, tornado_io_loop, **args): start=start, prioritize_distance=prioritize_distance )) - - request.requestHandler.redirect(f'/job?id={execution_id}') - + filename_param = f'&filename={filename}' if filename else '' + request.requestHandler.redirect(f'/job?id={execution_id}{filename_param}') @classmethod def convert_to_matches(cls, spark_result): diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index b719ad85..8c6efdc9 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -166,6 +166,13 @@ paths: type: boolean default: true example: true + - in: query + name: filename + description: | + Optional filename. Will be passed into /job and results links + required: false + schema: + type: string responses: '200': description: Successful operation @@ -689,6 +696,13 @@ paths: type: string format: uuid example: c864a51b-3d87-4872-9070-632820b1cae2 + - in: query + name: filename + description: | + Optional filename. Will be passed into /job results links + required: false + schema: + type: string /job/cancel: get: summary: | diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index 7dd7af99..47a891a9 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -27,15 +27,17 @@ class ExecutionStatus(Enum): CANCELLED = 'cancelled' -def construct_job_status(job_state, created, updated, execution_id, params, host, message=''): +def construct_job_status(job_state, created, updated, execution_id, params, host, message='', + filename=None): + filename_param = f'&filename={filename}' if filename else '' return { 'status': job_state.value, 'message': message, 'createdAt': created, 'updatedAt': updated, 'links': [{ - 'href': f'{host}/job?id={execution_id}', - 'title': 'The current page', + 'href': f'{host}/job?id={execution_id}{filename_param}', + 'title': 'Get job status - the current page', 'type': 'application/json', 'rel': 'self' }], @@ -45,14 +47,15 @@ def construct_job_status(job_state, created, updated, execution_id, params, host def construct_done(status, created, completed, execution_id, params, host, - num_primary_matched, num_secondary_matched, num_unique_secondaries): + num_primary_matched, num_secondary_matched, num_unique_secondaries, filename): job_body = construct_job_status( status, created, completed, execution_id, params, - host + host, + filename=filename ) # Add stats to body job_body['totalPrimaryMatched'] = num_primary_matched @@ -61,6 +64,8 @@ def construct_done(status, created, completed, execution_id, params, host, if num_primary_matched > 0 else 0 job_body['totalUniqueSecondaryMatched'] = num_unique_secondaries + filename_param = f'&filename={filename}' if filename else '' + # Construct urls formats = [ ('CSV', 'text/csv'), @@ -68,8 +73,8 @@ def construct_done(status, created, completed, execution_id, params, host, ('NETCDF', 
'binary/octet-stream') ] data_links = [{ - 'href': f'{host}/cdmsresults?id={execution_id}&output={output_format}', - 'title': 'Download results', + 'href': f'{host}/cdmsresults?id={execution_id}&output={output_format}{filename_param}', + 'title': f'Download {output_format} results', 'type': mime, 'rel': 'data' } for output_format, mime in formats] @@ -77,14 +82,15 @@ def construct_done(status, created, completed, execution_id, params, host, return job_body -def construct_running(status, created, execution_id, params, host): +def construct_running(status, created, execution_id, params, host, filename): job_body = construct_job_status( status, created, None, execution_id, params, - host + host, + filename=filename ) job_body['links'].append({ 'href': f'{host}/job/cancel?id={execution_id}', @@ -94,7 +100,7 @@ def construct_running(status, created, execution_id, params, host): return job_body -def construct_error(status, created, completed, execution_id, message, params, host): +def construct_error(status, created, completed, execution_id, message, params, host, filename): return construct_job_status( status, created, @@ -102,25 +108,27 @@ def construct_error(status, created, completed, execution_id, message, params, h execution_id, params, host, - message + message, + filename=filename ) -def construct_cancelled(status, created, completed, execution_id, params, host): +def construct_cancelled(status, created, completed, execution_id, params, host, filename): return construct_job_status( status, created, completed, execution_id, params, - host + host, + filename=filename ) class NexusExecutionResults: def __init__(self, status=None, created=None, completed=None, execution_id=None, message='', params=None, host=None, status_code=200, num_primary_matched=None, - num_secondary_matched=None, num_unique_secondaries=None): + num_secondary_matched=None, num_unique_secondaries=None, filename=None): self.status_code = status_code self.status = status self.created = created @@ -132,6 +140,7 @@ def __init__(self, status=None, created=None, completed=None, execution_id=None, self.num_primary_matched = num_primary_matched self.num_secondary_matched = num_secondary_matched self.num_unique_secondaries = num_unique_secondaries + self.filename = filename def toJson(self): params = { @@ -139,7 +148,8 @@ def toJson(self): 'created': self.created, 'execution_id': self.execution_id, 'params': self.execution_params, - 'host': self.host + 'host': self.host, + 'filename': self.filename } if self.status == ExecutionStatus.SUCCESS: params['completed'] = self.completed From 3677c11db7b9346933a185f5129fd670234cd4ca Mon Sep 17 00:00:00 2001 From: skorper Date: Thu, 16 Nov 2023 13:26:48 -0800 Subject: [PATCH 47/70] removed debugging line --- analysis/webservice/algorithms/doms/ResultsRetrieval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/analysis/webservice/algorithms/doms/ResultsRetrieval.py b/analysis/webservice/algorithms/doms/ResultsRetrieval.py index cdec9294..f03c1caa 100644 --- a/analysis/webservice/algorithms/doms/ResultsRetrieval.py +++ b/analysis/webservice/algorithms/doms/ResultsRetrieval.py @@ -45,8 +45,6 @@ def calc(self, computeOptions, **args): simple_results = computeOptions.get_boolean_arg("simpleResults", default=False) - filename = computeOptions.get_argument("filename", default=None) - with ResultsStorage.ResultsRetrieval(self.config) as storage: params, stats, data = storage.retrieveResults(execution_id, trim_data=simple_results, page_num=page_num, page_size=page_size) From 
86f1348d6d283eb688b1b619f95dde2f8e635218 Mon Sep 17 00:00:00 2001 From: skorper Date: Thu, 16 Nov 2023 13:27:43 -0800 Subject: [PATCH 48/70] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e0a4e98..6ffff5dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-493: - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint - Updated /job endpoint with details about number of primary and secondary tiles. +- SDAP-500: Improvements to SDAP Asynchronous Jobs ### Deprecated ### Removed - SDAP-465: Removed `climatology` directory. From 1e8cc4e9d31d295e172c0db4bba61a5776642bea Mon Sep 17 00:00:00 2001 From: Riley Kuttruff <72955101+RKuttruff@users.noreply.github.com> Date: Mon, 27 Nov 2023 15:44:38 -0800 Subject: [PATCH 49/70] Update helm cassandra dependency (#289) * Update helm cassandra dependency * Bump default cassandra PV to 4 * Bump default cassandra PV to 4 in tools * Changelog * Fixed small documentation issue --------- Co-authored-by: rileykk --- CHANGELOG.md | 1 + analysis/webservice/algorithms/doms/ResultsStorage.py | 3 +++ analysis/webservice/algorithms/doms/domsconfig.ini.default | 2 +- data-access/nexustiles/config/datastores.ini.default | 2 +- helm/requirements.yaml | 2 +- helm/values.yaml | 3 +-- tools/deletebyquery/deletebyquery.py | 2 +- tools/doms-data-tools/update_doms_data_pk.py | 2 +- tools/domspurge/README.md | 2 +- tools/domspurge/purge.py | 2 +- 10 files changed, 12 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ffff5dc..5e36c0a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Status code for results endpoint if execution id is not found fixed to be `404` instead of `500`. - Ensured links in the `/job` endpoint are https - SDAP-488: Workaround to build issue on Apple Silicon (M1/M2). Image build installs nexusproto through PyPI instead of building from source. A build arg `BUILD_NEXUSPROTO` was defined to allow building from source if desired +- SDAP-496: Fix `solr-cloud-init` image failing to run. 
### Security ## [1.1.0] - 2023-04-26 diff --git a/analysis/webservice/algorithms/doms/ResultsStorage.py b/analysis/webservice/algorithms/doms/ResultsStorage.py index 48b2122d..1dea1610 100644 --- a/analysis/webservice/algorithms/doms/ResultsStorage.py +++ b/analysis/webservice/algorithms/doms/ResultsStorage.py @@ -65,6 +65,9 @@ def __enter__(self): dc_policy = DCAwareRoundRobinPolicy(cassDatacenter) token_policy = TokenAwarePolicy(dc_policy) + logger.info(f'Connecting to Cassandra cluster @ {[host for host in cassHost.split(",")]}; datacenter: ' + f'{cassDatacenter}; protocol version: {cassVersion}') + self._cluster = Cluster([host for host in cassHost.split(',')], load_balancing_policy=token_policy, protocol_version=cassVersion, auth_provider=auth_provider) diff --git a/analysis/webservice/algorithms/doms/domsconfig.ini.default b/analysis/webservice/algorithms/doms/domsconfig.ini.default index 55f9b16c..f4e44960 100644 --- a/analysis/webservice/algorithms/doms/domsconfig.ini.default +++ b/analysis/webservice/algorithms/doms/domsconfig.ini.default @@ -18,7 +18,7 @@ host=localhost port=9042 keyspace=doms local_datacenter=datacenter1 -protocol_version=3 +protocol_version=4 dc_policy=DCAwareRoundRobinPolicy username= password= diff --git a/data-access/nexustiles/config/datastores.ini.default b/data-access/nexustiles/config/datastores.ini.default index d8db1902..51455a38 100644 --- a/data-access/nexustiles/config/datastores.ini.default +++ b/data-access/nexustiles/config/datastores.ini.default @@ -18,7 +18,7 @@ host=localhost port=9042 keyspace=nexustiles local_datacenter=datacenter1 -protocol_version=3 +protocol_version=4 dc_policy=DCAwareRoundRobinPolicy username= password= diff --git a/helm/requirements.yaml b/helm/requirements.yaml index a9996586..1de8cf0f 100644 --- a/helm/requirements.yaml +++ b/helm/requirements.yaml @@ -12,7 +12,7 @@ dependencies: repository: https://raw.githubusercontent.com/bitnami/charts/archive-full-index/bitnami condition: solr.enabled - name: cassandra - version: 5.5.3 + version: 9.1.7 repository: https://raw.githubusercontent.com/bitnami/charts/archive-full-index/bitnami condition: cassandra.enabled diff --git a/helm/values.yaml b/helm/values.yaml index 4105362e..fe2481ef 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -195,8 +195,7 @@ cassandra: dbUser: user: cassandra password: cassandra - cluster: - replicaCount: 1 + replicaCount: 1 persistence: storageClass: hostpath size: 8Gi diff --git a/tools/deletebyquery/deletebyquery.py b/tools/deletebyquery/deletebyquery.py index 4fb7bd66..8b98111a 100644 --- a/tools/deletebyquery/deletebyquery.py +++ b/tools/deletebyquery/deletebyquery.py @@ -262,7 +262,7 @@ def parse_args(): help='The version of the Cassandra protocol the driver should use.', required=False, choices=['1', '2', '3', '4', '5'], - default='3') + default='4') parser.add_argument('--solr-rows', help='Number of rows to fetch with each Solr query to build the list of tiles to delete', diff --git a/tools/doms-data-tools/update_doms_data_pk.py b/tools/doms-data-tools/update_doms_data_pk.py index ed8dbe5e..749995da 100644 --- a/tools/doms-data-tools/update_doms_data_pk.py +++ b/tools/doms-data-tools/update_doms_data_pk.py @@ -114,7 +114,7 @@ def main(): request_timeout=60.0, ) }, - protocol_version=3, + protocol_version=4, auth_provider=auth_provider) as cluster: session = cluster.connect('doms') diff --git a/tools/domspurge/README.md b/tools/domspurge/README.md index 92f7cfb1..e88b62f5 100644 --- a/tools/domspurge/README.md +++ 
b/tools/domspurge/README.md @@ -33,5 +33,5 @@ You can build an image for this script to run it in a Kubernetes CronJob. ```shell cd /incubator-sdap-nexus -docker build . -f Dockerfile -t sdap-local/DomsPurge: +docker build . -f tools/domspurge/Dockerfile -t sdap-local/DomsPurge: ``` diff --git a/tools/domspurge/purge.py b/tools/domspurge/purge.py index 4fb2fc37..d4bb15a8 100644 --- a/tools/domspurge/purge.py +++ b/tools/domspurge/purge.py @@ -270,7 +270,7 @@ def parse_args(): required=False, dest='pv', choices=['1', '2', '3', '4', '5'], - default='3') + default='4') time_before = purge_options.add_mutually_exclusive_group(required=True) From faed801d106dddf827faddf077f961d2a39c5492 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 14 Dec 2023 10:32:00 -0800 Subject: [PATCH 50/70] Register dataset docs with nexusproto backend + static getters --- data-access/nexustiles/AbstractTileService.py | 10 +++++++ .../nexustiles/backends/nexusproto/backend.py | 29 +++++++++++++++++++ .../nexustiles/backends/zarr/backend.py | 3 ++ data-access/nexustiles/nexustiles.py | 9 ++++++ 4 files changed, 51 insertions(+) diff --git a/data-access/nexustiles/AbstractTileService.py b/data-access/nexustiles/AbstractTileService.py index 20467784..c418180e 100644 --- a/data-access/nexustiles/AbstractTileService.py +++ b/data-access/nexustiles/AbstractTileService.py @@ -18,6 +18,7 @@ import numpy as np import numpy.ma as ma +from copy import deepcopy class AbstractTileService(ABC): @@ -32,6 +33,7 @@ class AbstractTileService(ABC): def __init__(self, dataset_name): self._name = dataset_name + self._ds_info = {} @abstractmethod def get_dataseries_list(self, simple=False): @@ -200,3 +202,11 @@ def fetch_data_for_tiles(self, *tiles): def _metadata_store_docs_to_tiles(self, *store_docs): raise NotImplementedError() + @abstractmethod + def update_metadata(self, solr_doc): + raise NotImplementedError() + + def get_metadata(self, dataset=None): # ds as param for nexusproto backend + return deepcopy(self._ds_info) + + diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py index 690b109c..3b9390ff 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -14,6 +14,7 @@ # limitations under the License. 
import configparser +import copy import logging import sys import json @@ -551,6 +552,34 @@ def pingSolr(self): else: return False + def update_metadata(self, solr_doc): + variables = solr_doc.get('variables_s', None) + + dataset = solr_doc['dataset_s'] + + if dataset not in self._ds_info: + self._ds_info[dataset] = {} + + if variables is not None: + variables = json.loads(variables) + + if isinstance(variables, dict): + variables = [variables] + else: + variables = [] + + self._ds_info[dataset]['variables'] = variables + + # print(self._ds_info) + + def get_metadata(self, dataset=None): + if dataset is None: + logger.error('Cannot pull metadata for nexusproto without specifying dataset name') + return {} + else: + return copy.deepcopy(self._ds_info[dataset]) + + @staticmethod def _get_config_files(filename): log = logging.getLogger(__name__) diff --git a/data-access/nexustiles/backends/zarr/backend.py b/data-access/nexustiles/backends/zarr/backend.py index e1d0a0c1..86081a27 100644 --- a/data-access/nexustiles/backends/zarr/backend.py +++ b/data-access/nexustiles/backends/zarr/backend.py @@ -463,6 +463,9 @@ def __fetch_data_for_tile(self, tile: Tile): def _metadata_store_docs_to_tiles(self, *store_docs): return [ZarrBackend.__nts_url_to_tile(d) for d in store_docs] + def update_metadata(self, solr_doc): + raise NotImplementedError() + @staticmethod def __nts_url_to_tile(nts_url): tile = Tile() diff --git a/data-access/nexustiles/nexustiles.py b/data-access/nexustiles/nexustiles.py index b4fd6bba..ef64e8f8 100644 --- a/data-access/nexustiles/nexustiles.py +++ b/data-access/nexustiles/nexustiles.py @@ -250,6 +250,7 @@ def _update_datasets(): if store_type == 'nexus_proto' or store_type == 'nexusproto': update_logger.info(f"Detected new nexusproto dataset {d_id}, using default nexusproto backend") NexusTileService.backends[d_id] = NexusTileService.backends[None] + NexusTileService.backends[d_id]['backend'].update_metadata(dataset) elif store_type == 'zarr': update_logger.info(f"Detected new zarr dataset {d_id}, opening new zarr backend") @@ -371,6 +372,14 @@ def user_ds_delete(name): return {'success': True} + @staticmethod + def get_metadata_for_dataset(ds_name): + try: + backend = NexusTileService._get_backend(ds_name) + return backend.get_metadata(ds_name) + except: + return None + def override_config(self, config): for section in config.sections(): if self._config.has_section(section): # only override preexisting section, ignores the other From 20902ebde81eacc6ed643977b57a64b615fbb3d6 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 14 Dec 2023 10:32:18 -0800 Subject: [PATCH 51/70] Matchup impl --- .../webservice/algorithms_spark/Matchup.py | 25 +++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index 30e9bb6a..c15b1e5d 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -41,6 +41,8 @@ from webservice.webmodel import NexusProcessingException from webservice.webmodel.NexusExecutionResults import ExecutionStatus +from nexustiles.nexustiles import NexusTileService + EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' @@ -494,14 +496,33 @@ def from_nexus_point(nexus_point, tile=None): else: data_vals = [nexus_point.data_vals] + ds_metadata = NexusTileService.get_metadata_for_dataset(tile.dataset) + + if ds_metadata is not None: + ds_vars = 
ds_metadata.get('variables', []) + else: + ds_vars = [] + + variable_dict = {} + + for v in ds_vars: + variable_dict[v['name']] = v + data = [] for data_val, variable in zip(data_vals, tile.variables): if data_val: + if variable.variable_name in variable_dict: + standard_name = variable_dict[variable.variable_name]['cf_standard_name'] + unit = variable_dict[variable.variable_name]['unit'] + else: + standard_name = variable.standard_name + unit = None + data.append(DataPoint( variable_name=variable.variable_name, variable_value=data_val, - cf_variable_name=variable.standard_name, - variable_unit=None + cf_variable_name=standard_name, + variable_unit=unit )) point.data = data From 1af0c41185adfc719ef27da386209e28431028f5 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 18 Dec 2023 15:33:56 -0800 Subject: [PATCH 52/70] Add vars to headers in CDMS subsetter --- .../webservice/algorithms/doms/subsetter.py | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/analysis/webservice/algorithms/doms/subsetter.py b/analysis/webservice/algorithms/doms/subsetter.py index bf63fc88..32e64a4e 100644 --- a/analysis/webservice/algorithms/doms/subsetter.py +++ b/analysis/webservice/algorithms/doms/subsetter.py @@ -24,6 +24,8 @@ from webservice.algorithms.doms.insitu import query_insitu from webservice.webmodel import NexusProcessingException, NexusResults +from nexustiles.nexustiles import NexusTileService + from . import BaseDomsHandler ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' @@ -302,6 +304,20 @@ def toCsv(self): logging.info('Converting result to CSV') for dataset_name, results in dataset_results.items(): + try: + ds_metadata = NexusTileService.get_metadata_for_dataset(dataset_name) + except: + ds_metadata = {} + + ds_vars = ds_metadata.get('variables', []) + + variable_dict = {} + variable_dict_cf = {} + + for v in ds_vars: + variable_dict[v['name']] = v + variable_dict_cf[v['cf_standard_name']] = v + rows = [] headers = [ @@ -309,13 +325,25 @@ def toCsv(self): 'latitude', 'time' ] - data_variables = list(set([keys for result in results for keys in result['data'].keys()])) - data_variables.sort() + + data_variables = [] + data_variable_headers = [] + + for dv in sorted(list(set([keys for result in results for keys in result['data'].keys()]))): + data_variables.append(dv) + + if dv in variable_dict_cf: + data_variable_headers.append(f'{dv} ({variable_dict_cf[dv]["unit"]})') + elif dv in variable_dict: + data_variable_headers.append(f'{dv} ({variable_dict[dv]["unit"]})') + else: + data_variable_headers.append(dv) if 'id' in list(set([keys for result in results for keys in result.keys()])): headers.append('id') - headers.extend(data_variables) + headers.extend(data_variable_headers) + for i, result in enumerate(results): cols = [] From 8a069db22f72c8405434c1ab164c13057ac34479 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 21 Dec 2023 13:07:59 -0800 Subject: [PATCH 53/70] Add units to all matchup result formats --- analysis/webservice/algorithms/doms/BaseDomsHandler.py | 9 +++++++-- analysis/webservice/algorithms_spark/Matchup.py | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 84c91633..66c4079a 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -142,7 +142,11 @@ def is_empty(s): name = variable['cf_variable_name'] - return name if not is_empty(name) else 
variable['variable_name'] + header_name = name if not is_empty(name) else variable['variable_name'] + + unit = variable.get('variable_unit', None) + + return f'{header_name} ({unit})' if unit is not None else header_name @staticmethod def __packValues(csv_mem_file, results): @@ -541,7 +545,8 @@ def writeGroup(self): self.__enrichVariable(data_variable, min_data, max_data, has_depth=None, unit=units[variable]) data_variable[:] = np.ma.masked_invalid(variables[variable]) data_variable.long_name = name - data_variable.standard_name = cf_name + if cf_name: + data_variable.standard_name = cf_name # # Lists may include 'None" values, to calc min these must be filtered out diff --git a/analysis/webservice/algorithms_spark/Matchup.py b/analysis/webservice/algorithms_spark/Matchup.py index c15b1e5d..5cb1f32b 100644 --- a/analysis/webservice/algorithms_spark/Matchup.py +++ b/analysis/webservice/algorithms_spark/Matchup.py @@ -518,6 +518,9 @@ def from_nexus_point(nexus_point, tile=None): standard_name = variable.standard_name unit = None + if standard_name is None or standard_name == '': + standard_name = variable.standard_name + data.append(DataPoint( variable_name=variable.variable_name, variable_value=data_val, From 0c39b075b070fe01f1e44583d01b95a4e5cc3c52 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 21 Dec 2023 13:08:48 -0800 Subject: [PATCH 54/70] Formatting for units in subsetter headers When units are absent, don't write var_name (None) as the header --- analysis/webservice/algorithms/doms/subsetter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/webservice/algorithms/doms/subsetter.py b/analysis/webservice/algorithms/doms/subsetter.py index 32e64a4e..c8ae8d79 100644 --- a/analysis/webservice/algorithms/doms/subsetter.py +++ b/analysis/webservice/algorithms/doms/subsetter.py @@ -332,9 +332,9 @@ def toCsv(self): for dv in sorted(list(set([keys for result in results for keys in result['data'].keys()]))): data_variables.append(dv) - if dv in variable_dict_cf: + if dv in variable_dict_cf and variable_dict_cf[dv]["unit"] is not None: data_variable_headers.append(f'{dv} ({variable_dict_cf[dv]["unit"]})') - elif dv in variable_dict: + elif dv in variable_dict and variable_dict[dv]["unit"] is not None: data_variable_headers.append(f'{dv} ({variable_dict[dv]["unit"]})') else: data_variable_headers.append(dv) From 32ca3d709237d324decada84fe92a4a4044b6521 Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 5 Jan 2024 15:33:49 -0800 Subject: [PATCH 55/70] stac catalog --- .../algorithms/doms/ResultsStorage.py | 15 +- .../webservice/algorithms/doms/StacCatalog.py | 166 ++++++++++++++++++ .../webservice/algorithms/doms/__init__.py | 1 + .../webmodel/NexusExecutionResults.py | 6 + 4 files changed, 180 insertions(+), 8 deletions(-) create mode 100644 analysis/webservice/algorithms/doms/StacCatalog.py diff --git a/analysis/webservice/algorithms/doms/ResultsStorage.py b/analysis/webservice/algorithms/doms/ResultsStorage.py index 39db27b3..6b4cc1c2 100644 --- a/analysis/webservice/algorithms/doms/ResultsStorage.py +++ b/analysis/webservice/algorithms/doms/ResultsStorage.py @@ -286,7 +286,7 @@ def retrieveResults(self, execution_id, trim_data=False, page_num=1, page_size=1 execution_id = uuid.UUID(execution_id) params = self.retrieveParams(execution_id) - stats = self.__retrieveStats(execution_id) + stats = self.retrieveStats(execution_id) data = self.__retrieveData(execution_id, trim_data=trim_data, page_num=page_num, page_size=page_size) return params, stats, data @@ -357,19 
+357,18 @@ def __rowToDataEntry(self, row, trim_data=False): return entry - def __retrieveStats(self, id): - cql = "SELECT num_gridded_matched, num_insitu_matched, time_to_complete FROM doms_execution_stats where execution_id = %s limit 1" + def retrieveStats(self, id): + cql = "SELECT num_gridded_matched, num_insitu_matched, time_to_complete, num_unique_secondaries FROM doms_execution_stats where execution_id = %s limit 1" rows = self._session.execute(cql, (id,)) for row in rows: stats = { - "timeToComplete": row.time_to_complete, - "numSecondaryMatched": row.num_insitu_matched, - "numPrimaryMatched": row.num_gridded_matched, + 'timeToComplete': row.time_to_complete, + 'numSecondaryMatched': row.num_insitu_matched, + 'numPrimaryMatched': row.num_gridded_matched, + 'numUniqueSecondaries': row.num_unique_secondaries } return stats - raise NexusProcessingException(reason=f'No stats found for id {str(id)}', code=404) - def retrieveParams(self, id): cql = "SELECT * FROM doms_params where execution_id = %s limit 1" rows = self._session.execute(cql, (id,)) diff --git a/analysis/webservice/algorithms/doms/StacCatalog.py b/analysis/webservice/algorithms/doms/StacCatalog.py new file mode 100644 index 00000000..2c1aa125 --- /dev/null +++ b/analysis/webservice/algorithms/doms/StacCatalog.py @@ -0,0 +1,166 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the 'License'); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import re +import uuid +from typing import List + +from webservice.NexusHandler import nexus_handler +from webservice.algorithms.doms.ResultsStorage import ResultsRetrieval +from webservice.webmodel import NexusProcessingException +from webservice.webmodel import NexusResults + +from . 
import BaseDomsHandler + + +class StacResults(NexusResults): + def __init__(self, contents): + NexusResults.__init__(self) + self.contents = contents + + def toJson(self): + return json.dumps(self.contents, indent=4) + + +@nexus_handler +class StacCatalog(BaseDomsHandler.BaseDomsQueryCalcHandler): + name = 'STAC Catalog Handler' + path = '^/cdmscatalog/?.*$' + description = '' + params = {} + singleton = True + + def __init__(self, tile_service_factory, config=None): + BaseDomsHandler.BaseDomsQueryCalcHandler.__init__(self, tile_service_factory) + self.config = config + + def construct_catalog(self, execution_id: str): + return { + 'stac_version': '1.0.0', + 'type': 'Catalog', + 'id': str(execution_id), + 'description': 'STAC Catalog for CDMS output', + 'links': [ + { + 'rel': 'collection', + 'href': f'https://{self.host}/cdmscatalog/{execution_id}/{output_format}', + 'title': f'Collection of pages for {execution_id} {output_format} output' + } + for output_format in ['CSV', 'JSON', 'NETCDF'] + ] + } + + def construct_collection(self, execution_id: str, output_format: str, + num_primary_matched: int, page_size: int, start_time: str, + end_time: str, bbox: List[float]): + links = [ + { + 'rel': 'self', + 'href': f'https://{self.host}/cdmscatalog/{execution_id}/{output_format}', + 'title': 'The current page', + 'type': 'application/json' + }, + { + 'rel': 'root', + 'href': f'https://{self.host}/cdmscatalog/{execution_id}', + 'title': f'Root catalog for {execution_id}', + } + ] + + url = f'https://{self.host}/cdmsresults?id={execution_id}&output={output_format}' + for page_num in range(1, num_primary_matched, page_size): + links.append({ + 'rel': 'data', + 'href': f'{url}&pageNum={page_num}&pageSize={page_size}' + }) + + return { + 'stac_version': '1.0.0', + 'type': 'Collection', + 'license': 'not-provided', + 'id': f'{execution_id}.{output_format}', + 'description': 'Collection of results for CDMS execution and result format', + 'extent': { + 'spatial': { + 'bbox': bbox + }, + 'temporal': { + 'interval': [start_time, end_time] + } + }, + 'links': links, + } + + def calc(self, request, **args): + page_size = request.get_int_arg('pageSize', default=1000) + url_path_regex = '^\/cdmscatalog\/?(?P[a-zA-Z0-9-]*)\/?(?P[a-zA-Z0-9]*)' + match = re.search(url_path_regex, request.requestHandler.request.path) + + execution_id = match.group('id') + output_format = match.group('format') + + self.host = request.requestHandler.request.host + + if not execution_id: + raise NexusProcessingException( + reason=f'Execution ID path param must be provided.', + code=400 + ) + + if execution_id: + try: + execution_id = uuid.UUID(execution_id) + except ValueError: + raise NexusProcessingException( + reason=f'"{execution_id}" is not a valid uuid', + code=400 + ) + + if output_format and output_format.upper() not in ['CSV', 'JSON', 'NETCDF']: + raise NexusProcessingException( + reason=f'"{output_format}" is not a valid format. 
Should be CSV, JSON, or NETCDF.', + code=400 + ) + + if execution_id and not output_format: + # Route to STAC catalog for execution + stac_output = self.construct_catalog(execution_id) + elif execution_id and output_format: + # Route to STAC collection for execution+format + + with ResultsRetrieval(self.config) as retrieval: + try: + execution_stats = retrieval.retrieveStats(execution_id) + execution_params = retrieval.retrieveParams(execution_id) + except NexusProcessingException: + execution_stats = {} + + num_primary_matched = execution_stats.get('numPrimaryMatched', 0) + start_time = execution_params['startTime'].isoformat() + end_time = execution_params['endTime'].isoformat() + bbox = list(map(float, execution_params['bbox'].split(','))) + + stac_output = self.construct_collection( + execution_id, output_format, num_primary_matched, page_size, + start_time, end_time, bbox + ) + else: + raise NexusProcessingException( + reason=f'Invalid path parameters were provided', + code=400 + ) + + return StacResults(stac_output) diff --git a/analysis/webservice/algorithms/doms/__init__.py b/analysis/webservice/algorithms/doms/__init__.py index bc568f83..7e5715f4 100644 --- a/analysis/webservice/algorithms/doms/__init__.py +++ b/analysis/webservice/algorithms/doms/__init__.py @@ -20,6 +20,7 @@ from . import DatasetListQuery from . import DomsInitialization from . import MatchupQuery +from . import StacCatalog from . import MetadataQuery from . import ResultsPlotQuery from . import ResultsRetrieval diff --git a/analysis/webservice/webmodel/NexusExecutionResults.py b/analysis/webservice/webmodel/NexusExecutionResults.py index d5c12046..be9d332a 100644 --- a/analysis/webservice/webmodel/NexusExecutionResults.py +++ b/analysis/webservice/webmodel/NexusExecutionResults.py @@ -60,6 +60,12 @@ def construct_done(status, created, completed, execution_id, params, host): ('JSON', 'application/json'), ('NETCDF', 'binary/octet-stream') ] + job_body['links'].append({ + 'href': f'{host}/cdmscatalog/{execution_id}', + 'title': 'STAC Catalog for execution results', + 'type': 'application/json', + 'rel': 'stac' + }) data_links = [{ 'href': f'{host}/cdmsresults?id={execution_id}&output={output_format}', 'title': 'Download results', From 3563ae9820f3c8699e15f348f61b94d7f5aa65b5 Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 5 Jan 2024 16:03:33 -0800 Subject: [PATCH 56/70] Updated openapi spec --- analysis/webservice/apidocs/openapi.yml | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index ea9b16ba..0420bf9d 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -721,6 +721,58 @@ paths: type: string format: uuid example: c864a51b-3d87-4872-9070-632820b1cae2 + /cdmscatalog/{executionId}: + get: + summary: | + Get STAC Catalog for execution + operationId: cdmscatalog + tags: + - Analytics + description: "Get STAC catalog by execution id" + parameters: + - in: path + name: executionId + description: | + The job execution ID + required: true + schema: + type: string + format: uuid + example: c864a51b-3d87-4872-9070-632820b1cae2 + /cdmscatalog/{executionId}/{format}: + get: + summary: | + Get STAC Catalog format catalog for execution + operationId: cdmscatalogcollection + tags: + - Analytics + description: "Get STAC catalog by execution id" + parameters: + - in: path + name: executionId + description: | + The job execution ID + required: true + schema: + type: 
string + format: uuid + example: c864a51b-3d87-4872-9070-632820b1cae2 + - in: path + name: format + description: | + CDMS results format + required: true + schema: + type: string + enum: [JSON,CSV,NETCDF] + example: JSON + - in: query + name: pageSize + description: | + How many primary matches on each page of CDMS results + required: false + schema: + type: integer externalDocs: description: Documentation url: https://incubator-sdap-nexus.readthedocs.io/en/latest/index.html From 0691d87932ec0a7c54ef77f1839100681848df37 Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 5 Jan 2024 16:15:40 -0800 Subject: [PATCH 57/70] move stac endpoints to matchup tag in openapi spec --- analysis/webservice/apidocs/openapi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analysis/webservice/apidocs/openapi.yml b/analysis/webservice/apidocs/openapi.yml index 0420bf9d..3bb5103a 100644 --- a/analysis/webservice/apidocs/openapi.yml +++ b/analysis/webservice/apidocs/openapi.yml @@ -727,7 +727,7 @@ paths: Get STAC Catalog for execution operationId: cdmscatalog tags: - - Analytics + - Matchup description: "Get STAC catalog by execution id" parameters: - in: path @@ -745,7 +745,7 @@ paths: Get STAC Catalog format catalog for execution operationId: cdmscatalogcollection tags: - - Analytics + - Matchup description: "Get STAC catalog by execution id" parameters: - in: path From 61e6223c21fbf00951081785a9e1ad815d6c9519 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 8 Jan 2024 14:36:12 -0800 Subject: [PATCH 58/70] Meta field in matchup result - all formats --- .../algorithms/doms/BaseDomsHandler.py | 62 ++++++++++++++++--- .../algorithms/doms/ResultsRetrieval.py | 24 ++++++- .../nexustiles/backends/nexusproto/backend.py | 10 ++- 3 files changed, 87 insertions(+), 9 deletions(-) diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 66c4079a..4c019bd4 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -85,14 +85,15 @@ def default(self, obj): class DomsQueryResults(NexusResults): def __init__(self, results=None, args=None, bounds=None, count=None, details=None, computeOptions=None, - executionId=None, status_code=200, page_num=None, page_size=None): - NexusResults.__init__(self, results=results, meta=None, stats=None, computeOptions=computeOptions, + executionId=None, status_code=200, page_num=None, page_size=None, meta=None): + NexusResults.__init__(self, results=results, meta=meta, stats=None, computeOptions=computeOptions, status_code=status_code) self.__args = args self.__bounds = bounds self.__count = count self.__details = details self.__executionId = str(executionId) + self.__meta = meta if meta is not None else {} if self.__details is None: self.__details = {} @@ -105,13 +106,13 @@ def toJson(self): bounds = self.__bounds.toMap() if self.__bounds is not None else {} return json.dumps( {"executionId": self.__executionId, "data": self.results(), "params": self.__args, "bounds": bounds, - "count": self.__count, "details": self.__details}, indent=4, cls=DomsEncoder) + "count": self.__count, "details": self.__details, "metadata": self.__meta}, indent=4, cls=DomsEncoder) def toCSV(self): - return DomsCSVFormatter.create(self.__executionId, self.results(), self.__args, self.__details) + return DomsCSVFormatter.create(self.__executionId, self.results(), self.__args, self.__details, self.__meta) def toNetCDF(self): - return 
DomsNetCDFFormatter.create(self.__executionId, self.results(), self.__args, self.__details) + return DomsNetCDFFormatter.create(self.__executionId, self.results(), self.__args, self.__details, self.__meta) def filename(self): return f'CDMS_{self.__executionId}' @@ -119,13 +120,15 @@ def filename(self): class DomsCSVFormatter: @staticmethod - def create(executionId, results, params, details): + def create(executionId, results, params, details, metadata): csv_mem_file = io.StringIO() try: DomsCSVFormatter.__addConstants(csv_mem_file) DomsCSVFormatter.__addDynamicAttrs(csv_mem_file, executionId, results, params, details) csv.writer(csv_mem_file).writerow([]) + DomsCSVFormatter.__addMetadata(csv_mem_file, metadata) + csv.writer(csv_mem_file).writerow([]) DomsCSVFormatter.__packValues(csv_mem_file, results) @@ -299,10 +302,31 @@ def __addDynamicAttrs(csvfile, executionId, results, params, details): writer.writerows(global_attrs) + @staticmethod + def __addMetadata(csvfile, meta): + def meta_dict_to_list(meta_dict: dict, prefix='metadata') -> list: + attrs = [] + + for key in meta_dict: + new_key = key if prefix == '' else f'{prefix}.{key}' + value = meta_dict[key] + + if isinstance(value, dict): + attrs.extend(meta_dict_to_list(value, new_key)) + else: + attrs.append(dict(MetadataAttribute=new_key, Value=value)) + + return attrs + + metadata_attrs = meta_dict_to_list(meta) + + writer = csv.DictWriter(csvfile, sorted(next(iter(metadata_attrs)).keys())) + writer.writerows(metadata_attrs) + class DomsNetCDFFormatter: @staticmethod - def create(executionId, results, params, details): + def create(executionId, results, params, details, metadata): t = tempfile.mkstemp(prefix="cdms_", suffix=".nc") tempFileName = t[1] @@ -346,6 +370,30 @@ def create(executionId, results, params, details): dataset.CDMS_page_num = details["pageNum"] dataset.CDMS_page_size = details["pageSize"] + ####TEST + + def meta_dict_to_list(meta_dict: dict, prefix='metadata') -> list: + attrs = [] + + for key in meta_dict: + new_key = key if prefix == '' else f'{prefix}.{key}' + value = meta_dict[key] + + if value is None: + value = 'NULL' + elif isinstance(value, list): + value = json.dumps(value) + + if isinstance(value, dict): + attrs.extend(meta_dict_to_list(value, new_key)) + else: + attrs.append((new_key, value)) + + return attrs + + for attr in meta_dict_to_list(metadata): + setattr(dataset, *attr) + insituDatasets = params["matchup"] insituLinks = set() for insitu in insituDatasets: diff --git a/analysis/webservice/algorithms/doms/ResultsRetrieval.py b/analysis/webservice/algorithms/doms/ResultsRetrieval.py index f03c1caa..0b26056a 100644 --- a/analysis/webservice/algorithms/doms/ResultsRetrieval.py +++ b/analysis/webservice/algorithms/doms/ResultsRetrieval.py @@ -19,6 +19,11 @@ from . 
import ResultsStorage from webservice.NexusHandler import nexus_handler from webservice.webmodel import NexusProcessingException +from nexustiles.nexustiles import NexusTileService + +import logging + +log = logging.getLogger(__name__) @nexus_handler @@ -48,5 +53,22 @@ def calc(self, computeOptions, **args): with ResultsStorage.ResultsRetrieval(self.config) as storage: params, stats, data = storage.retrieveResults(execution_id, trim_data=simple_results, page_num=page_num, page_size=page_size) + try: + ds_metadata = {} + ds_meta_primary_name = params['primary'] + + primary_metadata = NexusTileService.get_metadata_for_dataset(ds_meta_primary_name) + + ds_metadata['primary'] = {ds_meta_primary_name: primary_metadata} + + ds_metadata['secondary'] = {} + + for secondary_ds_name in params['matchup'].split(','): + ds_metadata['secondary'][secondary_ds_name] = NexusTileService.get_metadata_for_dataset(secondary_ds_name) + except: + log.warning('Could not build dataset metadata dict due to an error') + ds_metadata = {} + return BaseDomsHandler.DomsQueryResults(results=data, args=params, details=stats, bounds=None, count=len(data), - computeOptions=None, executionId=execution_id, page_num=page_num, page_size=page_size) + computeOptions=None, executionId=execution_id, page_num=page_num, + page_size=page_size, meta=dict(datasets=ds_metadata)) diff --git a/data-access/nexustiles/backends/nexusproto/backend.py b/data-access/nexustiles/backends/nexusproto/backend.py index 3b9390ff..d86a594a 100644 --- a/data-access/nexustiles/backends/nexusproto/backend.py +++ b/data-access/nexustiles/backends/nexusproto/backend.py @@ -568,9 +568,17 @@ def update_metadata(self, solr_doc): else: variables = [] + extra_meta = solr_doc.get('meta_s', None) + self._ds_info[dataset]['variables'] = variables - # print(self._ds_info) + if extra_meta is not None: + try: + extra_meta = json.loads(extra_meta) + except json.JSONDecodeError: + pass + + self._ds_info[dataset]['metadata'] = extra_meta def get_metadata(self, dataset=None): if dataset is None: From e02fc78f64b76e0974cd9bbc6f09b01cd9593447 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 11 Jan 2024 10:49:19 -0800 Subject: [PATCH 59/70] SDAP-507 - Changes to remove geos sub-dependency --- analysis/webservice/algorithms/doms/BaseDomsHandler.py | 7 ------- analysis/webservice/algorithms/doms/ResultsPlotQuery.py | 2 +- analysis/webservice/algorithms/doms/__init__.py | 2 +- .../nexus_tornado/app_builders/HandlerArgsBuilder.py | 7 ++++--- docker/nexus-webapp/Dockerfile | 4 ++-- 5 files changed, 8 insertions(+), 14 deletions(-) diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 84c91633..faa384f7 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -35,13 +35,6 @@ EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' -try: - from osgeo import gdal - from osgeo.gdalnumeric import * -except ImportError: - import gdal - from gdalnumeric import * - from netCDF4 import Dataset import netCDF4 import tempfile diff --git a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py b/analysis/webservice/algorithms/doms/ResultsPlotQuery.py index 950c7964..864cdc3b 100644 --- a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py +++ b/analysis/webservice/algorithms/doms/ResultsPlotQuery.py @@ -26,7 +26,7 @@ class PlotTypes: HISTOGRAM = "histogram" -@nexus_handler +# @nexus_handler class 
DomsResultsPlotHandler(BaseDomsHandler.BaseDomsQueryCalcHandler): name = "DOMS Results Plotting" path = "/domsplot" diff --git a/analysis/webservice/algorithms/doms/__init__.py b/analysis/webservice/algorithms/doms/__init__.py index bc568f83..8a94798e 100644 --- a/analysis/webservice/algorithms/doms/__init__.py +++ b/analysis/webservice/algorithms/doms/__init__.py @@ -21,7 +21,7 @@ from . import DomsInitialization from . import MatchupQuery from . import MetadataQuery -from . import ResultsPlotQuery +# from . import ResultsPlotQuery from . import ResultsRetrieval from . import ResultsStorage from . import StatsQuery diff --git a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py index 2a84ae7e..3b8b480f 100644 --- a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py +++ b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py @@ -37,9 +37,10 @@ def handler_needs_algorithm_config(class_wrapper): class_wrapper == webservice.algorithms_spark.Matchup.Matchup or class_wrapper == webservice.algorithms_spark.MatchupDoms.MatchupDoms or issubclass(class_wrapper, webservice.algorithms.doms.BaseDomsHandler.BaseDomsQueryCalcHandler) - or issubclass(class_wrapper, webservice.algorithms_spark.NexusCalcSparkTornadoHandler.NexusCalcSparkTornadoHandler) + or issubclass(class_wrapper, + webservice.algorithms_spark.NexusCalcSparkTornadoHandler.NexusCalcSparkTornadoHandler) or class_wrapper == webservice.algorithms.doms.ResultsRetrieval.DomsResultsRetrievalHandler - or class_wrapper == webservice.algorithms.doms.ResultsPlotQuery.DomsResultsPlotHandler + # or class_wrapper == webservice.algorithms.doms.ResultsPlotQuery.DomsResultsPlotHandler ) @staticmethod @@ -50,7 +51,7 @@ def get_args(self, clazz_wrapper): args = dict( clazz=clazz_wrapper, tile_service_factory=self.tile_service_factory, - thread_pool=self. request_thread_pool + thread_pool=self.request_thread_pool ) if issubclass(clazz_wrapper, webservice.algorithms_spark.NexusCalcSparkHandler.NexusCalcSparkHandler): diff --git a/docker/nexus-webapp/Dockerfile b/docker/nexus-webapp/Dockerfile index 515d6ab0..6f13f9f4 100644 --- a/docker/nexus-webapp/Dockerfile +++ b/docker/nexus-webapp/Dockerfile @@ -95,10 +95,10 @@ RUN python3 setup.py install clean WORKDIR /incubator-sdap-nexus/analysis RUN python3 setup.py install clean && mamba clean -afy +RUN pip install shapely==1.7.1 WORKDIR /incubator-sdap-nexus/tools/deletebyquery -RUN pip3 install cassandra-driver==3.20.1 -RUN pip3 install pyspark py4j +RUN pip3 install cassandra-driver==3.20.1 pyspark py4j RUN pip3 install -r requirements.txt RUN pip3 install cython RUN rm requirements.txt From 51231cad14ba5242bbee1814ba1141a366d33b61 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 11 Jan 2024 10:54:14 -0800 Subject: [PATCH 60/70] SDAP-507 - Changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 01d62724..793c6017 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-482: Updated Saildrone in situ endpoint in config file - SDAP-485: Improved behavior for retrying failed Cassandra inserts when saving matchup results. - SDAP-487: Improved result fetch speed for large matchup results by tweaking `doms.doms_data` schema to support querying by primary value id. 
+- SDAP-507: Changes to remove `geos` sub-dependency from core image build: + - Removed `gdal` and `basemap` as core dependencies + - Moved `shapely` installation in docker build from conda install to pip install + - Disabled `/domsplot` endpoint & commented out references to its source file as it depends on `basemap` and raises `ImportError`s at startup ### Deprecated ### Removed - SDAP-465: Removed `climatology` directory. From 5c755736517ac33c526e6f00ab3331fd02c48411 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 11 Jan 2024 10:49:19 -0800 Subject: [PATCH 61/70] SDAP-507 - Changes to remove geos sub-dependency --- analysis/webservice/algorithms/doms/BaseDomsHandler.py | 7 ------- analysis/webservice/algorithms/doms/ResultsPlotQuery.py | 2 +- analysis/webservice/algorithms/doms/__init__.py | 2 +- .../nexus_tornado/app_builders/HandlerArgsBuilder.py | 7 ++++--- docker/nexus-webapp/Dockerfile | 4 ++-- 5 files changed, 8 insertions(+), 14 deletions(-) diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 84c91633..faa384f7 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -35,13 +35,6 @@ EPOCH = timezone('UTC').localize(datetime(1970, 1, 1)) ISO_8601 = '%Y-%m-%dT%H:%M:%S%z' -try: - from osgeo import gdal - from osgeo.gdalnumeric import * -except ImportError: - import gdal - from gdalnumeric import * - from netCDF4 import Dataset import netCDF4 import tempfile diff --git a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py b/analysis/webservice/algorithms/doms/ResultsPlotQuery.py index 950c7964..864cdc3b 100644 --- a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py +++ b/analysis/webservice/algorithms/doms/ResultsPlotQuery.py @@ -26,7 +26,7 @@ class PlotTypes: HISTOGRAM = "histogram" -@nexus_handler +# @nexus_handler class DomsResultsPlotHandler(BaseDomsHandler.BaseDomsQueryCalcHandler): name = "DOMS Results Plotting" path = "/domsplot" diff --git a/analysis/webservice/algorithms/doms/__init__.py b/analysis/webservice/algorithms/doms/__init__.py index bc568f83..8a94798e 100644 --- a/analysis/webservice/algorithms/doms/__init__.py +++ b/analysis/webservice/algorithms/doms/__init__.py @@ -21,7 +21,7 @@ from . import DomsInitialization from . import MatchupQuery from . import MetadataQuery -from . import ResultsPlotQuery +# from . import ResultsPlotQuery from . import ResultsRetrieval from . import ResultsStorage from . 
import StatsQuery diff --git a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py index 2a84ae7e..3b8b480f 100644 --- a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py +++ b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py @@ -37,9 +37,10 @@ def handler_needs_algorithm_config(class_wrapper): class_wrapper == webservice.algorithms_spark.Matchup.Matchup or class_wrapper == webservice.algorithms_spark.MatchupDoms.MatchupDoms or issubclass(class_wrapper, webservice.algorithms.doms.BaseDomsHandler.BaseDomsQueryCalcHandler) - or issubclass(class_wrapper, webservice.algorithms_spark.NexusCalcSparkTornadoHandler.NexusCalcSparkTornadoHandler) + or issubclass(class_wrapper, + webservice.algorithms_spark.NexusCalcSparkTornadoHandler.NexusCalcSparkTornadoHandler) or class_wrapper == webservice.algorithms.doms.ResultsRetrieval.DomsResultsRetrievalHandler - or class_wrapper == webservice.algorithms.doms.ResultsPlotQuery.DomsResultsPlotHandler + # or class_wrapper == webservice.algorithms.doms.ResultsPlotQuery.DomsResultsPlotHandler ) @staticmethod @@ -50,7 +51,7 @@ def get_args(self, clazz_wrapper): args = dict( clazz=clazz_wrapper, tile_service_factory=self.tile_service_factory, - thread_pool=self. request_thread_pool + thread_pool=self.request_thread_pool ) if issubclass(clazz_wrapper, webservice.algorithms_spark.NexusCalcSparkHandler.NexusCalcSparkHandler): diff --git a/docker/nexus-webapp/Dockerfile b/docker/nexus-webapp/Dockerfile index 6aaadda8..c38c5f29 100644 --- a/docker/nexus-webapp/Dockerfile +++ b/docker/nexus-webapp/Dockerfile @@ -95,11 +95,11 @@ RUN python3 setup.py install clean WORKDIR /incubator-sdap-nexus/analysis RUN python3 setup.py install clean && mamba clean -afy +RUN pip install shapely==1.7.1 WORKDIR /incubator-sdap-nexus/tools/deletebyquery ARG CASS_DRIVER_BUILD_CONCURRENCY=8 -RUN pip3 install cassandra-driver==3.20.1 -RUN pip3 install pyspark py4j +RUN pip3 install cassandra-driver==3.20.1 pyspark py4j RUN pip3 install -r requirements.txt RUN pip3 install cython From 7f717c0fcf1f31701cff258a19145327592562ff Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 11 Jan 2024 10:54:14 -0800 Subject: [PATCH 62/70] SDAP-507 - Changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5396fdde..55c5bc6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Support for deploying on k8s version 1.25: - Upgraded Cassandra Helm chart dependency version - Bumped default Cassandra protocol version 3 -> 4 in webapp and tools +- SDAP-507: Changes to remove `geos` sub-dependency from core image build: + - Removed `gdal` and `basemap` as core dependencies + - Moved `shapely` installation in docker build from conda install to pip install + - Disabled `/domsplot` endpoint & commented out references to its source file as it depends on `basemap` and raises `ImportError`s at startup ### Deprecated ### Removed - SDAP-465: Removed `climatology` directory. 
From 9779f409bd8b5aff9d9d17b244ab6cfbddedfb28 Mon Sep 17 00:00:00 2001 From: rileykk Date: Fri, 19 Jan 2024 09:15:01 -0800 Subject: [PATCH 63/70] delete instead of comment out --- analysis/webservice/algorithms/doms/__init__.py | 1 - .../webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py | 1 - 2 files changed, 2 deletions(-) diff --git a/analysis/webservice/algorithms/doms/__init__.py b/analysis/webservice/algorithms/doms/__init__.py index 8a94798e..8bddad9e 100644 --- a/analysis/webservice/algorithms/doms/__init__.py +++ b/analysis/webservice/algorithms/doms/__init__.py @@ -21,7 +21,6 @@ from . import DomsInitialization from . import MatchupQuery from . import MetadataQuery -# from . import ResultsPlotQuery from . import ResultsRetrieval from . import ResultsStorage from . import StatsQuery diff --git a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py index 3b8b480f..f2d6f1b4 100644 --- a/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py +++ b/analysis/webservice/nexus_tornado/app_builders/HandlerArgsBuilder.py @@ -40,7 +40,6 @@ def handler_needs_algorithm_config(class_wrapper): or issubclass(class_wrapper, webservice.algorithms_spark.NexusCalcSparkTornadoHandler.NexusCalcSparkTornadoHandler) or class_wrapper == webservice.algorithms.doms.ResultsRetrieval.DomsResultsRetrievalHandler - # or class_wrapper == webservice.algorithms.doms.ResultsPlotQuery.DomsResultsPlotHandler ) @staticmethod From 937876031c63d7d1eac279eb7ba34c1a9a62355c Mon Sep 17 00:00:00 2001 From: skorper Date: Fri, 19 Jan 2024 09:45:40 -0800 Subject: [PATCH 64/70] Revert "Update helm cassandra dependency (#289)" This reverts commit 1e8cc4e9d31d295e172c0db4bba61a5776642bea. --- CHANGELOG.md | 1 - analysis/webservice/algorithms/doms/ResultsStorage.py | 3 --- analysis/webservice/algorithms/doms/domsconfig.ini.default | 2 +- data-access/nexustiles/config/datastores.ini.default | 2 +- helm/requirements.yaml | 2 +- helm/values.yaml | 3 ++- tools/deletebyquery/deletebyquery.py | 2 +- tools/doms-data-tools/update_doms_data_pk.py | 2 +- tools/domspurge/README.md | 2 +- tools/domspurge/purge.py | 2 +- 10 files changed, 9 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e686e60..ed72f245 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Status code for results endpoint if execution id is not found fixed to be `404` instead of `500`. - Ensured links in the `/job` endpoint are https - SDAP-488: Workaround to build issue on Apple Silicon (M1/M2). Image build installs nexusproto through PyPI instead of building from source. A build arg `BUILD_NEXUSPROTO` was defined to allow building from source if desired -- SDAP-496: Fix `solr-cloud-init` image failing to run. 
### Security ## [1.1.0] - 2023-04-26 diff --git a/analysis/webservice/algorithms/doms/ResultsStorage.py b/analysis/webservice/algorithms/doms/ResultsStorage.py index 1dea1610..48b2122d 100644 --- a/analysis/webservice/algorithms/doms/ResultsStorage.py +++ b/analysis/webservice/algorithms/doms/ResultsStorage.py @@ -65,9 +65,6 @@ def __enter__(self): dc_policy = DCAwareRoundRobinPolicy(cassDatacenter) token_policy = TokenAwarePolicy(dc_policy) - logger.info(f'Connecting to Cassandra cluster @ {[host for host in cassHost.split(",")]}; datacenter: ' - f'{cassDatacenter}; protocol version: {cassVersion}') - self._cluster = Cluster([host for host in cassHost.split(',')], load_balancing_policy=token_policy, protocol_version=cassVersion, auth_provider=auth_provider) diff --git a/analysis/webservice/algorithms/doms/domsconfig.ini.default b/analysis/webservice/algorithms/doms/domsconfig.ini.default index f4e44960..55f9b16c 100644 --- a/analysis/webservice/algorithms/doms/domsconfig.ini.default +++ b/analysis/webservice/algorithms/doms/domsconfig.ini.default @@ -18,7 +18,7 @@ host=localhost port=9042 keyspace=doms local_datacenter=datacenter1 -protocol_version=4 +protocol_version=3 dc_policy=DCAwareRoundRobinPolicy username= password= diff --git a/data-access/nexustiles/config/datastores.ini.default b/data-access/nexustiles/config/datastores.ini.default index 51455a38..d8db1902 100644 --- a/data-access/nexustiles/config/datastores.ini.default +++ b/data-access/nexustiles/config/datastores.ini.default @@ -18,7 +18,7 @@ host=localhost port=9042 keyspace=nexustiles local_datacenter=datacenter1 -protocol_version=4 +protocol_version=3 dc_policy=DCAwareRoundRobinPolicy username= password= diff --git a/helm/requirements.yaml b/helm/requirements.yaml index 1de8cf0f..a9996586 100644 --- a/helm/requirements.yaml +++ b/helm/requirements.yaml @@ -12,7 +12,7 @@ dependencies: repository: https://raw.githubusercontent.com/bitnami/charts/archive-full-index/bitnami condition: solr.enabled - name: cassandra - version: 9.1.7 + version: 5.5.3 repository: https://raw.githubusercontent.com/bitnami/charts/archive-full-index/bitnami condition: cassandra.enabled diff --git a/helm/values.yaml b/helm/values.yaml index fe2481ef..4105362e 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -195,7 +195,8 @@ cassandra: dbUser: user: cassandra password: cassandra - replicaCount: 1 + cluster: + replicaCount: 1 persistence: storageClass: hostpath size: 8Gi diff --git a/tools/deletebyquery/deletebyquery.py b/tools/deletebyquery/deletebyquery.py index 8b98111a..4fb7bd66 100644 --- a/tools/deletebyquery/deletebyquery.py +++ b/tools/deletebyquery/deletebyquery.py @@ -262,7 +262,7 @@ def parse_args(): help='The version of the Cassandra protocol the driver should use.', required=False, choices=['1', '2', '3', '4', '5'], - default='4') + default='3') parser.add_argument('--solr-rows', help='Number of rows to fetch with each Solr query to build the list of tiles to delete', diff --git a/tools/doms-data-tools/update_doms_data_pk.py b/tools/doms-data-tools/update_doms_data_pk.py index 749995da..ed8dbe5e 100644 --- a/tools/doms-data-tools/update_doms_data_pk.py +++ b/tools/doms-data-tools/update_doms_data_pk.py @@ -114,7 +114,7 @@ def main(): request_timeout=60.0, ) }, - protocol_version=4, + protocol_version=3, auth_provider=auth_provider) as cluster: session = cluster.connect('doms') diff --git a/tools/domspurge/README.md b/tools/domspurge/README.md index e88b62f5..92f7cfb1 100644 --- a/tools/domspurge/README.md +++ 
b/tools/domspurge/README.md @@ -33,5 +33,5 @@ You can build an image for this script to run it in a Kubernetes CronJob. ```shell cd /incubator-sdap-nexus -docker build . -f tools/domspurge/Dockerfile -t sdap-local/DomsPurge: +docker build . -f Dockerfile -t sdap-local/DomsPurge: ``` diff --git a/tools/domspurge/purge.py b/tools/domspurge/purge.py index d4bb15a8..4fb2fc37 100644 --- a/tools/domspurge/purge.py +++ b/tools/domspurge/purge.py @@ -270,7 +270,7 @@ def parse_args(): required=False, dest='pv', choices=['1', '2', '3', '4', '5'], - default='4') + default='3') time_before = purge_options.add_mutually_exclusive_group(required=True) From 530314652bd1e74907962a9b03ceddfdc302c3ff Mon Sep 17 00:00:00 2001 From: rileykk Date: Fri, 19 Jan 2024 10:20:44 -0800 Subject: [PATCH 65/70] deleted disabled endpoint files --- .../algorithms/doms/ResultsPlotQuery.py | 56 ------ .../webservice/algorithms/doms/mapplot.py | 174 ------------------ 2 files changed, 230 deletions(-) delete mode 100644 analysis/webservice/algorithms/doms/ResultsPlotQuery.py delete mode 100644 analysis/webservice/algorithms/doms/mapplot.py diff --git a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py b/analysis/webservice/algorithms/doms/ResultsPlotQuery.py deleted file mode 100644 index 864cdc3b..00000000 --- a/analysis/webservice/algorithms/doms/ResultsPlotQuery.py +++ /dev/null @@ -1,56 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import BaseDomsHandler -from . import histogramplot -from . import mapplot -from . import scatterplot -from webservice.NexusHandler import nexus_handler - - -class PlotTypes: - SCATTER = "scatter" - MAP = "map" - HISTOGRAM = "histogram" - - -# @nexus_handler -class DomsResultsPlotHandler(BaseDomsHandler.BaseDomsQueryCalcHandler): - name = "DOMS Results Plotting" - path = "/domsplot" - description = "" - params = {} - singleton = True - - def __init__(self, tile_service_factory, config=None): - BaseDomsHandler.BaseDomsQueryCalcHandler.__init__(self, tile_service_factory) - self.config = config - - def calc(self, computeOptions, **args): - id = computeOptions.get_argument("id", None) - parameter = computeOptions.get_argument('parameter', 'sst') - - plotType = computeOptions.get_argument("type", PlotTypes.SCATTER) - - normAndCurve = computeOptions.get_boolean_arg("normandcurve", False) - - if plotType == PlotTypes.SCATTER: - return scatterplot.createScatterPlot(id, parameter, config=self.config) - elif plotType == PlotTypes.MAP: - return mapplot.createMapPlot(id, parameter, config=self.config) - elif plotType == PlotTypes.HISTOGRAM: - return histogramplot.createHistogramPlot(id, parameter, normAndCurve, config=self.config) - else: - raise Exception("Unsupported plot type '%s' specified." 
% plotType) diff --git a/analysis/webservice/algorithms/doms/mapplot.py b/analysis/webservice/algorithms/doms/mapplot.py deleted file mode 100644 index 8b93d3c6..00000000 --- a/analysis/webservice/algorithms/doms/mapplot.py +++ /dev/null @@ -1,174 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import io -from multiprocessing import Process, Manager - -import matplotlib -import matplotlib.pyplot as plt -import numpy as np -from mpl_toolkits.basemap import Basemap - -from . import BaseDomsHandler -from . import ResultsStorage - -if not matplotlib.get_backend(): - matplotlib.use('Agg') - -PARAMETER_TO_FIELD = { - "sst": "sea_water_temperature", - "sss": "sea_water_salinity" -} - -PARAMETER_TO_UNITS = { - "sst": "($^\circ$ C)", - "sss": "(g/L)" -} - - -def __square(minLon, maxLon, minLat, maxLat): - if maxLat - minLat > maxLon - minLon: - a = ((maxLat - minLat) - (maxLon - minLon)) / 2.0 - minLon -= a - maxLon += a - elif maxLon - minLon > maxLat - minLat: - a = ((maxLon - minLon) - (maxLat - minLat)) / 2.0 - minLat -= a - maxLat += a - - return minLon, maxLon, minLat, maxLat - - -def render(d, lats, lons, z, primary, secondary, parameter): - fig = plt.figure() - ax = fig.add_axes([0.1, 0.1, 0.8, 0.8]) - - ax.set_title(f'{primary} vs. 
{secondary}') - # ax.set_ylabel('Latitude') - # ax.set_xlabel('Longitude') - - minLatA = np.min(lats) - maxLatA = np.max(lats) - minLonA = np.min(lons) - maxLonA = np.max(lons) - - minLat = minLatA - (abs(maxLatA - minLatA) * 0.1) - maxLat = maxLatA + (abs(maxLatA - minLatA) * 0.1) - - minLon = minLonA - (abs(maxLonA - minLonA) * 0.1) - maxLon = maxLonA + (abs(maxLonA - minLonA) * 0.1) - - minLon, maxLon, minLat, maxLat = __square(minLon, maxLon, minLat, maxLat) - - # m = Basemap(projection='mill', llcrnrlon=-180,llcrnrlat=-80,urcrnrlon=180,urcrnrlat=80,resolution='l') - m = Basemap(projection='mill', llcrnrlon=minLon, llcrnrlat=minLat, urcrnrlon=maxLon, urcrnrlat=maxLat, - resolution='l') - - m.drawparallels(np.arange(minLat, maxLat, (maxLat - minLat) / 5.0), labels=[1, 0, 0, 0], fontsize=10) - m.drawmeridians(np.arange(minLon, maxLon, (maxLon - minLon) / 5.0), labels=[0, 0, 0, 1], fontsize=10) - - m.drawcoastlines() - m.drawmapboundary(fill_color='#99ffff') - m.fillcontinents(color='#cc9966', lake_color='#99ffff') - - # lats, lons = np.meshgrid(lats, lons) - - masked_array = np.ma.array(z, mask=np.isnan(z)) - z = masked_array - - values = np.zeros(len(z)) - for i in range(0, len(z)): - values[i] = ((z[i] - np.min(z)) / (np.max(z) - np.min(z)) * 20.0) + 10 - - x, y = m(lons, lats) - - im1 = m.scatter(x, y, values) - - im1.set_array(z) - cb = m.colorbar(im1) - - units = PARAMETER_TO_UNITS[parameter] if parameter in PARAMETER_TO_UNITS else PARAMETER_TO_UNITS["sst"] - cb.set_label("Difference %s" % units) - - buf = io.BytesIO() - plt.savefig(buf, format='png') - plot = buf.getvalue() - if d is not None: - d['plot'] = plot - return plot - - -class DomsMapPlotQueryResults(BaseDomsHandler.DomsQueryResults): - def __init__(self, lats, lons, z, parameter, primary, secondary, args=None, bounds=None, count=None, details=None, - computeOptions=None, executionId=None, plot=None): - BaseDomsHandler.DomsQueryResults.__init__(self, results={"lats": lats, "lons": lons, "values": z}, args=args, - details=details, bounds=bounds, count=count, - computeOptions=computeOptions, executionId=executionId) - self.__lats = lats - self.__lons = lons - self.__z = np.array(z) - self.__parameter = parameter - self.__primary = primary - self.__secondary = secondary - self.__plot = plot - - def toImage(self): - return self.__plot - - -def renderAsync(x, y, z, primary, secondary, parameter): - manager = Manager() - d = manager.dict() - p = Process(target=render, args=(d, x, y, z, primary, secondary, parameter)) - p.start() - p.join() - return d['plot'] - - -def createMapPlot(id, parameter, config=None): - with ResultsStorage.ResultsRetrieval(config) as storage: - params, stats, data = storage.retrieveResults(id) - - primary = params["primary"] - secondary = params["matchup"][0] - - lats = [] - lons = [] - z = [] - - field = PARAMETER_TO_FIELD[parameter] if parameter in PARAMETER_TO_FIELD else PARAMETER_TO_FIELD["sst"] - - for entry in data: - for match in entry["matches"]: - if match["source"] == secondary: - - if field in entry and field in match: - a = entry[field] - b = match[field] - z.append((a - b)) - z.append((a - b)) - else: - z.append(1.0) - z.append(1.0) - lats.append(entry["y"]) - lons.append(entry["x"]) - lats.append(match["y"]) - lons.append(match["x"]) - - plot = renderAsync(lats, lons, z, primary, secondary, parameter) - r = DomsMapPlotQueryResults(lats=lats, lons=lons, z=z, parameter=parameter, primary=primary, secondary=secondary, - args=params, - details=stats, bounds=None, count=None, 
computeOptions=None, executionId=id, plot=plot) - return r From ee5e5c8da244af7b0e0a2864dd894459b0706350 Mon Sep 17 00:00:00 2001 From: skorper Date: Thu, 25 Jan 2024 12:14:25 -0800 Subject: [PATCH 66/70] fix bug where still-running jobs failed /job endpoint due to missing metadata --- analysis/webservice/algorithms/doms/ExecutionStatus.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/analysis/webservice/algorithms/doms/ExecutionStatus.py b/analysis/webservice/algorithms/doms/ExecutionStatus.py index 63cf423b..9817b070 100644 --- a/analysis/webservice/algorithms/doms/ExecutionStatus.py +++ b/analysis/webservice/algorithms/doms/ExecutionStatus.py @@ -63,6 +63,9 @@ def calc(self, request, **args): except NexusProcessingException: execution_stats = {} + if execution_stats is None: + execution_stats = {} + job_status = NexusExecutionResults.ExecutionStatus(execution_details['status']) host = f'https://{request.requestHandler.request.host}' From 40a80e2266a40fa710e16f8cc43ebc5ea6b17646 Mon Sep 17 00:00:00 2001 From: rileykk Date: Mon, 29 Jan 2024 12:57:46 -0800 Subject: [PATCH 67/70] Don't write an empty row between meta blocks in CSV writer --- analysis/webservice/algorithms/doms/BaseDomsHandler.py | 1 - 1 file changed, 1 deletion(-) diff --git a/analysis/webservice/algorithms/doms/BaseDomsHandler.py b/analysis/webservice/algorithms/doms/BaseDomsHandler.py index 11da3eca..bc7db09e 100644 --- a/analysis/webservice/algorithms/doms/BaseDomsHandler.py +++ b/analysis/webservice/algorithms/doms/BaseDomsHandler.py @@ -119,7 +119,6 @@ def create(executionId, results, params, details, metadata): try: DomsCSVFormatter.__addConstants(csv_mem_file) DomsCSVFormatter.__addDynamicAttrs(csv_mem_file, executionId, results, params, details) - csv.writer(csv_mem_file).writerow([]) DomsCSVFormatter.__addMetadata(csv_mem_file, metadata) csv.writer(csv_mem_file).writerow([]) From 34b7b95c273ddad1119a1e5ca5f9c3899551ca08 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 1 Feb 2024 15:30:09 -0800 Subject: [PATCH 68/70] Moved changelog entries --- CHANGELOG.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ba371e0..94e2fa60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,11 +41,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-482: Updated Saildrone in situ endpoint in config file - SDAP-485: Improved behavior for retrying failed Cassandra inserts when saving matchup results. - SDAP-487: Improved result fetch speed for large matchup results by tweaking `doms.doms_data` schema to support querying by primary value id. -- SDAP-493: - - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint - - Updated /job endpoint with details about number of primary and secondary tiles. -- SDAP-500: Improvements to SDAP Asynchronous Jobs -- SDAP-499: Added page number to default filename for matchup output - Support for deploying on k8s version 1.25: - Upgraded Cassandra Helm chart dependency version - Bumped default Cassandra protocol version 3 -> 4 in webapp and tools @@ -57,8 +52,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Removed - SDAP-465: Removed `climatology` directory. 
- SDAP-501: Updated dependencies to remove `chardet` -- SDAP-493: - - Removed `resultSizeLimit` from /match_spark endpoint ### Fixed - SDAP-474: Fixed bug in CSV attributes where secondary dataset would be rendered as comma separated characters - SDAP-475: Bug fixes for `/timeSeriesSpark` and `/timeAvgMapSpark` From 6de825b3c83e1b15e261c6af70da010856961934 Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 1 Feb 2024 15:33:05 -0800 Subject: [PATCH 69/70] SDAP-472 changelog entries --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94e2fa60..84eced44 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,12 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - SDAP-506: - Added STAC Catalog endpoint for matchup outputs +- SDAP-472: + - Support for Zarr backend (gridded data only) + - Dataset management endpoints for Zarr datasets ### Changed - SDAP-493: - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint - Updated /job endpoint with details about number of primary and secondary tiles. - SDAP-500: Improvements to SDAP Asynchronous Jobs - SDAP-499: Added page number to default filename for matchup output +- SDAP-472: Overhauled `data-access` to support multiple backends for simultaneous support of multiple ARD formats ### Deprecated ### Removed - SDAP-493: From 2aaf07b0d8170a76467abb80f62fdb26044d642a Mon Sep 17 00:00:00 2001 From: rileykk Date: Thu, 1 Feb 2024 15:34:20 -0800 Subject: [PATCH 70/70] SDAP-498 changelog entries --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84eced44..60cb6081 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SDAP-472: - Support for Zarr backend (gridded data only) - Dataset management endpoints for Zarr datasets +- SDAP-498: Support for satellite units & other dataset-level metadata ### Changed - SDAP-493: - Updated /job endpoint to use `executionId` terminology for consistency with existing `/cdmsresults` endpoint