diff --git a/common/common/parse_datetime_utils/ParseDatetimeUtils.py b/common/common/parse_datetime_utils/ParseDatetimeUtils.py new file mode 100644 index 0000000..ab18d99 --- /dev/null +++ b/common/common/parse_datetime_utils/ParseDatetimeUtils.py @@ -0,0 +1,32 @@ +from enum import Enum +import re +from functools import reduce +from datetime import datetime, timedelta +from typing import Tuple + + +class ParseDatetimeUtils: + def __init__(self, expression: str, sub_str_start: int =None, sub_str_end: int =None, is_range: bool =False): + self.expression = expression + + self.sub_str_start = sub_str_start + self.sub_str_end = sub_str_end + self.is_sub_str = self.sub_str_start is not None and self.sub_str_end is not None + + self.is_range = is_range + + def parse(self, input_str: str) -> Tuple[datetime, datetime]: + input_str = input_str[self.sub_str_start:self.sub_str_end] \ + if self.is_sub_str else input_str + + start_dt, end_dt = (None, None) + if self.is_range: + input_str_len = len(input_str) + input_str_mid = int(input_str_len / 2) + start_dt = datetime.strptime(input_str[0:input_str_mid], self.expression) + end_dt = datetime.strptime(input_str[input_str_mid+(input_str_len%2):], self.expression) + else: + start_dt = datetime.strptime(input_str, self.expression) + end_dt = start_dt + + return (start_dt, end_dt) diff --git a/common/common/parse_datetime_utils/__init__.py b/common/common/parse_datetime_utils/__init__.py new file mode 100644 index 0000000..8c5c699 --- /dev/null +++ b/common/common/parse_datetime_utils/__init__.py @@ -0,0 +1 @@ +from .ParseDatetimeUtils import ParseDatetimeUtils \ No newline at end of file diff --git a/common/common/parse_datetime_utils_test/ParseDatetimeUtilsTests.py b/common/common/parse_datetime_utils_test/ParseDatetimeUtilsTests.py new file mode 100644 index 0000000..3289f5e --- /dev/null +++ b/common/common/parse_datetime_utils_test/ParseDatetimeUtilsTests.py @@ -0,0 +1,42 @@ +from parse_datetime_utils import ParseDatetimeUtils +import unittest +from datetime import datetime + + +class ParseDatetimeUtilsTests(unittest.TestCase): + def test_parse(self): + filename = 'V4NA03_PM25_NA_20040115-RH35.nc' + expression = '%Y%m%d' + sub_str_start, sub_str_end = (15, 23) + parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start, + sub_str_end=sub_str_end) + start_datetime, end_datetime = parser.parse(filename) + + self.assertEqual(start_datetime, end_datetime) + self.assertEqual(end_datetime, datetime(2004, 1, 15)) + + def test_parse_range_with_separator(self): + filename = 'V5GL01.HybridPM25.NorthAmerica.200706-200908.nc' + expression = '%Y%m' + sub_str_start, sub_str_end = (31, 44) + parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start, + sub_str_end=sub_str_end, is_range=True) + start_datetime, end_datetime = parser.parse(filename) + + self.assertEqual(start_datetime, datetime(2007, 6, 1)) + self.assertEqual(end_datetime, datetime(2009, 8, 1)) + + def test_parse_range_without_separator(self): + filename = 'V5GL01.HybridPM25.NorthAmerica.200706200908.nc' + expression = '%Y%m' + sub_str_start, sub_str_end = (31, 43) + parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start, + sub_str_end=sub_str_end, is_range=True) + start_datetime, end_datetime = parser.parse(filename) + + self.assertEqual(start_datetime, datetime(2007, 6, 1)) + self.assertEqual(end_datetime, datetime(2009, 8, 1)) + + +if __name__ == '__main__': + unittest.main() diff --git a/common/common/parse_datetime_utils_test/__init__.py b/common/common/parse_datetime_utils_test/__init__.py new file mode 100644 index 0000000..5797a7d --- /dev/null +++ b/common/common/parse_datetime_utils_test/__init__.py @@ -0,0 +1 @@ +from .ParseDatetimeUtilsTests import ParseDatetimeUtilsTests \ No newline at end of file diff --git a/granule_ingester/granule_ingester/processors/TileSummarizingProcessor.py b/granule_ingester/granule_ingester/processors/TileSummarizingProcessor.py index 041cac5..3cecde4 100644 --- a/granule_ingester/granule_ingester/processors/TileSummarizingProcessor.py +++ b/granule_ingester/granule_ingester/processors/TileSummarizingProcessor.py @@ -12,8 +12,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import json import logging +import re +import datetime import numpy from nexusproto import DataTile_pb2 as nexusproto @@ -27,17 +30,45 @@ class NoTimeException(Exception): pass -def find_time_min_max(tile_data): +def find_time_min_max(tile_data, time_from_granule=None): if tile_data.time: if isinstance(tile_data.time, nexusproto.ShapedArray): time_data = from_shaped_array(tile_data.time) return int(numpy.nanmin(time_data).item()), int(numpy.nanmax(time_data).item()) - elif isinstance(tile_data.time, int): + elif isinstance(tile_data.time, int) and \ + tile_data.time > datetime.datetime(1970, 1, 1).timestamp(): # time should be at least greater than Epoch, right? return tile_data.time, tile_data.time + if time_from_granule: + return time_from_granule, time_from_granule + raise NoTimeException +def get_time_from_granule(granule: str) -> int: + """ + Get time from granule name. It makes the assumption that a datetime is + specificed in the granule name, and it has the following format "YYYYddd", + where YYYY is 4 digit year and ddd is day of year (i.e. 1 to 365). + + Note: This is a very narrow implmentation for a specific need. If you found + yourself needing to modify this function to accommodate more use cases, then + perhaps it is time to refactor this function to a more dynamic module. + """ + + # rs search for a sub str which starts with 19 or 20, and has 7 digits + search_res = re.search('((?:19|20)[0-9]{2})([0-9]{3})', os.path.basename(granule)) + if not search_res: + return None + + year, days = search_res.groups() + year = int(year) + days = int(days) + + # since datetime is set to 1/1 (+1 day), timedelta needs to -1 to cancel it out + return int((datetime.datetime(year, 1, 1) + datetime.timedelta(days-1)).timestamp()) + + class TileSummarizingProcessor(TileProcessor): def __init__(self, dataset_name: str, *args, **kwargs): @@ -92,7 +123,9 @@ def process(self, tile, dataset, *args, **kwargs): logger.debug(f'find min max time') try: - min_time, max_time = find_time_min_max(tile_data) + min_time, max_time = find_time_min_max( + tile_data, get_time_from_granule(tile.summary.granule) + ) logger.debug(f'set min max time') tile_summary.stats.min_time = min_time tile_summary.stats.max_time = max_time diff --git a/granule_ingester/tests/granules/A2017005.L3m_CHL_chlor_a_4km.nc b/granule_ingester/tests/granules/A2017005.L3m_CHL_chlor_a_4km.nc new file mode 100644 index 0000000..22a0f03 Binary files /dev/null and b/granule_ingester/tests/granules/A2017005.L3m_CHL_chlor_a_4km.nc differ diff --git a/granule_ingester/tests/reading_processors/test_TileSummarizingProcessor.py b/granule_ingester/tests/reading_processors/test_TileSummarizingProcessor.py index 3f78114..47dde86 100644 --- a/granule_ingester/tests/reading_processors/test_TileSummarizingProcessor.py +++ b/granule_ingester/tests/reading_processors/test_TileSummarizingProcessor.py @@ -1,3 +1,4 @@ +from datetime import datetime import json import unittest from os import path @@ -108,4 +109,39 @@ def test_hls_multiple_var_01(self): new_tile = tile_summary_processor.process(tile=output_tile, dataset=ds) self.assertEqual('[null, null, null, null, null, null, null, null, null, null, null]', new_tile.summary.standard_name, f'wrong new_tile.summary.standard_name') self.assertEqual([None for _ in range(11)], json.loads(new_tile.summary.standard_name), f'unable to convert new_tile.summary.standard_name from JSON') - self.assertTrue(abs(new_tile.summary.stats.mean - 0.26523) < 0.001, f'mean value is not close expected: 0.26523. actual: {new_tile.summary.stats.mean}') \ No newline at end of file + self.assertTrue(abs(new_tile.summary.stats.mean - 0.26523) < 0.001, f'mean value is not close expected: 0.26523. actual: {new_tile.summary.stats.mean}') + + def test_get_time_from_granule(self): + """ + Test that TileSummarizingProcessor gets time from granule filename when time is not + present in the tile. + """ + reading_processor = GridReadingProcessor( + variable='chlor_a', + latitude='lat', + longitude='lon', + tile='tile' + ) + relative_path = '../granules/A2017005.L3m_CHL_chlor_a_4km.nc' + granule_path = path.join(path.dirname(__file__), relative_path) + + tile_summary = nexusproto.TileSummary() + tile_summary.granule = granule_path + tile_summary.data_var_name = json.dumps('chlor_a') + + input_tile = nexusproto.NexusTile() + input_tile.summary.CopyFrom(tile_summary) + + dims = { + 'lat': slice(0, 30), + 'lon': slice(0, 30), + 'tile': slice(10, 11), + } + + + with xr.open_dataset(granule_path, decode_cf=True) as ds: + output_tile = reading_processor._generate_tile(ds, dims, input_tile) + tile_summary_processor = TileSummarizingProcessor('test') + new_tile = tile_summary_processor.process(tile=output_tile, dataset=ds) + self.assertEqual(new_tile.summary.stats.min_time, new_tile.summary.stats.max_time) + self.assertEqual(new_tile.summary.stats.min_time, int(datetime(2017, 1, 5).timestamp()))