Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions common/common/parse_datetime_utils/ParseDatetimeUtils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from enum import Enum
import re
from functools import reduce
from datetime import datetime, timedelta
from typing import Tuple


class ParseDatetimeUtils:
def __init__(self, expression: str, sub_str_start: int =None, sub_str_end: int =None, is_range: bool =False):
self.expression = expression

self.sub_str_start = sub_str_start
self.sub_str_end = sub_str_end
self.is_sub_str = self.sub_str_start is not None and self.sub_str_end is not None

self.is_range = is_range

def parse(self, input_str: str) -> Tuple[datetime, datetime]:
input_str = input_str[self.sub_str_start:self.sub_str_end] \
if self.is_sub_str else input_str

start_dt, end_dt = (None, None)
if self.is_range:
input_str_len = len(input_str)
input_str_mid = int(input_str_len / 2)
start_dt = datetime.strptime(input_str[0:input_str_mid], self.expression)
end_dt = datetime.strptime(input_str[input_str_mid+(input_str_len%2):], self.expression)
else:
start_dt = datetime.strptime(input_str, self.expression)
end_dt = start_dt

return (start_dt, end_dt)
1 change: 1 addition & 0 deletions common/common/parse_datetime_utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ParseDatetimeUtils import ParseDatetimeUtils
42 changes: 42 additions & 0 deletions common/common/parse_datetime_utils_test/ParseDatetimeUtilsTests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from parse_datetime_utils import ParseDatetimeUtils
import unittest
from datetime import datetime


class ParseDatetimeUtilsTests(unittest.TestCase):
def test_parse(self):
filename = 'V4NA03_PM25_NA_20040115-RH35.nc'
expression = '%Y%m%d'
sub_str_start, sub_str_end = (15, 23)
parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start,
sub_str_end=sub_str_end)
start_datetime, end_datetime = parser.parse(filename)

self.assertEqual(start_datetime, end_datetime)
self.assertEqual(end_datetime, datetime(2004, 1, 15))

def test_parse_range_with_separator(self):
filename = 'V5GL01.HybridPM25.NorthAmerica.200706-200908.nc'
expression = '%Y%m'
sub_str_start, sub_str_end = (31, 44)
parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start,
sub_str_end=sub_str_end, is_range=True)
start_datetime, end_datetime = parser.parse(filename)

self.assertEqual(start_datetime, datetime(2007, 6, 1))
self.assertEqual(end_datetime, datetime(2009, 8, 1))

def test_parse_range_without_separator(self):
filename = 'V5GL01.HybridPM25.NorthAmerica.200706200908.nc'
expression = '%Y%m'
sub_str_start, sub_str_end = (31, 43)
parser = ParseDatetimeUtils(expression, sub_str_start=sub_str_start,
sub_str_end=sub_str_end, is_range=True)
start_datetime, end_datetime = parser.parse(filename)

self.assertEqual(start_datetime, datetime(2007, 6, 1))
self.assertEqual(end_datetime, datetime(2009, 8, 1))


if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions common/common/parse_datetime_utils_test/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .ParseDatetimeUtilsTests import ParseDatetimeUtilsTests
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import json
import logging
import re
import datetime

import numpy
from nexusproto import DataTile_pb2 as nexusproto
Expand All @@ -27,17 +30,45 @@ class NoTimeException(Exception):
pass


def find_time_min_max(tile_data):
def find_time_min_max(tile_data, time_from_granule=None):
if tile_data.time:
if isinstance(tile_data.time, nexusproto.ShapedArray):
time_data = from_shaped_array(tile_data.time)
return int(numpy.nanmin(time_data).item()), int(numpy.nanmax(time_data).item())
elif isinstance(tile_data.time, int):
elif isinstance(tile_data.time, int) and \
tile_data.time > datetime.datetime(1970, 1, 1).timestamp(): # time should be at least greater than Epoch, right?
return tile_data.time, tile_data.time

if time_from_granule:
return time_from_granule, time_from_granule

raise NoTimeException


def get_time_from_granule(granule: str) -> int:
"""
Get time from granule name. It makes the assumption that a datetime is
specificed in the granule name, and it has the following format "YYYYddd",
where YYYY is 4 digit year and ddd is day of year (i.e. 1 to 365).

Note: This is a very narrow implmentation for a specific need. If you found
yourself needing to modify this function to accommodate more use cases, then
perhaps it is time to refactor this function to a more dynamic module.
"""

# rs search for a sub str which starts with 19 or 20, and has 7 digits
search_res = re.search('((?:19|20)[0-9]{2})([0-9]{3})', os.path.basename(granule))
if not search_res:
return None

year, days = search_res.groups()
year = int(year)
days = int(days)

# since datetime is set to 1/1 (+1 day), timedelta needs to -1 to cancel it out
return int((datetime.datetime(year, 1, 1) + datetime.timedelta(days-1)).timestamp())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to be careful about the UTC timezone.

For example with code

import pytz
aware = datetime.datetime(2011,8,15,8,15,12,0,pytz.UTC)

Or with the example in my comment above:

import pytz
datetime.strptime(filename, 'A%Y%j.L3m_CHL_chlor_a_4km.nc').replace(tzinfo=pytz.utc)

That would be more simple than the current version of the code and spare some lines of code.

I am not sure if the hardcoded -1 days is also something we would like to have for all the datasets. Maybe an additional optional configuration variable for the collection should give an offset for the min and offset for the max time.
In this case that would be -1 and -1 for both.

I guess the most difficult change in my request is to read from the collection configmap.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jasonmlkang did you have a chance to look at my comments. We're having similar need for AQACF project but of course the file naming convention is different so we would need to discuss how that could be configured.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Examples of file names from AQACF project:
V5GL01.HybridPM25.NorthAmerica.200807-200807.nc
V4NA03_PM25_NA_200401_200412-RH35.nc



class TileSummarizingProcessor(TileProcessor):

def __init__(self, dataset_name: str, *args, **kwargs):
Expand Down Expand Up @@ -92,7 +123,9 @@ def process(self, tile, dataset, *args, **kwargs):
logger.debug(f'find min max time')

try:
min_time, max_time = find_time_min_max(tile_data)
min_time, max_time = find_time_min_max(
tile_data, get_time_from_granule(tile.summary.granule)
)
logger.debug(f'set min max time')
tile_summary.stats.min_time = min_time
tile_summary.stats.max_time = max_time
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from datetime import datetime
import json
import unittest
from os import path
Expand Down Expand Up @@ -108,4 +109,39 @@ def test_hls_multiple_var_01(self):
new_tile = tile_summary_processor.process(tile=output_tile, dataset=ds)
self.assertEqual('[null, null, null, null, null, null, null, null, null, null, null]', new_tile.summary.standard_name, f'wrong new_tile.summary.standard_name')
self.assertEqual([None for _ in range(11)], json.loads(new_tile.summary.standard_name), f'unable to convert new_tile.summary.standard_name from JSON')
self.assertTrue(abs(new_tile.summary.stats.mean - 0.26523) < 0.001, f'mean value is not close expected: 0.26523. actual: {new_tile.summary.stats.mean}')
self.assertTrue(abs(new_tile.summary.stats.mean - 0.26523) < 0.001, f'mean value is not close expected: 0.26523. actual: {new_tile.summary.stats.mean}')

def test_get_time_from_granule(self):
"""
Test that TileSummarizingProcessor gets time from granule filename when time is not
present in the tile.
"""
reading_processor = GridReadingProcessor(
variable='chlor_a',
latitude='lat',
longitude='lon',
tile='tile'
)
relative_path = '../granules/A2017005.L3m_CHL_chlor_a_4km.nc'
granule_path = path.join(path.dirname(__file__), relative_path)

tile_summary = nexusproto.TileSummary()
tile_summary.granule = granule_path
tile_summary.data_var_name = json.dumps('chlor_a')

input_tile = nexusproto.NexusTile()
input_tile.summary.CopyFrom(tile_summary)

dims = {
'lat': slice(0, 30),
'lon': slice(0, 30),
'tile': slice(10, 11),
}


with xr.open_dataset(granule_path, decode_cf=True) as ds:
output_tile = reading_processor._generate_tile(ds, dims, input_tile)
tile_summary_processor = TileSummarizingProcessor('test')
new_tile = tile_summary_processor.process(tile=output_tile, dataset=ds)
self.assertEqual(new_tile.summary.stats.min_time, new_tile.summary.stats.max_time)
self.assertEqual(new_tile.summary.stats.min_time, int(datetime(2017, 1, 5).timestamp()))