Skip to content

Commit 686ad64

Browse files
Fix timeseries dataset integration into workflows (#2)
1 parent b1c54a3 commit 686ad64

File tree

4 files changed

+28
-10
lines changed

4 files changed

+28
-10
lines changed

.vscode/settings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
{
22
"editor.formatOnSave": true,
3-
"editor.rulers": [120],
3+
"editor.rulers": [
4+
120
5+
],
46
"[python]": {
57
"editor.defaultFormatter": "charliermarsh.ruff",
68
"editor.formatOnSave": true,

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.37.1] - 2025-06-10
11+
1012
### Fixed
1113

14+
- `tilebox-datasets`: Fixed a bug in `TimeseriesDatasetChunk.from_message` relying on incorrect bool assumptions about
15+
missing protobuf fields.
1216
- `tilebox-grpc`: More robust parsing of GRPC channel URLs.
17+
- `tilebox-workflows`: Fixed a bug in the timeseries interceptor that resulted in an error when accessing a collection.
1318

1419
## [0.37.0] - 2025-06-06
1520

@@ -185,7 +190,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
185190
- Released packages: `tilebox-datasets`, `tilebox-workflows`, `tilebox-storage`, `tilebox-grpc`
186191

187192

188-
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.37.0...HEAD
193+
[Unreleased]: https://github.com/tilebox/tilebox-python/compare/v0.37.1...HEAD
194+
[0.37.1]: https://github.com/tilebox/tilebox-python/compare/v0.37.0...v0.37.1
189195
[0.37.0]: https://github.com/tilebox/tilebox-python/compare/v0.36.1...v0.37.0
190196
[0.36.1]: https://github.com/tilebox/tilebox-python/compare/v0.36.0...v0.36.1
191197
[0.36.0]: https://github.com/tilebox/tilebox-python/compare/v0.35.0...v0.36.0

tilebox-datasets/tilebox/datasets/data/timeseries.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,13 @@ class TimeseriesDatasetChunk:
2121
@classmethod
2222
def from_message(cls, chunk: timeseries_pb2.TimeseriesDatasetChunk) -> "TimeseriesDatasetChunk":
2323
datapoint_interval = None
24-
if chunk.datapoint_interval and chunk.datapoint_interval.start_id and chunk.datapoint_interval.end_id:
24+
if (
25+
chunk.datapoint_interval
26+
and chunk.datapoint_interval.start_id
27+
and chunk.datapoint_interval.end_id
28+
and chunk.datapoint_interval.start_id.uuid
29+
and chunk.datapoint_interval.end_id.uuid
30+
):
2531
datapoint_interval = DatapointInterval.from_message(chunk.datapoint_interval)
2632

2733
time_interval = None

tilebox-workflows/tilebox/workflows/timeseries.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,19 +22,20 @@
2222

2323

2424
@execution_interceptor
25-
def _timeseries_dataset_chunk(task: Task, call_next: ForwardExecution, context: ExecutionContext) -> None:
25+
def _timeseries_dataset_chunk(task: Task, call_next: ForwardExecution, context: ExecutionContext) -> None: # noqa: C901
2626
if not isinstance(task, TimeseriesTask):
2727
raise TypeError("Task is not a timeseries task. Inherit from TimeseriesTask to mark it as such.")
2828

2929
chunk: TimeseriesDatasetChunk = task.timeseries_data # type: ignore[attr-defined]
3030

31-
# let's get the collection object
32-
dataset = context.runner_context.datasets_client._dataset_by_id(str(chunk.dataset_id)) # type: ignore[attr-defined] # noqa: SLF001
33-
collection = dataset.collection("unknown") # dummy collection, we will inject the right id below:
31+
# let's get a collection client
32+
datasets_client = context.runner_context.datasets_client
33+
dataset = datasets_client._dataset_by_id(str(chunk.dataset_id)) # type: ignore[attr-defined] # noqa: SLF001
3434
# we already know the collection id, so we can skip the lookup (we don't know the name, but don't need it)
35-
collection._info = CollectionInfo(Collection(chunk.collection_id, "unknown"), None, None) # noqa: SLF001
35+
collection_info = CollectionInfo(Collection(chunk.collection_id, "unknown"), None, None)
36+
collection = CollectionClient(dataset, collection_info)
3637

37-
# leaf case: we are already executing a specific batch of datapoints fitting in the chunk size, so let's load them and process them
38+
# leaf case: we are already executing a specific batch of datapoints fitting in the chunk size, so let's load them
3839
if chunk.datapoint_interval:
3940
datapoint_interval = (chunk.datapoint_interval.start_id, chunk.datapoint_interval.end_id)
4041
# we already are a leaf task executing for a specific datapoint interval:
@@ -44,6 +45,9 @@ def _timeseries_dataset_chunk(task: Task, call_next: ForwardExecution, context:
4445
skip_data=False,
4546
show_progress=False,
4647
)
48+
if not datapoints:
49+
return # no datapoints in the interval -> we are done
50+
4751
for i in range(datapoints.sizes["time"]):
4852
datapoint = datapoints.isel(time=i)
4953
call_next(context, datapoint) # type: ignore[call-arg]
@@ -88,7 +92,7 @@ def _timeseries_dataset_chunk(task: Task, call_next: ForwardExecution, context:
8892

8993
subtasks = [replace(task, timeseries_data=sub_chunk) for sub_chunk in sub_chunks] # type: ignore[misc]
9094
if len(subtasks) > 0:
91-
context.submit_batch(subtasks)
95+
context.submit_subtasks(subtasks)
9296

9397
return
9498

0 commit comments

Comments (0)