Skip to content

Commit 37350d8

Browse files
Add create_dataset method
1 parent 4c31ead commit 37350d8

File tree

8 files changed

+366
-12
lines changed

8 files changed

+366
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- `tilebox-datasets`: Added `create_dataset` method to `Client` to create a new dataset.
13+
1014
## [0.45.0] - 2025-11-17
1115

1216
### Added

tilebox-datasets/tests/data/datasets.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,32 @@
22
from dataclasses import replace
33
from functools import lru_cache
44

5-
from google.protobuf.descriptor_pb2 import FileDescriptorProto, FileDescriptorSet
5+
from google.protobuf.descriptor_pb2 import FieldDescriptorProto, FileDescriptorProto, FileDescriptorSet
66
from hypothesis.strategies import (
77
DrawFn,
8+
booleans,
89
composite,
910
integers,
1011
just,
1112
lists,
1213
none,
1314
one_of,
15+
sampled_from,
1416
text,
1517
uuids,
1618
)
1719

1820
from tests.example_dataset.example_dataset_pb2 import DESCRIPTOR_PROTO
19-
from tilebox.datasets.data.datasets import AnnotatedType, Dataset, DatasetGroup, FieldAnnotation, ListDatasetsResponse
21+
from tilebox.datasets.data.datasets import (
22+
AnnotatedType,
23+
Dataset,
24+
DatasetGroup,
25+
DatasetKind,
26+
DatasetType,
27+
Field,
28+
FieldAnnotation,
29+
ListDatasetsResponse,
30+
)
2031
from tilebox.datasets.message_pool import register_once
2132

2233

@@ -28,6 +39,40 @@ def field_annotations(draw: DrawFn) -> FieldAnnotation:
2839
return FieldAnnotation(description, example_value)
2940

3041

42+
@composite
def fields(draw: DrawFn) -> Field:
    """Hypothesis strategy that builds a random ``Field``.

    The field gets a lowercase/underscore name, one of a fixed set of protobuf field types,
    an optional or repeated label, a random annotation and a random queryable flag.
    """
    proto_type = FieldDescriptorProto.Type
    proto_label = FieldDescriptorProto.Label

    field_name = draw(text(alphabet=string.ascii_lowercase + "_", min_size=3, max_size=25))

    type_choices = (
        proto_type.TYPE_STRING,
        proto_type.TYPE_BYTES,
        proto_type.TYPE_BOOL,
        proto_type.TYPE_INT64,
        proto_type.TYPE_UINT64,
        proto_type.TYPE_DOUBLE,
        proto_type.TYPE_MESSAGE,
    )
    drawn_type = draw(one_of(*[just(choice) for choice in type_choices]))

    # Only message-typed fields reference another type by name; every other type leaves it unset.
    if drawn_type == proto_type.TYPE_MESSAGE:
        message_type_name = f".datasets.v1.{field_name}"
    else:
        message_type_name = None

    drawn_label = draw(one_of(just(proto_label.LABEL_OPTIONAL), just(proto_label.LABEL_REPEATED)))

    # Arguments are evaluated left to right, so the annotation is drawn before the queryable flag.
    return Field(
        FieldDescriptorProto(name=field_name, type=drawn_type, type_name=message_type_name, label=drawn_label),
        draw(field_annotations()),
        draw(booleans()),
    )
66+
67+
68+
@composite
def dataset_types(draw: DrawFn) -> DatasetType:
    """Hypothesis strategy that builds a random ``DatasetType``.

    The kind is either a ``DatasetKind`` member or ``None``; the type has between one and five fields.
    """
    drawn_kind = draw(one_of(sampled_from(DatasetKind), none()))
    drawn_fields = draw(lists(fields(), min_size=1, max_size=5))
    return DatasetType(kind=drawn_kind, fields=drawn_fields)
74+
75+
3176
@lru_cache
3277
def example_dataset_type() -> AnnotatedType:
3378
descriptor = FileDescriptorProto.FromString(DESCRIPTOR_PROTO)

tilebox-datasets/tests/data/test_datasets.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,40 @@
11
from hypothesis import given
22

3-
from tests.data.datasets import annotated_types, dataset_groups, datasets, field_annotations, list_datasets_responses
4-
from tilebox.datasets.data.datasets import AnnotatedType, Dataset, DatasetGroup, FieldAnnotation, ListDatasetsResponse
3+
from tests.data.datasets import (
4+
annotated_types,
5+
dataset_groups,
6+
dataset_types,
7+
datasets,
8+
field_annotations,
9+
fields,
10+
list_datasets_responses,
11+
)
12+
from tilebox.datasets.data.datasets import (
13+
AnnotatedType,
14+
Dataset,
15+
DatasetGroup,
16+
DatasetType,
17+
Field,
18+
FieldAnnotation,
19+
ListDatasetsResponse,
20+
)
521

622

723
@given(field_annotations())
def test_field_annotations_to_message_and_back(annotation: FieldAnnotation) -> None:
    """A field annotation is unchanged after a round trip through its protobuf message."""
    message = annotation.to_message()
    restored = FieldAnnotation.from_message(message)
    assert restored == annotation
1026

1127

28+
@given(fields())
def test_fields_to_message_and_back(field: Field) -> None:
    """A field is unchanged after a round trip through its protobuf message."""
    message = field.to_message()
    restored = Field.from_message(message)
    assert restored == field
31+
32+
33+
@given(dataset_types())
def test_dataset_types_to_message_and_back(dataset_type: DatasetType) -> None:
    """A dataset type is unchanged after a round trip through its protobuf message."""
    message = dataset_type.to_message()
    restored = DatasetType.from_message(message)
    assert restored == dataset_type
36+
37+
1238
@given(annotated_types())
1339
def test_annotated_types_to_message_and_back(annotated_type: AnnotatedType) -> None:
1440
assert AnnotatedType.from_message(annotated_type.to_message()) == annotated_type

tilebox-datasets/tilebox/datasets/aio/client.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tilebox.datasets.aio.dataset import DatasetClient
66
from tilebox.datasets.client import Client as BaseClient
77
from tilebox.datasets.client import token_from_env
8+
from tilebox.datasets.data.datasets import DatasetKind, FieldDict
89
from tilebox.datasets.datasets.v1.collections_pb2_grpc import CollectionServiceStub
910
from tilebox.datasets.datasets.v1.data_access_pb2_grpc import DataAccessServiceStub
1011
from tilebox.datasets.datasets.v1.data_ingestion_pb2_grpc import DataIngestionServiceStub
@@ -32,6 +33,22 @@ def __init__(self, *, url: str = "https://api.tilebox.com", token: str | None =
3233
)
3334
self._client = BaseClient(service)
3435

36+
async def create_dataset(
    self,
    kind: DatasetKind,
    code_name: str,
    fields: list[FieldDict],
    *,
    name: str | None = None,
    summary: str | None = None,
) -> DatasetClient:
    """Create a new dataset.

    Args:
        kind: The kind of the dataset.
        code_name: The code name of the dataset.
        fields: The fields of the dataset.
        name: The name of the dataset. Defaults to the code name.
        summary: A short summary of the dataset. Defaults to an empty string.

    Returns:
        A client for the created dataset.
    """
    display_name = code_name if name is None else name
    summary_text = "" if summary is None else summary
    return await self._client.create_dataset(kind, code_name, fields, display_name, summary_text, DatasetClient)
51+
3552
async def datasets(self) -> Group:
    """Fetch all available datasets as a group of ``DatasetClient`` instances."""
    return await self._client.datasets(DatasetClient)
3754

tilebox-datasets/tilebox/datasets/client.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from promise import Promise
88

99
from _tilebox.grpc.channel import parse_channel_info
10-
from tilebox.datasets.data.datasets import Dataset, DatasetGroup, ListDatasetsResponse
10+
from tilebox.datasets.data.datasets import Dataset, DatasetGroup, DatasetKind, FieldDict, ListDatasetsResponse
1111
from tilebox.datasets.group import Group
1212
from tilebox.datasets.message_pool import register_once
1313
from tilebox.datasets.service import TileboxDatasetService
@@ -26,6 +26,27 @@ class Client:
2626
def __init__(self, service: TileboxDatasetService) -> None:
    """Store the dataset service that this client's methods send their requests through."""
    self._service = service
2828

29+
def create_dataset(  # noqa: PLR0913
    self, kind: DatasetKind, code_name: str, fields: list[FieldDict], name: str, summary: str, dataset_type: type[T]
) -> Promise[T]:
    """Create a new dataset and wrap it in a dataset client of the requested class.

    Args:
        kind: The kind of the dataset.
        code_name: The code name of the dataset.
        fields: The fields of the dataset.
        name: The name of the dataset.
        summary: A short summary of the dataset.
        dataset_type: The class used to wrap the created dataset. It is called with this client's
            service and the created dataset.

    Returns:
        A promise resolving to the created dataset, wrapped in an instance of ``dataset_type``.
    """
    created = self._service.create_dataset(kind, code_name, fields, name, summary)
    # The service result passes through _ensure_registered before it is wrapped.
    registered = created.then(_ensure_registered)
    return registered.then(lambda dataset: dataset_type(self._service, dataset))
49+
2950
def datasets(self, dataset_type: type[T]) -> Promise[Group]:
3051
"""Fetch all available datasets."""
3152
return (
@@ -40,11 +61,10 @@ def datasets(self, dataset_type: type[T]) -> Promise[Group]:
4061
)
4162

4263
def dataset(self, slug: str, dataset_type: type[T]) -> Promise[T]:
43-
"""
44-
Get a dataset by its slug, e.g. `open_data.copernicus.sentinel1_sar`.
64+
"""Get a dataset by its slug, e.g. `open_data.copernicus.sentinel1_sar`.
4565
4666
Args:
47-
slug: The slug of the dataset
67+
slug: The slug of the dataset.
4868
4969
Returns:
5070
The dataset if it exists.

tilebox-datasets/tilebox/datasets/data/datasets.py

Lines changed: 129 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,29 @@
11
from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import Enum
from typing import TypedDict, get_args, get_origin
from uuid import UUID

import numpy as np
from google.protobuf import duration_pb2, timestamp_pb2
from google.protobuf.descriptor_pb2 import FieldDescriptorProto, FileDescriptorSet
from shapely import Geometry
from typing_extensions import NotRequired, Required

from tilebox.datasets.datasets.v1 import core_pb2, dataset_type_pb2, datasets_pb2, well_known_types_pb2
from tilebox.datasets.uuid import uuid_message_to_optional_uuid, uuid_message_to_uuid, uuid_to_uuid_message
815

916

17+
class DatasetKind(Enum):
    """The kind of a dataset, backed by the protobuf ``DATASET_KIND_*`` values."""

    TEMPORAL = dataset_type_pb2.DATASET_KIND_TEMPORAL
    """A dataset that contains a timestamp field."""
    SPATIOTEMPORAL = dataset_type_pb2.DATASET_KIND_SPATIOTEMPORAL
    """A dataset that contains a timestamp field and a geometry field."""


# Reverse lookup from the raw protobuf enum value to its DatasetKind member.
_dataset_kind_int_to_enum = {member.value: member for member in DatasetKind}
25+
26+
1027
@dataclass(frozen=True)
1128
class FieldAnnotation:
1229
description: str
@@ -20,6 +37,116 @@ def to_message(self) -> dataset_type_pb2.FieldAnnotation:
2037
return dataset_type_pb2.FieldAnnotation(description=self.description, example_value=self.example_value)
2138

2239

40+
# Python types accepted for the ``type`` entry of a ``FieldDict``: every supported scalar type,
# and a list of each of them.
_FieldPythonType = (
    type[str]
    | type[list[str]]
    | type[bytes]
    | type[list[bytes]]
    | type[bool]
    | type[list[bool]]
    | type[int]
    | type[list[int]]
    | type[np.uint64]
    | type[list[np.uint64]]
    | type[float]
    | type[list[float]]
    | type[timedelta]
    | type[list[timedelta]]
    | type[datetime]
    | type[list[datetime]]
    | type[UUID]
    | type[list[UUID]]
    | type[Geometry]
    | type[list[Geometry]]
)


class FieldDict(TypedDict):
    """Dictionary description of a dataset field, as consumed by ``Field.from_dict``.

    ``name`` and ``type`` are required. ``description`` and ``example_value`` may be omitted,
    in which case ``Field.from_dict`` uses an empty string for them.
    """

    name: Required[str]
    type: Required[_FieldPythonType]
    description: NotRequired[str]
    example_value: NotRequired[str]
66+
67+
68+
def _message_type_info(message_class: type) -> tuple[FieldDescriptorProto.Type.ValueType, str]:
    """Return the (field type, type name) pair for a field holding the given protobuf message class."""
    return (FieldDescriptorProto.TYPE_MESSAGE, f".{message_class.DESCRIPTOR.full_name}")


# Maps each supported Python type to the protobuf field type used for it and, for message-typed
# fields, the fully qualified (dot-prefixed) name of the message; scalar types carry no type name.
_TYPE_INFO: dict[type, tuple[FieldDescriptorProto.Type.ValueType, str | None]] = {
    # Scalar protobuf types.
    str: (FieldDescriptorProto.TYPE_STRING, None),
    bytes: (FieldDescriptorProto.TYPE_BYTES, None),
    bool: (FieldDescriptorProto.TYPE_BOOL, None),
    int: (FieldDescriptorProto.TYPE_INT64, None),
    np.uint64: (FieldDescriptorProto.TYPE_UINT64, None),
    float: (FieldDescriptorProto.TYPE_DOUBLE, None),
    # Types represented by a protobuf message.
    timedelta: _message_type_info(duration_pb2.Duration),
    datetime: _message_type_info(timestamp_pb2.Timestamp),
    UUID: _message_type_info(well_known_types_pb2.UUID),
    Geometry: _message_type_info(well_known_types_pb2.Geometry),
}
80+
81+
82+
@dataclass(frozen=True)
class Field:
    """A single dataset field: its protobuf descriptor, its annotation and its queryable flag."""

    descriptor: FieldDescriptorProto
    annotation: FieldAnnotation
    queryable: bool

    @classmethod
    def from_message(cls, field: dataset_type_pb2.Field) -> "Field":
        """Build a ``Field`` from its protobuf message."""
        return cls(
            descriptor=field.descriptor,
            annotation=FieldAnnotation.from_message(field.annotation),
            queryable=field.queryable,
        )

    @classmethod
    def from_dict(cls, field: FieldDict) -> "Field":
        """Build a ``Field`` from its dictionary description.

        A ``list[X]`` type produces a repeated field of ``X``; any other type produces an optional
        field. The (element) type is resolved against the supported types, also accepting
        subclasses of a supported type (for example a concrete geometry class for ``Geometry``).
        A missing description or example value becomes an empty string, and the resulting field
        is never queryable.

        Args:
            field: The dictionary description of the field.

        Returns:
            The field described by the dictionary.

        Raises:
            KeyError: If the (element) type is neither a supported type nor a subclass of one.
        """
        declared_type = field["type"]
        if get_origin(declared_type) is list:
            label = FieldDescriptorProto.Label.LABEL_REPEATED
            args = get_args(declared_type)
            inner_type = args[0] if args else declared_type
        else:
            label = FieldDescriptorProto.Label.LABEL_OPTIONAL
            inner_type = declared_type

        # Walk the MRO so that subclasses of a supported type resolve to the entry of their closest
        # supported base. An exact match is always found first, since a class heads its own MRO.
        # Values that are not classes have no MRO and are looked up as they are.
        candidates = getattr(inner_type, "__mro__", (inner_type,))
        type_info = next((_TYPE_INFO[candidate] for candidate in candidates if candidate in _TYPE_INFO), None)
        if type_info is None:
            supported = ", ".join(supported_type.__name__ for supported_type in _TYPE_INFO)
            raise KeyError(
                f"Unsupported type {inner_type!r} for field {field['name']!r}. "
                f"Supported types are {supported}, or a list of one of them."
            )
        field_type, field_type_name = type_info

        return cls(
            descriptor=FieldDescriptorProto(
                name=field["name"],
                type=field_type,
                type_name=field_type_name,
                label=label,
            ),
            annotation=FieldAnnotation(
                description=field.get("description", ""),
                example_value=field.get("example_value", ""),
            ),
            queryable=False,
        )

    def to_message(self) -> dataset_type_pb2.Field:
        """Convert this field to its protobuf message."""
        return dataset_type_pb2.Field(
            descriptor=self.descriptor,
            annotation=self.annotation.to_message(),
            queryable=self.queryable,
        )
129+
130+
131+
@dataclass(frozen=True)
class DatasetType:
    """The kind of a dataset together with the fields it consists of."""

    kind: DatasetKind | None
    fields: list[Field]

    @classmethod
    def from_message(cls, dataset_type: dataset_type_pb2.DatasetType) -> "DatasetType":
        """Build a ``DatasetType`` from its protobuf message.

        A kind value without a matching ``DatasetKind`` member results in a kind of ``None``.
        """
        parsed_kind = _dataset_kind_int_to_enum.get(dataset_type.kind)
        parsed_fields = [Field.from_message(message) for message in dataset_type.fields]
        return cls(kind=parsed_kind, fields=parsed_fields)

    def to_message(self) -> dataset_type_pb2.DatasetType:
        """Convert this dataset type to its protobuf message.

        A missing kind is sent as ``DATASET_KIND_UNSPECIFIED``.
        """
        kind_value = dataset_type_pb2.DATASET_KIND_UNSPECIFIED
        if self.kind:
            kind_value = self.kind.value
        field_messages = [entry.to_message() for entry in self.fields]
        return dataset_type_pb2.DatasetType(kind=kind_value, fields=field_messages)
148+
149+
23150
@dataclass(frozen=True)
24151
class AnnotatedType:
25152
descriptor_set: FileDescriptorSet

0 commit comments

Comments
 (0)