Skip to content

Commit c0fa62c

Browse files
Add create_dataset method
1 parent 4c31ead commit c0fa62c

File tree

8 files changed

+365
-12
lines changed

8 files changed

+365
-12
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- `tilebox-datasets`: Added `create_dataset` method to `Client` to create a new dataset.
13+
1014
## [0.45.0] - 2025-11-17
1115

1216
### Added

tilebox-datasets/tests/data/datasets.py

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,32 @@
22
from dataclasses import replace
33
from functools import lru_cache
44

5-
from google.protobuf.descriptor_pb2 import FileDescriptorProto, FileDescriptorSet
5+
from google.protobuf.descriptor_pb2 import FieldDescriptorProto, FileDescriptorProto, FileDescriptorSet
66
from hypothesis.strategies import (
77
DrawFn,
8+
booleans,
89
composite,
910
integers,
1011
just,
1112
lists,
1213
none,
1314
one_of,
15+
sampled_from,
1416
text,
1517
uuids,
1618
)
1719

1820
from tests.example_dataset.example_dataset_pb2 import DESCRIPTOR_PROTO
19-
from tilebox.datasets.data.datasets import AnnotatedType, Dataset, DatasetGroup, FieldAnnotation, ListDatasetsResponse
21+
from tilebox.datasets.data.datasets import (
22+
AnnotatedType,
23+
Dataset,
24+
DatasetGroup,
25+
DatasetKind,
26+
DatasetType,
27+
Field,
28+
FieldAnnotation,
29+
ListDatasetsResponse,
30+
)
2031
from tilebox.datasets.message_pool import register_once
2132

2233

@@ -28,6 +39,40 @@ def field_annotations(draw: DrawFn) -> FieldAnnotation:
2839
return FieldAnnotation(description, example_value)
2940

3041

42+
@composite
def fields(draw: DrawFn) -> Field:
    """A hypothesis strategy for generating random fields.

    Each generated field has a lowercase/underscore name, one of a fixed set of protobuf field
    types, an optional or repeated label, a random annotation and a random queryable flag.
    """
    name = draw(text(alphabet=string.ascii_lowercase + "_", min_size=3, max_size=25))
    field_type = draw(
        sampled_from(
            [
                FieldDescriptorProto.Type.TYPE_STRING,
                FieldDescriptorProto.Type.TYPE_BYTES,
                FieldDescriptorProto.Type.TYPE_BOOL,
                FieldDescriptorProto.Type.TYPE_INT64,
                FieldDescriptorProto.Type.TYPE_UINT64,
                FieldDescriptorProto.Type.TYPE_DOUBLE,
                FieldDescriptorProto.Type.TYPE_MESSAGE,
            ]
        )
    )
    # Only message-typed fields carry a type name; every other type leaves it unset.
    type_name = f".datasets.v1.{name}" if field_type == FieldDescriptorProto.Type.TYPE_MESSAGE else None
    label = draw(
        sampled_from([FieldDescriptorProto.Label.LABEL_OPTIONAL, FieldDescriptorProto.Label.LABEL_REPEATED])
    )
    descriptor = FieldDescriptorProto(name=name, type=field_type, type_name=type_name, label=label)

    annotation = draw(field_annotations())
    queryable = draw(booleans())
    return Field(descriptor, annotation, queryable)
66+
67+
68+
@composite
def dataset_types(draw: DrawFn) -> DatasetType:
    """A hypothesis strategy for generating random dataset types.

    The kind is either one of the `DatasetKind` members or None, and the type has between
    one and five randomly generated fields.
    """
    kind_strategy = one_of(sampled_from(DatasetKind), none())
    fields_strategy = lists(fields(), min_size=1, max_size=5)

    kind = draw(kind_strategy)
    dataset_fields = draw(fields_strategy)
    return DatasetType(kind, dataset_fields)
74+
75+
3176
@lru_cache
3277
def example_dataset_type() -> AnnotatedType:
3378
descriptor = FileDescriptorProto.FromString(DESCRIPTOR_PROTO)

tilebox-datasets/tests/data/test_datasets.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,40 @@
11
from hypothesis import given
22

3-
from tests.data.datasets import annotated_types, dataset_groups, datasets, field_annotations, list_datasets_responses
4-
from tilebox.datasets.data.datasets import AnnotatedType, Dataset, DatasetGroup, FieldAnnotation, ListDatasetsResponse
3+
from tests.data.datasets import (
4+
annotated_types,
5+
dataset_groups,
6+
dataset_types,
7+
datasets,
8+
field_annotations,
9+
fields,
10+
list_datasets_responses,
11+
)
12+
from tilebox.datasets.data.datasets import (
13+
AnnotatedType,
14+
Dataset,
15+
DatasetGroup,
16+
DatasetType,
17+
Field,
18+
FieldAnnotation,
19+
ListDatasetsResponse,
20+
)
521

622

723
@given(field_annotations())
def test_field_annotations_to_message_and_back(annotation: FieldAnnotation) -> None:
    """A field annotation survives a round trip through its protobuf message."""
    message = annotation.to_message()
    assert FieldAnnotation.from_message(message) == annotation
1026

1127

28+
@given(fields())
def test_fields_to_message_and_back(field: Field) -> None:
    """A field survives a round trip through its protobuf message."""
    message = field.to_message()
    assert Field.from_message(message) == field
31+
32+
33+
@given(dataset_types())
def test_dataset_types_to_message_and_back(dataset_type: DatasetType) -> None:
    """A dataset type survives a round trip through its protobuf message."""
    message = dataset_type.to_message()
    assert DatasetType.from_message(message) == dataset_type
36+
37+
1238
@given(annotated_types())
def test_annotated_types_to_message_and_back(annotated_type: AnnotatedType) -> None:
    """An annotated type survives a round trip through its protobuf message."""
    message = annotated_type.to_message()
    assert AnnotatedType.from_message(message) == annotated_type

tilebox-datasets/tilebox/datasets/aio/client.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tilebox.datasets.aio.dataset import DatasetClient
66
from tilebox.datasets.client import Client as BaseClient
77
from tilebox.datasets.client import token_from_env
8+
from tilebox.datasets.data.datasets import DatasetKind, FieldDict
89
from tilebox.datasets.datasets.v1.collections_pb2_grpc import CollectionServiceStub
910
from tilebox.datasets.datasets.v1.data_access_pb2_grpc import DataAccessServiceStub
1011
from tilebox.datasets.datasets.v1.data_ingestion_pb2_grpc import DataIngestionServiceStub
@@ -32,6 +33,22 @@ def __init__(self, *, url: str = "https://api.tilebox.com", token: str | None =
3233
)
3334
self._client = BaseClient(service)
3435

36+
async def create_dataset(
    self,
    kind: DatasetKind,
    code_name: str,
    fields: list[FieldDict],
    *,
    name: str | None = None,
    summary: str | None = None,
) -> DatasetClient:
    """Create a new dataset.

    Args:
        kind: The kind of the dataset.
        code_name: The code name of the dataset.
        fields: The fields of the dataset.
        name: The name of the dataset. Defaults to the code name.
        summary: A short summary of the dataset. Defaults to an empty string.

    Returns:
        The created dataset.
    """
    # Fill in the defaults here; the wrapped client takes both values as required arguments.
    dataset_name = code_name if name is None else name
    dataset_summary = "" if summary is None else summary

    return await self._client.create_dataset(kind, code_name, fields, dataset_name, dataset_summary, DatasetClient)
51+
3552
async def datasets(self) -> Group:
3653
return await self._client.datasets(DatasetClient)
3754

tilebox-datasets/tilebox/datasets/client.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from promise import Promise
88

99
from _tilebox.grpc.channel import parse_channel_info
10-
from tilebox.datasets.data.datasets import Dataset, DatasetGroup, ListDatasetsResponse
10+
from tilebox.datasets.data.datasets import Dataset, DatasetGroup, DatasetKind, FieldDict, ListDatasetsResponse
1111
from tilebox.datasets.group import Group
1212
from tilebox.datasets.message_pool import register_once
1313
from tilebox.datasets.service import TileboxDatasetService
@@ -26,6 +26,27 @@ class Client:
2626
def __init__(self, service: TileboxDatasetService) -> None:
2727
self._service = service
2828

29+
def create_dataset(  # noqa: PLR0913
    self, kind: DatasetKind, code_name: str, fields: list[FieldDict], name: str, summary: str, dataset_type: type[T]
) -> Promise[T]:
    """Create a new dataset.

    Args:
        kind: The kind of the dataset.
        code_name: The code name of the dataset.
        fields: The fields of the dataset.
        name: The name of the dataset.
        summary: A short summary of the dataset.
        dataset_type: The class to wrap the created dataset in. It is called with the service
            and the created dataset.

    Returns:
        A promise of the created dataset, wrapped in the given dataset type.
    """
    created = self._service.create_dataset(kind, code_name, fields, name, summary)
    registered = created.then(_ensure_registered)
    return registered.then(lambda dataset: dataset_type(self._service, dataset))
49+
2950
def datasets(self, dataset_type: type[T]) -> Promise[Group]:
3051
"""Fetch all available datasets."""
3152
return (
@@ -40,11 +61,10 @@ def datasets(self, dataset_type: type[T]) -> Promise[Group]:
4061
)
4162

4263
def dataset(self, slug: str, dataset_type: type[T]) -> Promise[T]:
43-
"""
44-
Get a dataset by its slug, e.g. `open_data.copernicus.sentinel1_sar`.
64+
"""Get a dataset by its slug, e.g. `open_data.copernicus.sentinel1_sar`.
4565
4666
Args:
47-
slug: The slug of the dataset
67+
slug: The slug of the dataset.
4868
4969
Returns:
5070
The dataset if it exists.

tilebox-datasets/tilebox/datasets/data/datasets.py

Lines changed: 128 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,28 @@
11
from dataclasses import dataclass
2+
from datetime import datetime, timedelta
3+
from enum import Enum
4+
from typing import NotRequired, Required, TypedDict, get_args, get_origin
25
from uuid import UUID
36

4-
from google.protobuf.descriptor_pb2 import FileDescriptorSet
7+
import numpy as np
8+
from google.protobuf import duration_pb2, timestamp_pb2
9+
from google.protobuf.descriptor_pb2 import FieldDescriptorProto, FileDescriptorSet
10+
from shapely import Geometry
511

6-
from tilebox.datasets.datasets.v1 import core_pb2, dataset_type_pb2, datasets_pb2
12+
from tilebox.datasets.datasets.v1 import core_pb2, dataset_type_pb2, datasets_pb2, well_known_types_pb2
713
from tilebox.datasets.uuid import uuid_message_to_optional_uuid, uuid_message_to_uuid, uuid_to_uuid_message
814

915

16+
class DatasetKind(Enum):
    """The kind of a dataset; each member wraps the matching `dataset_type_pb2` constant."""

    TEMPORAL = dataset_type_pb2.DATASET_KIND_TEMPORAL
    """A dataset that contains a timestamp field."""
    SPATIOTEMPORAL = dataset_type_pb2.DATASET_KIND_SPATIOTEMPORAL
    """A dataset that contains a timestamp field and a geometry field."""
21+
22+
23+
# Reverse lookup from the protobuf enum value to the corresponding DatasetKind member.
_dataset_kind_int_to_enum = {member.value: member for member in DatasetKind}
24+
25+
1026
@dataclass(frozen=True)
1127
class FieldAnnotation:
1228
description: str
@@ -20,6 +36,116 @@ def to_message(self) -> dataset_type_pb2.FieldAnnotation:
2036
return dataset_type_pb2.FieldAnnotation(description=self.description, example_value=self.example_value)
2137

2238

39+
class FieldDict(TypedDict):
    """Dictionary describing a single dataset field.

    `name` and `type` are required, `description` and `example_value` are optional. The type is
    either one of the supported Python types or a `list` of one of them.
    """

    name: Required[str]
    # each supported type is listed next to its list counterpart
    type: Required[
        type[str] | type[list[str]]
        | type[bytes] | type[list[bytes]]
        | type[bool] | type[list[bool]]
        | type[int] | type[list[int]]
        | type[np.uint64] | type[list[np.uint64]]
        | type[float] | type[list[float]]
        | type[timedelta] | type[list[timedelta]]
        | type[datetime] | type[list[datetime]]
        | type[UUID] | type[list[UUID]]
        | type[Geometry] | type[list[Geometry]]
    ]
    description: NotRequired[str]
    example_value: NotRequired[str]
65+
66+
67+
# Maps each supported Python type to its protobuf field type and, for message types, the
# fully qualified (dot-prefixed) message type name. Scalar types have no type name.
_TYPE_INFO: dict[type, tuple[FieldDescriptorProto.Type.ValueType, str | None]] = {
    # scalar types
    str: (FieldDescriptorProto.TYPE_STRING, None),
    bytes: (FieldDescriptorProto.TYPE_BYTES, None),
    bool: (FieldDescriptorProto.TYPE_BOOL, None),
    int: (FieldDescriptorProto.TYPE_INT64, None),
    np.uint64: (FieldDescriptorProto.TYPE_UINT64, None),
    float: (FieldDescriptorProto.TYPE_DOUBLE, None),
    # message types
    timedelta: (FieldDescriptorProto.TYPE_MESSAGE, f".{duration_pb2.Duration.DESCRIPTOR.full_name}"),
    datetime: (FieldDescriptorProto.TYPE_MESSAGE, f".{timestamp_pb2.Timestamp.DESCRIPTOR.full_name}"),
    UUID: (FieldDescriptorProto.TYPE_MESSAGE, f".{well_known_types_pb2.UUID.DESCRIPTOR.full_name}"),
    Geometry: (FieldDescriptorProto.TYPE_MESSAGE, f".{well_known_types_pb2.Geometry.DESCRIPTOR.full_name}"),
}
79+
80+
81+
@dataclass(frozen=True)
class Field:
    """A single dataset field: its protobuf descriptor, its annotation and its queryable flag."""

    descriptor: FieldDescriptorProto
    annotation: FieldAnnotation
    queryable: bool

    @classmethod
    def from_message(cls, field: dataset_type_pb2.Field) -> "Field":
        """Build a field from its protobuf message."""
        return cls(
            descriptor=field.descriptor,
            annotation=FieldAnnotation.from_message(field.annotation),
            queryable=field.queryable,
        )

    @classmethod
    def from_dict(cls, field: FieldDict) -> "Field":
        """Build a field from a `FieldDict`.

        A `list[X]` type produces a repeated field of `X`, any other type an optional field.
        Subclasses of a supported type are resolved to the entry of their closest supported base
        class. Missing `description` / `example_value` keys default to an empty string, and the
        resulting field is never queryable.

        Args:
            field: The dictionary describing the field.

        Returns:
            The field described by the dictionary.

        Raises:
            KeyError: If a required key is missing, or if the field type is not supported.
        """
        declared_type = field["type"]
        if get_origin(declared_type) is list:
            label = FieldDescriptorProto.Label.LABEL_REPEATED
            args = get_args(declared_type)
            inner_type = args[0] if args else declared_type
        else:
            label = FieldDescriptorProto.Label.LABEL_OPTIONAL
            inner_type = declared_type

        # Walk the MRO instead of doing an exact lookup, so that subclasses of a supported type
        # are accepted as well. A type always comes first in its own MRO, so exact matches
        # resolve exactly as a direct lookup would.
        type_info = next(
            (_TYPE_INFO[base] for base in getattr(inner_type, "__mro__", ()) if base in _TYPE_INFO),
            None,
        )
        if type_info is None:
            supported = ", ".join(supported_type.__name__ for supported_type in _TYPE_INFO)
            raise KeyError(
                f"Unsupported type {inner_type!r} for field {field['name']!r}. "
                f"Supported types are {supported}, or a list of one of them."
            )
        field_type, field_type_name = type_info

        return cls(
            descriptor=FieldDescriptorProto(
                name=field["name"],
                type=field_type,
                type_name=field_type_name,
                label=label,
            ),
            annotation=FieldAnnotation(
                description=field.get("description", ""),
                example_value=field.get("example_value", ""),
            ),
            queryable=False,
        )

    def to_message(self) -> dataset_type_pb2.Field:
        """Convert the field to its protobuf message."""
        return dataset_type_pb2.Field(
            descriptor=self.descriptor,
            annotation=self.annotation.to_message(),
            queryable=self.queryable,
        )
128+
129+
130+
@dataclass(frozen=True)
class DatasetType:
    """The kind of a dataset, if any, together with its fields."""

    kind: DatasetKind | None
    fields: list[Field]

    @classmethod
    def from_message(cls, dataset_type: dataset_type_pb2.DatasetType) -> "DatasetType":
        """Build a dataset type from its protobuf message."""
        # message kinds without a DatasetKind counterpart become None
        kind = _dataset_kind_int_to_enum.get(dataset_type.kind)
        parsed_fields = [Field.from_message(message) for message in dataset_type.fields]
        return cls(kind=kind, fields=parsed_fields)

    def to_message(self) -> dataset_type_pb2.DatasetType:
        """Convert the dataset type to its protobuf message."""
        if self.kind:
            kind_value = self.kind.value
        else:
            # no kind set: fall back to the unspecified protobuf kind
            kind_value = dataset_type_pb2.DATASET_KIND_UNSPECIFIED

        field_messages = [field.to_message() for field in self.fields]
        return dataset_type_pb2.DatasetType(kind=kind_value, fields=field_messages)
147+
148+
23149
@dataclass(frozen=True)
24150
class AnnotatedType:
25151
descriptor_set: FileDescriptorSet

0 commit comments

Comments
 (0)