From 44b4473c25585afa4e77cc9a3e3720b920bdd6d0 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 20:55:38 +0000 Subject: [PATCH 01/20] Begin examining how to best add structured array support to Zarr v3 driver --- tensorstore/driver/zarr3/BUILD | 21 +- tensorstore/driver/zarr3/driver.cc | 41 +- tensorstore/driver/zarr3/dtype.cc | 298 +++++++++++++ tensorstore/driver/zarr3/dtype.h | 144 ++++++ tensorstore/driver/zarr3/dtype_test.cc | 293 ++++++++++++ tensorstore/driver/zarr3/metadata.cc | 514 ++++++++++++++++------ tensorstore/driver/zarr3/metadata.h | 51 ++- tensorstore/driver/zarr3/metadata_test.cc | 45 +- 8 files changed, 1251 insertions(+), 156 deletions(-) create mode 100644 tensorstore/driver/zarr3/dtype.cc create mode 100644 tensorstore/driver/zarr3/dtype.h create mode 100644 tensorstore/driver/zarr3/dtype_test.cc diff --git a/tensorstore/driver/zarr3/BUILD b/tensorstore/driver/zarr3/BUILD index 6e0613d5b..d67f58935 100644 --- a/tensorstore/driver/zarr3/BUILD +++ b/tensorstore/driver/zarr3/BUILD @@ -94,8 +94,8 @@ tensorstore_cc_library( tensorstore_cc_library( name = "metadata", - srcs = ["metadata.cc"], - hdrs = ["metadata.h"], + srcs = ["metadata.cc", "dtype.cc"], + hdrs = ["metadata.h", "dtype.h"], deps = [ ":default_nan", ":name_configuration_json_binder", @@ -145,6 +145,23 @@ tensorstore_cc_library( ], ) +tensorstore_cc_test( + name = "dtype_test", + size = "small", + srcs = ["dtype_test.cc"], + deps = [ + ":metadata", + "//tensorstore:data_type", + "//tensorstore:index", + "//tensorstore/internal/testing:json_gtest", + "//tensorstore/util:status_testutil", + "//tensorstore/util:str_cat", + "@abseil-cpp//absl/status", + "@googletest//:gtest_main", + "@nlohmann_json//:json", + ], +) + tensorstore_cc_test( name = "driver_test", size = "small", diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index a516c1a7b..15faced0a 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc 
@@ -121,8 +121,19 @@ class ZarrDriverSpec "metadata", jb::Validate( [](const auto& options, auto* obj) { - TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( - obj->metadata_constraints.data_type.value_or(DataType()))); + if (obj->metadata_constraints.data_type) { + if (auto dtype = GetScalarDataType( + *obj->metadata_constraints.data_type)) { + TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set(*dtype)); + } else if (obj->schema.dtype().valid()) { + return absl::InvalidArgumentError( + "schema dtype must be unspecified for structured " + "zarr3 data types"); + } else { + // Leave dtype unspecified; structured dtypes are handled + // at metadata level only. + } + } TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); @@ -146,8 +157,8 @@ class ZarrDriverSpec SharedArray fill_value{schema.fill_value()}; const auto& metadata = metadata_constraints; - if (metadata.fill_value) { - fill_value = *metadata.fill_value; + if (metadata.fill_value && !metadata.fill_value->empty()) { + fill_value = (*metadata.fill_value)[0]; } return fill_value; @@ -274,8 +285,10 @@ class DataCacheBase static internal::ChunkGridSpecification GetChunkGridSpecification( const ZarrMetadata& metadata) { - auto fill_value = - BroadcastArray(metadata.fill_value, BoxView<>(metadata.rank)).value(); + assert(!metadata.fill_value.empty()); + auto fill_value = BroadcastArray(metadata.fill_value[0], + BoxView<>(metadata.rank)) + .value(); internal::ChunkGridSpecification::ComponentList components; auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ @@ -402,9 +415,16 @@ class DataCacheBase const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); ChunkLayout chunk_layout; + SpecRankAndFieldInfo info; + info.chunked_rank = metadata.rank; + if (!metadata.data_type.fields.empty()) { + info.field = &metadata.data_type.fields[0]; + } + std::optional> chunk_shape_span; + 
chunk_shape_span.emplace(metadata.chunk_shape.data(), + metadata.chunk_shape.size()); TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( - metadata.data_type, metadata.rank, metadata.chunk_shape, - &metadata.codec_specs, chunk_layout)); + info, chunk_shape_span, &metadata.codec_specs, chunk_layout)); TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Finalize()); return chunk_layout; } @@ -470,7 +490,10 @@ class ZarrDriver : public ZarrDriverBase { Result> GetFillValue( IndexTransformView<> transform) override { const auto& metadata = this->metadata(); - return metadata.fill_value; + if (metadata.fill_value.empty()) { + return SharedArray(); + } + return metadata.fill_value[0]; } Future GetStorageStatistics( diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc new file mode 100644 index 000000000..8d1c9d49e --- /dev/null +++ b/tensorstore/driver/zarr3/dtype.cc @@ -0,0 +1,298 @@ +// Copyright 2020 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "tensorstore/driver/zarr3/dtype.h" + +#include + +#include + +#include "absl/base/optimization.h" +#include "tensorstore/data_type.h" +#include "tensorstore/internal/json_binding/json_binding.h" +#include "tensorstore/util/endian.h" +#include "tensorstore/util/extents.h" +#include "tensorstore/util/quote_string.h" +#include "tensorstore/util/str_cat.h" + +namespace tensorstore { +namespace internal_zarr3 { + +Result ParseBaseDType(std::string_view dtype) { + using D = ZarrDType::BaseDType; + const auto make_dtype = [&](DataType result_dtype) -> Result { + return D{std::string(dtype), result_dtype, {}}; + }; + + if (dtype == "bool") return make_dtype(dtype_v); + if (dtype == "uint8") return make_dtype(dtype_v); + if (dtype == "uint16") return make_dtype(dtype_v); + if (dtype == "uint32") return make_dtype(dtype_v); + if (dtype == "uint64") return make_dtype(dtype_v); + if (dtype == "int8") return make_dtype(dtype_v); + if (dtype == "int16") return make_dtype(dtype_v); + if (dtype == "int32") return make_dtype(dtype_v); + if (dtype == "int64") return make_dtype(dtype_v); + if (dtype == "bfloat16") + return make_dtype(dtype_v<::tensorstore::dtypes::bfloat16_t>); + if (dtype == "float16") + return make_dtype(dtype_v<::tensorstore::dtypes::float16_t>); + if (dtype == "float32") + return make_dtype(dtype_v<::tensorstore::dtypes::float32_t>); + if (dtype == "float64") + return make_dtype(dtype_v<::tensorstore::dtypes::float64_t>); + if (dtype == "complex64") + return make_dtype(dtype_v<::tensorstore::dtypes::complex64_t>); + if (dtype == "complex128") + return make_dtype(dtype_v<::tensorstore::dtypes::complex128_t>); + + constexpr std::string_view kSupported = + "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " + "bfloat16, float16, float32, float64, complex64, complex128"; + return absl::InvalidArgumentError( + tensorstore::StrCat(dtype, " data type is not one of the supported " + "data types: ", + kSupported)); +} + +namespace { + +/// 
Parses a zarr metadata "dtype" JSON specification, but does not compute any
+/// derived values, and does not check for duplicate field names.
+///
+/// This is called by `ParseDType`.
+///
+/// \param value The zarr metadata "dtype" JSON specification.
+/// \param[out] out Must be non-null. Filled with the parsed dtype on success.
+/// \error `absl::StatusCode::kInvalidArgument` if `value` is invalid.
+Result<ZarrDType> ParseDTypeNoDerived(const nlohmann::json& value) {
+  ZarrDType out;
+  if (value.is_string()) {
+    // Single field.
+    out.has_fields = false;
+    out.fields.resize(1);
+    TENSORSTORE_ASSIGN_OR_RETURN(
+        static_cast<ZarrDType::BaseDType&>(out.fields[0]),
+        ParseBaseDType(value.get<std::string>()));
+    return out;
+  }
+  out.has_fields = true;
+  auto parse_result = internal_json::JsonParseArray(
+      value,
+      [&](ptrdiff_t size) {
+        out.fields.resize(size);
+        return absl::OkStatus();
+      },
+      [&](const ::nlohmann::json& x, ptrdiff_t field_i) {
+        auto& field = out.fields[field_i];
+        return internal_json::JsonParseArray(
+            x,
+            [&](ptrdiff_t size) {
+              if (size < 2 || size > 3) {
+                return absl::InvalidArgumentError(tensorstore::StrCat(
+                    "Expected array of size 2 or 3, but received: ", x.dump()));
+              }
+              return absl::OkStatus();
+            },
+            [&](const ::nlohmann::json& v, ptrdiff_t i) {
+              switch (i) {
+                case 0:
+                  if (internal_json::JsonRequireValueAs(v, &field.name).ok()) {
+                    if (!field.name.empty()) return absl::OkStatus();
+                  }
+                  return absl::InvalidArgumentError(tensorstore::StrCat(
+                      "Expected non-empty string, but received: ", v.dump()));
+                case 1: {
+                  std::string dtype_string;
+                  TENSORSTORE_RETURN_IF_ERROR(
+                      internal_json::JsonRequireValueAs(v, &dtype_string));
+                  TENSORSTORE_ASSIGN_OR_RETURN(
+                      static_cast<ZarrDType::BaseDType&>(field),
+                      ParseBaseDType(dtype_string));
+                  return absl::OkStatus();
+                }
+                case 2: {
+                  return internal_json::JsonParseArray(
+                      v,
+                      [&](ptrdiff_t size) {
+                        field.outer_shape.resize(size);
+                        return absl::OkStatus();
+                      },
+                      [&](const ::nlohmann::json& x, ptrdiff_t j) {
+                        return internal_json::JsonRequireInteger(
+                              x, &field.outer_shape[j], /*strict=*/true, 1,
+                              kInfIndex);
+                        });
+                }
+                default:
+                  ABSL_UNREACHABLE();  // COV_NF_LINE
+              }
+            });
+      });
+  if (!parse_result.ok()) return parse_result;
+  return out;
+}
+
+}  // namespace
+
+absl::Status ValidateDType(ZarrDType& dtype) {
+  dtype.bytes_per_outer_element = 0;
+  for (size_t field_i = 0; field_i < dtype.fields.size(); ++field_i) {
+    auto& field = dtype.fields[field_i];
+    if (std::any_of(
+            dtype.fields.begin(), dtype.fields.begin() + field_i,
+            [&](const ZarrDType::Field& f) { return f.name == field.name; })) {
+      return absl::InvalidArgumentError(tensorstore::StrCat(
+          "Field name ", QuoteString(field.name), " occurs more than once"));
+    }
+    field.field_shape.resize(field.flexible_shape.size() +
+                             field.outer_shape.size());
+    std::copy(field.flexible_shape.begin(), field.flexible_shape.end(),
+              std::copy(field.outer_shape.begin(), field.outer_shape.end(),
+                        field.field_shape.begin()));
+
+    field.num_inner_elements = ProductOfExtents(span(field.field_shape));
+    if (field.num_inner_elements == std::numeric_limits<Index>::max()) {
+      return absl::InvalidArgumentError(tensorstore::StrCat(
+          "Product of dimensions ", span(field.field_shape), " is too large"));
+    }
+    if (internal::MulOverflow(field.num_inner_elements,
+                              static_cast<Index>(field.dtype->size),
+                              &field.num_bytes)) {
+      return absl::InvalidArgumentError("Field size in bytes is too large");
+    }
+    field.byte_offset = dtype.bytes_per_outer_element;
+    if (internal::AddOverflow(dtype.bytes_per_outer_element, field.num_bytes,
+                              &dtype.bytes_per_outer_element)) {
+      return absl::InvalidArgumentError(
+          "Total number of bytes per outer array element is too large");
+    }
+  }
+  return absl::OkStatus();
+}
+
+std::optional<DataType> GetScalarDataType(const ZarrDType& dtype) {
+  if (!dtype.has_fields && !dtype.fields.empty()) {
+    return dtype.fields[0].dtype;
+  }
+  return std::nullopt;
+}
+
+Result<ZarrDType> ParseDType(const nlohmann::json& value) {
+  TENSORSTORE_ASSIGN_OR_RETURN(ZarrDType dtype,
ParseDTypeNoDerived(value)); + TENSORSTORE_RETURN_IF_ERROR(ValidateDType(dtype)); + return dtype; +} + +bool operator==(const ZarrDType::BaseDType& a, + const ZarrDType::BaseDType& b) { + return a.encoded_dtype == b.encoded_dtype && a.dtype == b.dtype && + a.flexible_shape == b.flexible_shape; +} + +bool operator!=(const ZarrDType::BaseDType& a, + const ZarrDType::BaseDType& b) { + return !(a == b); +} + +bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b) { + return static_cast(a) == + static_cast(b) && + a.outer_shape == b.outer_shape && a.name == b.name && + a.field_shape == b.field_shape && + a.num_inner_elements == b.num_inner_elements && + a.byte_offset == b.byte_offset && a.num_bytes == b.num_bytes; +} + +bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b) { + return !(a == b); +} + +bool operator==(const ZarrDType& a, const ZarrDType& b) { + return a.has_fields == b.has_fields && + a.bytes_per_outer_element == b.bytes_per_outer_element && + a.fields == b.fields; +} + +bool operator!=(const ZarrDType& a, const ZarrDType& b) { return !(a == b); } + +void to_json(::nlohmann::json& out, const ZarrDType::Field& field) { + using array_t = ::nlohmann::json::array_t; + if (field.outer_shape.empty()) { + out = array_t{field.name, field.encoded_dtype}; + } else { + out = array_t{field.name, field.encoded_dtype, field.outer_shape}; + } +} + +void to_json(::nlohmann::json& out, // NOLINT + const ZarrDType& dtype) { + if (!dtype.has_fields) { + out = dtype.fields[0].encoded_dtype; + } else { + out = dtype.fields; + } +} + +TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(ZarrDType, [](auto is_loading, + const auto& options, + auto* obj, auto* j) { + if constexpr (is_loading) { + TENSORSTORE_ASSIGN_OR_RETURN(*obj, ParseDType(*j)); + } else { + to_json(*j, *obj); + } + return absl::OkStatus(); +}) + +namespace { + +Result MakeBaseDType(std::string_view name, + DataType dtype) { + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype; + 
base_dtype.encoded_dtype = std::string(name); + return base_dtype; +} + +} // namespace + +Result ChooseBaseDType(DataType dtype) { + if (dtype == dtype_v) return MakeBaseDType("bool", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint8", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint16", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint32", dtype); + if (dtype == dtype_v) return MakeBaseDType("uint64", dtype); + if (dtype == dtype_v) return MakeBaseDType("int8", dtype); + if (dtype == dtype_v) return MakeBaseDType("int16", dtype); + if (dtype == dtype_v) return MakeBaseDType("int32", dtype); + if (dtype == dtype_v) return MakeBaseDType("int64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::bfloat16_t>) + return MakeBaseDType("bfloat16", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float16_t>) + return MakeBaseDType("float16", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float32_t>) + return MakeBaseDType("float32", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::float64_t>) + return MakeBaseDType("float64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::complex64_t>) + return MakeBaseDType("complex64", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) + return MakeBaseDType("complex128", dtype); + return absl::InvalidArgumentError( + tensorstore::StrCat("Data type not supported: ", dtype)); +} + +} // namespace internal_zarr3 +} // namespace tensorstore diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h new file mode 100644 index 000000000..430dd8849 --- /dev/null +++ b/tensorstore/driver/zarr3/dtype.h @@ -0,0 +1,144 @@ +// Copyright 2020 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ +#define TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ + +/// \file +/// Support for encoding/decoding zarr "dtype" specifications. +/// See: https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html#data-type + +#include +#include +#include "tensorstore/data_type.h" +#include "tensorstore/internal/json_binding/bindable.h" +#include "tensorstore/util/endian.h" +#include "tensorstore/util/result.h" + +namespace tensorstore { +namespace internal_zarr3 { + +/// Decoded representation of a zarr "dtype" specification. +/// +/// A zarr "dtype" is a JSON value that is either: +/// +/// 1. A string, which specifies a single data type (e.g. "int32"). +/// In this case, the zarr array is considered to have a single, unnamed field. +/// +/// 2. An array, where each element of the array is of the form: +/// `[name, type]` or `[name, type, shape]`, where `name` is a JSON +/// string specifying the unique, non-empty field name, `type` is a data type +/// string, and `shape` is an optional "inner" array shape (specified +/// as a JSON array of non-negative integers) which defaults to the rank-0 +/// shape `[]` if not specified. +/// +/// Each field is encoded according to `type` into a fixed-size sequence of +/// bytes. If the optional "inner" array `shape` is specified, the individual +/// elements are encoded in C order. The encoding of each multi-field array +/// element is simply the concatenation of the encodings of each field. +struct ZarrDType { + /// Decoded representation of single value. 
+ struct BaseDType { + /// Data type string. + std::string encoded_dtype; + + /// Corresponding DataType used for in-memory representation. + DataType dtype; + + /// For "flexible" data types that are themselves arrays, this specifies the + /// shape. For regular data types, this is empty. + std::vector flexible_shape; + }; + + /// Decoded representation of a single field. + struct Field : public BaseDType { + /// Optional `shape` dimensions specified by a zarr "dtype" field specified + /// as a JSON array. If the zarr dtype was specified as a single `typestr` + /// value, or as a two-element array, this is empty. + std::vector outer_shape; + + /// Field name. Must be non-empty and unique if the zarr "dtype" was + /// specified as an array. Otherwise, is empty. + std::string name; + + /// The inner array dimensions of this field, equal to the concatenation of + /// `outer_shape` and `flexible_shape` (derived value). + std::vector field_shape; + + /// Product of `field_shape` dimensions (derived value). + Index num_inner_elements; + + /// Byte offset of this field within an "outer" element (derived value). + Index byte_offset; + + /// Number of bytes occupied by this field within an "outer" element + /// (derived value). + Index num_bytes; + }; + + /// Equal to `true` if the zarr "dtype" was specified as an array, in which + /// case all fields must have a unique, non-empty `name`. If `false`, there + /// must be a single field with an empty `name`. + bool has_fields; + + /// Decoded representation of the fields. + std::vector fields; + + /// Bytes per "outer" element (derived value). 
+  Index bytes_per_outer_element;
+
+  TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrDType,
+                                          internal_json_binding::NoOptions)
+
+  friend void to_json(::nlohmann::json& out,  // NOLINT
+                      const ZarrDType& dtype);
+};
+
+bool operator==(const ZarrDType::BaseDType& a,
+                const ZarrDType::BaseDType& b);
+bool operator!=(const ZarrDType::BaseDType& a,
+                const ZarrDType::BaseDType& b);
+bool operator==(const ZarrDType::Field& a, const ZarrDType::Field& b);
+bool operator!=(const ZarrDType::Field& a, const ZarrDType::Field& b);
+bool operator==(const ZarrDType& a, const ZarrDType& b);
+bool operator!=(const ZarrDType& a, const ZarrDType& b);
+
+/// Parses a zarr metadata "dtype" JSON specification.
+///
+/// \error `absl::StatusCode::kInvalidArgument` if `value` is not valid.
+Result<ZarrDType> ParseDType(const ::nlohmann::json& value);
+
+/// Validates `dtype` and computes derived values.
+///
+/// \error `absl::StatusCode::kInvalidArgument` if two fields have the same
+///     name.
+/// \error `absl::StatusCode::kInvalidArgument` if the field size is too large.
+absl::Status ValidateDType(ZarrDType& dtype);
+
+/// Returns the underlying TensorStore `DataType` if `dtype` represents an
+/// unstructured scalar array, otherwise `std::nullopt`.
+std::optional<DataType> GetScalarDataType(const ZarrDType& dtype);
+
+/// Parses a Zarr 3 data type string.
+///
+/// \error `absl::StatusCode::kInvalidArgument` if `dtype` is not valid.
+Result<ZarrDType::BaseDType> ParseBaseDType(std::string_view dtype);
+
+/// Chooses a zarr data type corresponding to `dtype`.
+ Result ChooseBaseDType(DataType dtype); + +} // namespace internal_zarr3 +} // namespace tensorstore + +#endif // TENSORSTORE_DRIVER_ZARR3_DTYPE_H_ diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc new file mode 100644 index 000000000..cbb7acbfb --- /dev/null +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -0,0 +1,293 @@ +// Copyright 2023 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "tensorstore/driver/zarr3/dtype.h" + +#include +#include + +#include // for std::byte +#include +#include + +#include +#include +#include "absl/status/status.h" +#include +#include "tensorstore/data_type.h" +#include "tensorstore/index.h" +#include "tensorstore/internal/testing/json_gtest.h" +#include "tensorstore/util/status_testutil.h" +#include "tensorstore/util/str_cat.h" + +namespace { + +using ::tensorstore::DataType; +using ::tensorstore::dtype_v; +using ::tensorstore::Index; +using ::tensorstore::kInfIndex; +using ::tensorstore::StatusIs; +using ::tensorstore::internal_zarr3::ChooseBaseDType; +using ::tensorstore::internal_zarr3::ParseBaseDType; +using ::tensorstore::internal_zarr3::ParseDType; +using ::tensorstore::internal_zarr3::ZarrDType; +using ::testing::HasSubstr; +using ::testing::MatchesRegex; + +void CheckBaseDType(std::string dtype, DataType r, + std::vector flexible_shape) { + EXPECT_THAT(ParseBaseDType(dtype), ::testing::Optional(ZarrDType::BaseDType{ + dtype, r, 
flexible_shape})) + << dtype; +} + +TEST(ParseBaseDType, Success) { + CheckBaseDType("bool", dtype_v, {}); + CheckBaseDType("int8", dtype_v, {}); + CheckBaseDType("uint8", dtype_v, {}); + CheckBaseDType("int16", dtype_v, {}); + CheckBaseDType("uint16", dtype_v, {}); + CheckBaseDType("int32", dtype_v, {}); + CheckBaseDType("uint32", dtype_v, {}); + CheckBaseDType("int64", dtype_v, {}); + CheckBaseDType("uint64", dtype_v, {}); + CheckBaseDType("float16", dtype_v, {}); + CheckBaseDType("bfloat16", dtype_v, {}); + CheckBaseDType("float32", dtype_v, {}); + CheckBaseDType("float64", dtype_v, {}); + CheckBaseDType("complex64", dtype_v, {}); + CheckBaseDType("complex128", dtype_v, {}); +} + +TEST(ParseBaseDType, Failure) { + EXPECT_THAT( + ParseBaseDType(""), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is not one of the supported data types"))); + EXPECT_THAT(ParseBaseDType("float"), + StatusIs(absl::StatusCode::kInvalidArgument)); + EXPECT_THAT(ParseBaseDType("string"), + StatusIs(absl::StatusCode::kInvalidArgument)); + EXPECT_THAT(ParseBaseDType(", + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{}, + /*.num_inner_elements=*/1, + /*.byte_offset=*/0, + /*.num_bytes=*/1}, + }, + /*.bytes_per_outer_element=*/1, + }); +} + +TEST(ParseDType, SingleNamedFieldChar) { + // Zarr 3 doesn't support fixed size strings natively in core, so we use uint8 for testing bytes + CheckDType(::nlohmann::json::array_t{{"x", "uint8"}}, + ZarrDType{ + /*.has_fields=*/true, + /*.fields=*/ + { + {{ + /*.encoded_dtype=*/"uint8", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{}, + /*.name=*/"x", + /*.field_shape=*/{}, + /*.num_inner_elements=*/1, + /*.byte_offset=*/0, + /*.num_bytes=*/1}, + }, + /*.bytes_per_outer_element=*/1, + }); +} + +TEST(ParseDType, TwoNamedFields) { + CheckDType( + ::nlohmann::json::array_t{{"x", "int8", {2, 3}}, {"y", "int16", {5}}}, + ZarrDType{ + /*.has_fields=*/true, + 
/*.fields=*/ + { + {{ + /*.encoded_dtype=*/"int8", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{2, 3}, + /*.name=*/"x", + /*.field_shape=*/{2, 3}, + /*.num_inner_elements=*/2 * 3, + /*.byte_offset=*/0, + /*.num_bytes=*/1 * 2 * 3}, + {{ + /*.encoded_dtype=*/"int16", + /*.dtype=*/dtype_v, + /*.flexible_shape=*/{}, + }, + /*.outer_shape=*/{5}, + /*.name=*/"y", + /*.field_shape=*/{5}, + /*.num_inner_elements=*/5, + /*.byte_offset=*/1 * 2 * 3, + /*.num_bytes=*/2 * 5}, + }, + /*.bytes_per_outer_element=*/1 * 2 * 3 + 2 * 5, + }); +} + +TEST(ParseDType, FieldSpecTooShort) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x"}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Expected array of size 2 or 3, but received: [\"x\"]"))); +} + +TEST(ParseDType, FieldSpecTooLong) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16", {2, 3}, 5}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Expected array of size 2 or 3, but received: " + "[\"x\",\"int16\",[2,3],5]"))); +} + +TEST(ParseDType, InvalidFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{3, "int16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 0: " + "Expected non-empty string, but received: 3"))); +} + +TEST(ParseDType, EmptyFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"", "int16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 0: " + "Expected non-empty string, but received: \"\""))); +} + +TEST(ParseDType, DuplicateFieldName) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16"}, {"x", "uint16"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Field name \"x\" occurs more than once"))); +} + +TEST(ParseDType, 
NonStringFieldBaseDType) { + EXPECT_THAT(ParseDType(::nlohmann::json::array_t{{"x", 3}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 1: " + "Expected string, but received: 3"))); +} + +TEST(ParseDType, InvalidFieldBaseDType) { + EXPECT_THAT(ParseDType(::nlohmann::json::array_t{{"x", "unknown"}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Error parsing value at position 0: " + "Error parsing value at position 1: " + "unknown data type is not one of the " + "supported data types"))); +} + +TEST(ParseDType, ProductOfDimensionsOverflow) { + EXPECT_THAT( + ParseDType( + ::nlohmann::json::array_t{{"x", "int8", {kInfIndex, kInfIndex}}}), + StatusIs(absl::StatusCode::kInvalidArgument, + MatchesRegex(".*Product of dimensions .* is too large.*"))); +} + +TEST(ParseDType, FieldSizeInBytesOverflow) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "float64", {kInfIndex}}}), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Field size in bytes is too large"))); +} + +TEST(ParseDType, BytesPerOuterElementOverflow) { + EXPECT_THAT( + ParseDType(::nlohmann::json::array_t{{"x", "int16", {kInfIndex}}, + {"y", "int16", {kInfIndex}}}), + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr( + "Total number of bytes per outer array element is too large"))); +} + +TEST(ChooseBaseDTypeTest, RoundTrip) { + constexpr tensorstore::DataType kSupportedDataTypes[] = { + dtype_v, dtype_v, dtype_v, dtype_v, + dtype_v, dtype_v, dtype_v, + dtype_v, dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + dtype_v, + }; + for (auto dtype : kSupportedDataTypes) { + SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto base_zarr_dtype, + ChooseBaseDType(dtype)); + EXPECT_EQ(dtype, base_zarr_dtype.dtype); + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto parsed, ParseBaseDType(base_zarr_dtype.encoded_dtype)); + EXPECT_EQ(dtype, 
parsed.dtype); + EXPECT_EQ(base_zarr_dtype.flexible_shape, parsed.flexible_shape); + EXPECT_EQ(base_zarr_dtype.encoded_dtype, parsed.encoded_dtype); + } +} + +TEST(ChooseBaseDTypeTest, Invalid) { + struct X {}; + EXPECT_THAT(ChooseBaseDType(dtype_v), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Data type not supported"))); + EXPECT_THAT(ChooseBaseDType(dtype_v<::tensorstore::dtypes::string_t>), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("Data type not supported: string"))); +} + +} // namespace diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 528d373ae..c96c31426 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -50,6 +50,7 @@ #include "tensorstore/driver/zarr3/codec/codec_spec.h" #include "tensorstore/driver/zarr3/codec/sharding_indexed.h" #include "tensorstore/driver/zarr3/default_nan.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/driver/zarr3/name_configuration_json_binder.h" #include "tensorstore/index.h" #include "tensorstore/index_space/dimension_units.h" @@ -252,24 +253,110 @@ constexpr std::array } // namespace -absl::Status FillValueJsonBinder::operator()(std::true_type is_loading, - internal_json_binding::NoOptions, - SharedArray* obj, - ::nlohmann::json* j) const { +FillValueJsonBinder::FillValueJsonBinder(ZarrDType dtype, + bool allow_missing_dtype) + : dtype(std::move(dtype)), allow_missing_dtype(allow_missing_dtype) {} + +FillValueJsonBinder::FillValueJsonBinder(DataType data_type, + bool allow_missing_dtype) + : allow_missing_dtype(allow_missing_dtype) { + dtype.has_fields = false; + dtype.fields.resize(1); + auto& field = dtype.fields[0]; + field.name.clear(); + field.outer_shape.clear(); + field.flexible_shape.clear(); + field.field_shape.clear(); + field.num_inner_elements = 1; + field.byte_offset = 0; + field.num_bytes = data_type->size; + field.dtype = data_type; + field.encoded_dtype = 
std::string(data_type.name()); +} + +absl::Status FillValueJsonBinder::operator()( + std::true_type is_loading, internal_json_binding::NoOptions, + std::vector>* obj, ::nlohmann::json* j) const { + obj->resize(dtype.fields.size()); + if (dtype.fields.size() == 1) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + } else { + if (!j->is_array()) { + return internal_json::ExpectedError(*j, "array"); + } + if (j->size() != dtype.fields.size()) { + return internal_json::ExpectedError( + *j, tensorstore::StrCat("array of size ", dtype.fields.size())); + } + for (size_t i = 0; i < dtype.fields.size(); ++i) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + } + } + return absl::OkStatus(); +} + +absl::Status FillValueJsonBinder::operator()( + std::false_type is_loading, internal_json_binding::NoOptions, + const std::vector>* obj, + ::nlohmann::json* j) const { + if (dtype.fields.size() == 1) { + return EncodeSingle((*obj)[0], dtype.fields[0].dtype, *j); + } + // Structured fill value + *j = ::nlohmann::json::array(); + for (size_t i = 0; i < dtype.fields.size(); ++i) { + ::nlohmann::json item; + TENSORSTORE_RETURN_IF_ERROR( + EncodeSingle((*obj)[i], dtype.fields[i].dtype, item)); + j->push_back(std::move(item)); + } + return absl::OkStatus(); +} + +absl::Status FillValueJsonBinder::DecodeSingle(::nlohmann::json& j, + DataType data_type, + SharedArray& out) const { + if (!data_type.valid()) { + if (allow_missing_dtype) { + out = SharedArray(); + return absl::OkStatus(); + } + return absl::InvalidArgumentError( + "data_type must be specified before fill_value"); + } auto arr = AllocateArray(span{}, c_order, default_init, data_type); void* data = arr.data(); - *obj = std::move(arr); - return kFillValueDataTypeFunctions[static_cast(data_type.id())] - .decode(data, *j); + out = std::move(arr); + const auto& functions = + kFillValueDataTypeFunctions[static_cast(data_type.id())]; + if 
(!functions.decode) { + if (allow_missing_dtype) { + out = SharedArray(); + return absl::OkStatus(); + } + return absl::FailedPreconditionError( + "fill_value unsupported for specified data_type"); + } + return functions.decode(data, j); } -absl::Status FillValueJsonBinder::operator()(std::false_type is_loading, - internal_json_binding::NoOptions, - const SharedArray* obj, - ::nlohmann::json* j) const { - return kFillValueDataTypeFunctions[static_cast(data_type.id())] - .encode(obj->data(), *j); +absl::Status FillValueJsonBinder::EncodeSingle( + const SharedArray& arr, DataType data_type, + ::nlohmann::json& j) const { + if (!data_type.valid()) { + return absl::InvalidArgumentError( + "data_type must be specified before fill_value"); + } + const auto& functions = + kFillValueDataTypeFunctions[static_cast(data_type.id())]; + if (!functions.encode) { + return absl::FailedPreconditionError( + "fill_value unsupported for specified data_type"); + } + return functions.encode(arr.data(), j); } TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER(ChunkKeyEncoding, [](auto is_loading, @@ -357,7 +444,7 @@ constexpr auto MetadataJsonBinder = [] { rank = &obj->rank; } - auto ensure_data_type = [&]() -> Result { + auto ensure_data_type = [&]() -> Result { if constexpr (std::is_same_v) { return obj->data_type; } @@ -378,19 +465,18 @@ constexpr auto MetadataJsonBinder = [] { maybe_optional_member("node_type", jb::Constant([] { return "array"; })), jb::Member("data_type", - jb::Projection<&Self::data_type>(maybe_optional(jb::Validate( - [](const auto& options, auto* obj) { - return ValidateDataType(*obj); - }, - jb::DataTypeJsonBinder)))), + jb::Projection<&Self::data_type>(maybe_optional( + jb::DefaultBinder<>))), jb::Member( "fill_value", jb::Projection<&Self::fill_value>(maybe_optional( [&](auto is_loading, const auto& options, auto* obj, auto* j) { TENSORSTORE_ASSIGN_OR_RETURN(auto data_type, ensure_data_type()); - return FillValueJsonBinder{data_type}(is_loading, options, - obj, j); + 
constexpr bool allow_missing_dtype = + std::is_same_v; + return FillValueJsonBinder{data_type, allow_missing_dtype}( + is_loading, options, obj, j); }))), non_compatibility_field( jb::Member("shape", jb::Projection<&Self::shape>( @@ -477,9 +563,28 @@ std::string ZarrMetadata::GetCompatibilityKey() const { absl::Status ValidateMetadata(ZarrMetadata& metadata) { if (!metadata.codecs) { ArrayCodecResolveParameters decoded; - decoded.dtype = metadata.data_type; + if (metadata.data_type.fields.size() == 1 && + metadata.data_type.fields[0].outer_shape.empty()) { + decoded.dtype = metadata.data_type.fields[0].dtype; + } else { + decoded.dtype = dtype_v; + // TODO: Verify this works for structured types. + // Zarr2 uses a "scalar" array concept with byte storage for chunks. + } decoded.rank = metadata.rank; - decoded.fill_value = metadata.fill_value; + // Fill value for codec resolve might be complex. + // Zarr3 codecs usually don't depend on fill value except for some like + // "sharding_indexed"? Sharding uses fill_value for missing chunks. + if (metadata.fill_value.size() == 1) { + decoded.fill_value = metadata.fill_value[0]; + } else { + // How to represent structured fill value for codec? + // Sharding expects a single array. + // If we use structured type, the "array" is bytes. + // We might need to encode the fill value to bytes. + // For now, leave empty if multiple fields. + } + BytesCodecResolveParameters encoded; TENSORSTORE_ASSIGN_OR_RETURN( metadata.codecs, @@ -488,7 +593,14 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { // Get codec chunk layout info. ArrayDataTypeAndShapeInfo array_info; - array_info.dtype = metadata.data_type; + // array_info.dtype used here to validate codec compatibility. 
+ if (metadata.data_type.fields.size() == 1 && + metadata.data_type.fields[0].outer_shape.empty()) { + array_info.dtype = metadata.data_type.fields[0].dtype; + } else { + array_info.dtype = dtype_v; + } + array_info.rank = metadata.rank; std::copy_n(metadata.chunk_shape.begin(), metadata.rank, array_info.shape.emplace().begin()); @@ -512,17 +624,34 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { absl::Status ValidateMetadata(const ZarrMetadata& metadata, const ZarrMetadataConstraints& constraints) { using internal::MetadataMismatchError; - if (constraints.data_type && *constraints.data_type != metadata.data_type) { - return MetadataMismatchError("data_type", constraints.data_type->name(), - metadata.data_type.name()); - } - if (constraints.fill_value && - !AreArraysIdenticallyEqual(*constraints.fill_value, - metadata.fill_value)) { - auto binder = FillValueJsonBinder{metadata.data_type}; - auto constraint_json = jb::ToJson(*constraints.fill_value, binder).value(); - auto metadata_json = jb::ToJson(metadata.fill_value, binder).value(); - return MetadataMismatchError("fill_value", constraint_json, metadata_json); + if (constraints.data_type) { + // Compare ZarrDType + if (::nlohmann::json(*constraints.data_type) != + ::nlohmann::json(metadata.data_type)) { + return MetadataMismatchError( + "data_type", ::nlohmann::json(*constraints.data_type).dump(), + ::nlohmann::json(metadata.data_type).dump()); + } + } + if (constraints.fill_value) { + // Compare vector of arrays + if (constraints.fill_value->size() != metadata.fill_value.size()) { + return MetadataMismatchError("fill_value size", + constraints.fill_value->size(), + metadata.fill_value.size()); + } + for (size_t i = 0; i < metadata.fill_value.size(); ++i) { + if (!AreArraysIdenticallyEqual((*constraints.fill_value)[i], + metadata.fill_value[i])) { + auto binder = FillValueJsonBinder{metadata.data_type}; + auto constraint_json = + jb::ToJson(*constraints.fill_value, binder).value(); + auto metadata_json 
= + jb::ToJson(metadata.fill_value, binder).value(); + return MetadataMismatchError("fill_value", constraint_json, + metadata_json); + } + } } if (constraints.shape && *constraints.shape != metadata.shape) { return MetadataMismatchError("shape", *constraints.shape, metadata.shape); @@ -574,23 +703,64 @@ absl::Status ValidateMetadata(const ZarrMetadata& metadata, metadata.unknown_extension_attributes); } +namespace { +std::string GetFieldNames(const ZarrDType& dtype) { + std::vector field_names; + for (const auto& field : dtype.fields) { + field_names.push_back(field.name); + } + return ::nlohmann::json(field_names).dump(); +} +} // namespace + +Result GetFieldIndex(const ZarrDType& dtype, + std::string_view selected_field) { + if (selected_field.empty()) { + if (dtype.fields.size() != 1) { + return absl::FailedPreconditionError(tensorstore::StrCat( + "Must specify a \"field\" that is one of: ", GetFieldNames(dtype))); + } + return 0; + } + if (!dtype.has_fields) { + return absl::FailedPreconditionError( + tensorstore::StrCat("Requested field ", QuoteString(selected_field), + " but dtype does not have named fields")); + } + for (size_t field_index = 0; field_index < dtype.fields.size(); + ++field_index) { + if (dtype.fields[field_index].name == selected_field) return field_index; + } + return absl::FailedPreconditionError( + tensorstore::StrCat("Requested field ", QuoteString(selected_field), + " is not one of: ", GetFieldNames(dtype))); +} + +SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, + size_t field_index) { + SpecRankAndFieldInfo info; + info.chunked_rank = metadata.rank; + info.field = &metadata.data_type.fields[field_index]; + return info; +} + Result> GetEffectiveDomain( - DimensionIndex rank, std::optional> shape, + const SpecRankAndFieldInfo& info, + std::optional> metadata_shape, std::optional>> dimension_names, - const Schema& schema, bool* dimension_names_used = nullptr) { + const Schema& schema, bool* dimension_names_used) 
{ + const DimensionIndex rank = info.chunked_rank; if (dimension_names_used) *dimension_names_used = false; auto domain = schema.domain(); - if (!shape && !dimension_names && !domain.valid()) { + if (!metadata_shape && !dimension_names && !domain.valid()) { if (schema.rank() == 0) return {std::in_place, 0}; - // No information about the domain available. return {std::in_place}; } - // Rank is already validated by caller. assert(RankConstraint::EqualOrUnspecified(schema.rank(), rank)); IndexDomainBuilder builder(std::max(schema.rank().rank, rank)); - if (shape) { - builder.shape(*shape); + if (metadata_shape) { + builder.shape(*metadata_shape); builder.implicit_upper_bounds(true); } else { builder.origin(GetConstantVector(builder.rank())); @@ -602,12 +772,12 @@ Result> GetEffectiveDomain( normalized_dimension_names[i] = *name; } } - // Use dimension_names as labels if they are valid. - if (internal::ValidateDimensionLabelsAreUnique(normalized_dimension_names) + if (internal::ValidateDimensionLabelsAreUnique( + span(&normalized_dimension_names[0], rank)) .ok()) { - if (dimension_names_used) *dimension_names_used = true; builder.labels( span(&normalized_dimension_names[0], rank)); + if (dimension_names_used) *dimension_names_used = true; } } @@ -618,36 +788,53 @@ Result> GetEffectiveDomain( tensorstore::MaybeAnnotateStatus( _, "Mismatch between metadata and schema"))); return WithImplicitDimensions(domain, false, true); - return domain; } Result> GetEffectiveDomain( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, bool* dimension_names_used) { - return GetEffectiveDomain( - metadata_constraints.rank, metadata_constraints.shape, - metadata_constraints.dimension_names, schema, dimension_names_used); + SpecRankAndFieldInfo info; + info.chunked_rank = metadata_constraints.rank; + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + + std::optional> shape_span; + if 
(metadata_constraints.shape) { + shape_span.emplace(metadata_constraints.shape->data(), + metadata_constraints.shape->size()); + } + std::optional>> names_span; + if (metadata_constraints.dimension_names) { + names_span.emplace(metadata_constraints.dimension_names->data(), + metadata_constraints.dimension_names->size()); + } + + return GetEffectiveDomain(info, shape_span, names_span, schema, + dimension_names_used); } absl::Status SetChunkLayoutFromMetadata( - DataType dtype, DimensionIndex rank, + const SpecRankAndFieldInfo& info, std::optional> chunk_shape, const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout) { - TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set(RankConstraint{rank})); - rank = chunk_layout.rank(); - if (rank == dynamic_rank) return absl::OkStatus(); + const DimensionIndex rank = info.chunked_rank; + if (rank == dynamic_rank) { + return absl::OkStatus(); + } + TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set(RankConstraint(rank))); + TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set( + ChunkLayout::GridOrigin(GetConstantVector(rank)))); if (chunk_shape) { assert(chunk_shape->size() == rank); TENSORSTORE_RETURN_IF_ERROR( chunk_layout.Set(ChunkLayout::WriteChunkShape(*chunk_shape))); } - TENSORSTORE_RETURN_IF_ERROR(chunk_layout.Set( - ChunkLayout::GridOrigin(GetConstantVector(rank)))); if (codecs) { ArrayDataTypeAndShapeInfo array_info; - array_info.dtype = dtype; + array_info.dtype = info.field ? 
info.field->dtype : dtype_v; array_info.rank = rank; if (chunk_shape) { std::copy_n(chunk_shape->begin(), rank, @@ -669,30 +856,47 @@ absl::Status SetChunkLayoutFromMetadata( span(layout_info.codec_chunk_shape->data(), rank)))); } } + return absl::OkStatus(); } -Result GetEffectiveChunkLayout( +absl::Status SetChunkLayoutFromMetadata( DataType dtype, DimensionIndex rank, std::optional> chunk_shape, - const ZarrCodecChainSpec* codecs, const Schema& schema) { - auto chunk_layout = schema.chunk_layout(); - TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( - dtype, rank, chunk_shape, codecs, chunk_layout)); - return chunk_layout; + const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout) { + SpecRankAndFieldInfo info; + info.chunked_rank = rank; + info.field = nullptr; + return SetChunkLayoutFromMetadata(info, chunk_shape, codecs, chunk_layout); } Result GetEffectiveChunkLayout( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema) { - assert(RankConstraint::EqualOrUnspecified(metadata_constraints.rank, - schema.rank())); - return GetEffectiveChunkLayout( - metadata_constraints.data_type.value_or(DataType{}), - std::max(metadata_constraints.rank, schema.rank().rank), - metadata_constraints.chunk_shape, + // Approximation: assume whole array access or simple array + SpecRankAndFieldInfo info; + info.chunked_rank = std::max(metadata_constraints.rank, schema.rank().rank); + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + if (info.chunked_rank == dynamic_rank && metadata_constraints.chunk_shape) { + info.chunked_rank = metadata_constraints.chunk_shape->size(); + } + // We can't easily know field info from constraints unless we parse data_type. + // If data_type is present and has 1 field, we can check it. + // For now, basic implementation. 
+ + ChunkLayout chunk_layout = schema.chunk_layout(); + std::optional> chunk_shape_span; + if (metadata_constraints.chunk_shape) { + chunk_shape_span.emplace(metadata_constraints.chunk_shape->data(), + metadata_constraints.chunk_shape->size()); + } + TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( + info, chunk_shape_span, metadata_constraints.codec_specs ? &*metadata_constraints.codec_specs : nullptr, - schema); + chunk_layout)); + return chunk_layout; } Result GetDimensionUnits( @@ -732,53 +936,63 @@ CodecSpec GetCodecFromMetadata(const ZarrMetadata& metadata) { } absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, - const Schema& schema) { - if (!RankConstraint::EqualOrUnspecified(metadata.rank, schema.rank())) { + size_t field_index, const Schema& schema) { + auto info = GetSpecRankAndFieldInfo(metadata, field_index); + const auto& field = metadata.data_type.fields[field_index]; + + if (!RankConstraint::EqualOrUnspecified(schema.rank(), info.chunked_rank)) { return absl::FailedPreconditionError(tensorstore::StrCat( "Rank specified by schema (", schema.rank(), - ") does not match rank specified by metadata (", metadata.rank, ")")); + ") does not match rank specified by metadata (", info.chunked_rank, + ")")); } if (schema.domain().valid()) { + std::optional> metadata_shape_span; + metadata_shape_span.emplace(metadata.shape.data(), metadata.shape.size()); + std::optional>> dimension_names_span; + dimension_names_span.emplace(metadata.dimension_names.data(), + metadata.dimension_names.size()); TENSORSTORE_RETURN_IF_ERROR(GetEffectiveDomain( - metadata.rank, metadata.shape, metadata.dimension_names, schema)); + info, metadata_shape_span, dimension_names_span, schema, + /*dimension_names_used=*/nullptr)); } if (auto dtype = schema.dtype(); - !IsPossiblySameDataType(metadata.data_type, dtype)) { + !IsPossiblySameDataType(field.dtype, dtype)) { return absl::FailedPreconditionError( - tensorstore::StrCat("data_type from metadata (", 
metadata.data_type, + tensorstore::StrCat("data_type from metadata (", field.dtype, ") does not match dtype in schema (", dtype, ")")); } if (schema.chunk_layout().rank() != dynamic_rank) { - TENSORSTORE_ASSIGN_OR_RETURN( - auto chunk_layout, - GetEffectiveChunkLayout(metadata.data_type, metadata.rank, - metadata.chunk_shape, &metadata.codec_specs, - schema)); + ChunkLayout chunk_layout = schema.chunk_layout(); + std::optional> chunk_shape_span; + chunk_shape_span.emplace(metadata.chunk_shape.data(), + metadata.chunk_shape.size()); + TENSORSTORE_RETURN_IF_ERROR(SetChunkLayoutFromMetadata( + info, chunk_shape_span, &metadata.codec_specs, chunk_layout)); if (chunk_layout.codec_chunk_shape().hard_constraint) { return absl::InvalidArgumentError("codec_chunk_shape not supported"); } } if (auto schema_fill_value = schema.fill_value(); schema_fill_value.valid()) { - const auto& fill_value = metadata.fill_value; + const auto& fill_value = metadata.fill_value[field_index]; TENSORSTORE_ASSIGN_OR_RETURN( auto broadcast_fill_value, tensorstore::BroadcastArray(schema_fill_value, span{})); TENSORSTORE_ASSIGN_OR_RETURN( SharedArray converted_fill_value, tensorstore::MakeCopy(std::move(broadcast_fill_value), - skip_repeated_elements, metadata.data_type)); + skip_repeated_elements, field.dtype)); if (!AreArraysIdenticallyEqual(converted_fill_value, fill_value)) { auto binder = FillValueJsonBinder{metadata.data_type}; - auto schema_json = jb::ToJson(converted_fill_value, binder).value(); - auto metadata_json = jb::ToJson(metadata.fill_value, binder).value(); + // Error message generation might be tricky with binder return absl::FailedPreconditionError(tensorstore::StrCat( "Invalid fill_value: schema requires fill value of ", - schema_json.dump(), ", but metadata specifies fill value of ", - metadata_json.dump())); + schema_fill_value, ", but metadata specifies fill value of ", + fill_value)); } } @@ -804,8 +1018,14 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, 
return absl::OkStatus(); } +absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, + const Schema& schema) { + return ValidateMetadataSchema(metadata, /*field_index=*/0, schema); +} + Result> GetNewMetadata( - const ZarrMetadataConstraints& metadata_constraints, const Schema& schema) { + const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, + std::string_view selected_field) { auto metadata = std::make_shared(); metadata->zarr_format = metadata_constraints.zarr_format.value_or(3); @@ -813,51 +1033,85 @@ Result> GetNewMetadata( metadata_constraints.chunk_key_encoding.value_or(ChunkKeyEncoding{ /*.kind=*/ChunkKeyEncoding::kDefault, /*.separator=*/'/'}); + // Determine data type first + if (metadata_constraints.data_type) { + metadata->data_type = *metadata_constraints.data_type; + } else if (!selected_field.empty()) { + return absl::InvalidArgumentError( + "\"dtype\" must be specified in \"metadata\" if \"field\" is " + "specified"); + } else if (auto dtype = schema.dtype(); dtype.valid()) { + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast( + metadata->data_type.fields.emplace_back()), + ChooseBaseDType(dtype)); + metadata->data_type.has_fields = false; + TENSORSTORE_RETURN_IF_ERROR(ValidateDType(metadata->data_type)); + } else { + return absl::InvalidArgumentError("dtype must be specified"); + } + + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, GetFieldIndex(metadata->data_type, selected_field)); + SpecRankAndFieldInfo info; + info.field = &metadata->data_type.fields[field_index]; + info.chunked_rank = metadata_constraints.rank; + if (info.chunked_rank == dynamic_rank && metadata_constraints.shape) { + info.chunked_rank = metadata_constraints.shape->size(); + } + if (info.chunked_rank == dynamic_rank && + schema.rank().rank != dynamic_rank) { + info.chunked_rank = schema.rank().rank; + } + // Set domain - bool dimension_names_used; + bool dimension_names_used = false; + std::optional> constraint_shape_span; + if 
(metadata_constraints.shape) { + constraint_shape_span.emplace(metadata_constraints.shape->data(), + metadata_constraints.shape->size()); + } + std::optional>> constraint_names_span; + if (metadata_constraints.dimension_names) { + constraint_names_span.emplace( + metadata_constraints.dimension_names->data(), + metadata_constraints.dimension_names->size()); + } TENSORSTORE_ASSIGN_OR_RETURN( - auto domain, - GetEffectiveDomain(metadata_constraints, schema, &dimension_names_used)); + auto domain, GetEffectiveDomain(info, constraint_shape_span, + constraint_names_span, schema, + &dimension_names_used)); if (!domain.valid() || !IsFinite(domain.box())) { return absl::InvalidArgumentError("domain must be specified"); } - const DimensionIndex rank = metadata->rank = domain.rank(); - metadata->shape.assign(domain.shape().begin(), domain.shape().end()); + const DimensionIndex rank = domain.rank(); + metadata->rank = rank; + info.chunked_rank = rank; + metadata->shape.assign(domain.shape().begin(), + domain.shape().begin() + rank); metadata->dimension_names.assign(domain.labels().begin(), - domain.labels().end()); - // Normalize empty string dimension names to `std::nullopt`. This is more - // consistent with the zarr v3 dimension name semantics, and ensures that the - // `dimension_names` metadata field will be excluded entirely if all dimension - // names are the empty string. - // - // However, if empty string dimension names were specified explicitly in - // `metadata_constraints`, leave them exactly as specified. + domain.labels().begin() + rank); + for (DimensionIndex i = 0; i < rank; ++i) { auto& name = metadata->dimension_names[i]; if (!name || !name->empty()) continue; - // Dimension name equals the empty string. - if (dimension_names_used && (*metadata_constraints.dimension_names)[i]) { - // Empty dimension name was explicitly specified in - // `metadata_constraints`, leave it as is. 
+ if (dimension_names_used && metadata_constraints.dimension_names && + (*metadata_constraints.dimension_names)[i]) { assert((*metadata_constraints.dimension_names)[i]->empty()); continue; } - // Name was not explicitly specified in `metadata_constraints` as an empty - // string. Normalize it to `std::nullopt`. name = std::nullopt; } - // Set dtype - auto dtype = schema.dtype(); - if (!dtype.valid()) { - return absl::InvalidArgumentError("dtype must be specified"); - } - TENSORSTORE_RETURN_IF_ERROR(ValidateDataType(dtype)); - metadata->data_type = dtype; - if (metadata_constraints.fill_value) { metadata->fill_value = *metadata_constraints.fill_value; } else if (auto fill_value = schema.fill_value(); fill_value.valid()) { + // Assuming single field if setting from schema + if (metadata->data_type.fields.size() != 1) { + return absl::InvalidArgumentError( + "Cannot specify fill_value through schema for structured zarr data " + "type"); + } const auto status = [&] { TENSORSTORE_ASSIGN_OR_RETURN( auto broadcast_fill_value, @@ -865,23 +1119,26 @@ Result> GetNewMetadata( TENSORSTORE_ASSIGN_OR_RETURN( auto converted_fill_value, tensorstore::MakeCopy(std::move(broadcast_fill_value), - skip_repeated_elements, metadata->data_type)); - metadata->fill_value = std::move(converted_fill_value); + skip_repeated_elements, + metadata->data_type.fields[0].dtype)); + metadata->fill_value.push_back(std::move(converted_fill_value)); return absl::OkStatus(); }(); TENSORSTORE_RETURN_IF_ERROR( status, tensorstore::MaybeAnnotateStatus(_, "Invalid fill_value")); } else { - metadata->fill_value = tensorstore::AllocateArray( - /*shape=*/span(), c_order, value_init, - metadata->data_type); + metadata->fill_value.resize(metadata->data_type.fields.size()); + for (size_t i = 0; i < metadata->fill_value.size(); ++i) { + metadata->fill_value[i] = tensorstore::AllocateArray( + /*shape=*/span(), c_order, value_init, + metadata->data_type.fields[i].dtype); + } } metadata->user_attributes = 
metadata_constraints.user_attributes; metadata->unknown_extension_attributes = metadata_constraints.unknown_extension_attributes; - // Set dimension units TENSORSTORE_ASSIGN_OR_RETURN( auto dimension_units, GetEffectiveDimensionUnits(rank, metadata_constraints.dimension_units, @@ -895,12 +1152,16 @@ Result> GetNewMetadata( TENSORSTORE_ASSIGN_OR_RETURN(auto codec_spec, GetEffectiveCodec(metadata_constraints, schema)); - // Set chunk shape - ArrayCodecResolveParameters decoded; - decoded.dtype = metadata->data_type; + if (metadata->data_type.fields.size() == 1 && + metadata->data_type.fields[0].outer_shape.empty()) { + decoded.dtype = metadata->data_type.fields[0].dtype; + } else { + decoded.dtype = dtype_v; + } decoded.rank = metadata->rank; - decoded.fill_value = metadata->fill_value; + if (metadata->fill_value.size() == 1) + decoded.fill_value = metadata->fill_value[0]; TENSORSTORE_ASSIGN_OR_RETURN( auto chunk_layout, GetEffectiveChunkLayout(metadata_constraints, schema)); @@ -920,8 +1181,6 @@ Result> GetNewMetadata( if (!internal::RangesEqual(span(metadata->chunk_shape), span(read_chunk_shape))) { - // Read chunk and write chunk shapes differ. Insert sharding codec if there - // is not already one. if (!codec_spec->codecs || codec_spec->codecs->sharding_height() == 0) { auto sharding_codec = internal::MakeIntrusivePtr( @@ -945,7 +1204,8 @@ Result> GetNewMetadata( TENSORSTORE_RETURN_IF_ERROR(set_up_codecs( codec_spec->codecs ? 
*codec_spec->codecs : ZarrCodecChainSpec{})); TENSORSTORE_RETURN_IF_ERROR(ValidateMetadata(*metadata)); - TENSORSTORE_RETURN_IF_ERROR(ValidateMetadataSchema(*metadata, schema)); + TENSORSTORE_RETURN_IF_ERROR( + ValidateMetadataSchema(*metadata, field_index, schema)); return metadata; } diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index 05b8c6be3..4c7871b0d 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -33,6 +33,7 @@ #include "tensorstore/data_type.h" #include "tensorstore/driver/zarr3/codec/codec.h" #include "tensorstore/driver/zarr3/codec/codec_chain_spec.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/index.h" #include "tensorstore/index_space/dimension_units.h" #include "tensorstore/index_space/index_domain.h" @@ -72,19 +73,35 @@ struct ChunkKeyEncoding { }; struct FillValueJsonBinder { - DataType data_type; + ZarrDType dtype; + bool allow_missing_dtype = false; + FillValueJsonBinder() = default; + explicit FillValueJsonBinder(ZarrDType dtype, + bool allow_missing_dtype = false); + explicit FillValueJsonBinder(DataType dtype, + bool allow_missing_dtype = false); absl::Status operator()(std::true_type is_loading, internal_json_binding::NoOptions, - SharedArray* obj, + std::vector>* obj, ::nlohmann::json* j) const; absl::Status operator()(std::false_type is_loading, internal_json_binding::NoOptions, - const SharedArray* obj, + const std::vector>* obj, ::nlohmann::json* j) const; + + private: + absl::Status DecodeSingle(::nlohmann::json& j, DataType data_type, + SharedArray& out) const; + absl::Status EncodeSingle(const SharedArray& arr, + DataType data_type, + ::nlohmann::json& j) const; }; +struct SpecRankAndFieldInfo; + + struct ZarrMetadata { // The following members are common to `ZarrMetadata` and // `ZarrMetadataConstraints`, except that in `ZarrMetadataConstraints` some @@ -94,14 +111,14 @@ struct ZarrMetadata { int zarr_format; std::vector shape; 
- DataType data_type; + ZarrDType data_type; ::nlohmann::json::object_t user_attributes; std::optional dimension_units; std::vector> dimension_names; ChunkKeyEncoding chunk_key_encoding; std::vector chunk_shape; ZarrCodecChainSpec codec_specs; - SharedArray fill_value; + std::vector> fill_value; ::nlohmann::json::object_t unknown_extension_attributes; std::string GetCompatibilityKey() const; @@ -123,14 +140,14 @@ struct ZarrMetadataConstraints { std::optional zarr_format; std::optional> shape; - std::optional data_type; + std::optional data_type; ::nlohmann::json::object_t user_attributes; std::optional dimension_units; std::optional>> dimension_names; std::optional chunk_key_encoding; std::optional> chunk_shape; std::optional codec_specs; - std::optional> fill_value; + std::optional>> fill_value; ::nlohmann::json::object_t unknown_extension_attributes; TENSORSTORE_DECLARE_JSON_DEFAULT_BINDER(ZarrMetadataConstraints, @@ -159,6 +176,10 @@ Result> GetEffectiveDomain( /// Sets chunk layout constraints implied by `dtype`, `rank`, `chunk_shape`, and /// `codecs`. +absl::Status SetChunkLayoutFromMetadata( + const SpecRankAndFieldInfo& info, + std::optional> chunk_shape, + const ZarrCodecChainSpec* codecs, ChunkLayout& chunk_layout); absl::Status SetChunkLayoutFromMetadata( DataType dtype, DimensionIndex rank, std::optional> chunk_shape, @@ -198,6 +219,8 @@ Result> GetEffectiveCodec( CodecSpec GetCodecFromMetadata(const ZarrMetadata& metadata); /// Validates that `schema` is compatible with `metadata`. +absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, + size_t field_index, const Schema& schema); absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, const Schema& schema); @@ -206,10 +229,22 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, /// \error `absl::StatusCode::kInvalidArgument` if any required fields are /// unspecified. 
Result> GetNewMetadata( - const ZarrMetadataConstraints& metadata_constraints, const Schema& schema); + const ZarrMetadataConstraints& metadata_constraints, + const Schema& schema, std::string_view selected_field = {}); absl::Status ValidateDataType(DataType dtype); +Result GetFieldIndex(const ZarrDType& dtype, + std::string_view selected_field); + +struct SpecRankAndFieldInfo { + DimensionIndex chunked_rank = dynamic_rank; + const ZarrDType::Field* field = nullptr; +}; + +SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, + size_t field_index); + } // namespace internal_zarr3 } // namespace tensorstore diff --git a/tensorstore/driver/zarr3/metadata_test.cc b/tensorstore/driver/zarr3/metadata_test.cc index 0b140fa80..11c97619f 100644 --- a/tensorstore/driver/zarr3/metadata_test.cc +++ b/tensorstore/driver/zarr3/metadata_test.cc @@ -51,6 +51,7 @@ namespace { namespace jb = ::tensorstore::internal_json_binding; using ::tensorstore::ChunkLayout; +using ::tensorstore::DataType; using ::tensorstore::CodecSpec; using ::tensorstore::dtype_v; using ::tensorstore::Index; @@ -68,6 +69,7 @@ using ::tensorstore::dtypes::float32_t; using ::tensorstore::dtypes::float64_t; using ::tensorstore::internal::uint_t; using ::tensorstore::internal_zarr3::FillValueJsonBinder; +using ::tensorstore::internal_zarr3::ZarrDType; using ::tensorstore::internal_zarr3::ZarrMetadata; using ::tensorstore::internal_zarr3::ZarrMetadataConstraints; using ::testing::HasSubstr; @@ -90,13 +92,30 @@ ::nlohmann::json GetBasicMetadata() { }; } +ZarrDType MakeScalarZarrDType(DataType dtype) { + ZarrDType dtype_info; + dtype_info.has_fields = false; + dtype_info.fields.resize(1); + auto& field = dtype_info.fields[0]; + field.dtype = dtype; + field.encoded_dtype = std::string(dtype.name()); + field.outer_shape.clear(); + field.flexible_shape.clear(); + field.field_shape.clear(); + field.num_inner_elements = 1; + field.byte_offset = 0; + field.num_bytes = dtype->size; + return 
dtype_info; +} + TEST(MetadataTest, ParseValid) { auto json = GetBasicMetadata(); tensorstore::TestJsonBinderRoundTripJsonOnly({json}); TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); EXPECT_THAT(metadata.shape, ::testing::ElementsAre(10, 11, 12)); EXPECT_THAT(metadata.chunk_shape, ::testing::ElementsAre(1, 2, 3)); - EXPECT_THAT(metadata.data_type, tensorstore::dtype_v); + ASSERT_EQ(metadata.data_type.fields.size(), 1); + EXPECT_EQ(tensorstore::dtype_v, metadata.data_type.fields[0].dtype); EXPECT_THAT(metadata.dimension_names, ::testing::ElementsAre("a", std::nullopt, "")); EXPECT_THAT(metadata.user_attributes, MatchesJson({{"a", "b"}, {"c", "d"}})); @@ -115,7 +134,8 @@ TEST(MetadataTest, ParseValidNoDimensionNames) { TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); EXPECT_THAT(metadata.shape, ::testing::ElementsAre(10, 11, 12)); EXPECT_THAT(metadata.chunk_shape, ::testing::ElementsAre(1, 2, 3)); - EXPECT_THAT(metadata.data_type, tensorstore::dtype_v); + ASSERT_EQ(metadata.data_type.fields.size(), 1); + EXPECT_EQ(tensorstore::dtype_v, metadata.data_type.fields[0].dtype); EXPECT_THAT(metadata.dimension_names, ::testing::ElementsAre(std::nullopt, std::nullopt, std::nullopt)); EXPECT_THAT(metadata.user_attributes, MatchesJson({{"a", "b"}, {"c", "d"}})); @@ -486,7 +506,9 @@ TEST(MetadataTest, DataTypes) { } TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto metadata, ZarrMetadata::FromJson(json)); - EXPECT_EQ(tensorstore::GetDataType(data_type_name), metadata.data_type); + ASSERT_FALSE(metadata.data_type.fields.empty()); + EXPECT_EQ(tensorstore::GetDataType(data_type_name), + metadata.data_type.fields[0].dtype); } } @@ -503,18 +525,20 @@ TEST(MetadataTest, InvalidDataType) { template void TestFillValue(std::vector> cases, bool skip_to_json = false) { - auto binder = FillValueJsonBinder{dtype_v}; + FillValueJsonBinder binder(MakeScalarZarrDType(dtype_v)); for (const auto& [value, json] : cases) { SharedArray 
expected_fill_value = tensorstore::MakeScalarArray(value); if (!skip_to_json) { - EXPECT_THAT(jb::ToJson(expected_fill_value, binder), + std::vector> vec{expected_fill_value}; + EXPECT_THAT(jb::ToJson(vec, binder), ::testing::Optional(MatchesJson(json))) << "value=" << value << ", json=" << json; } - EXPECT_THAT(jb::FromJson>(json, binder), - ::testing::Optional( - tensorstore::MatchesArrayIdentically(expected_fill_value))) + EXPECT_THAT( + jb::FromJson>>(json, binder), + ::testing::Optional(::testing::ElementsAre( + tensorstore::MatchesArrayIdentically(expected_fill_value)))) << "json=" << json; } } @@ -522,10 +546,11 @@ void TestFillValue(std::vector> cases, template void TestFillValueInvalid( std::vector> cases) { - auto binder = FillValueJsonBinder{dtype_v}; + FillValueJsonBinder binder(MakeScalarZarrDType(dtype_v)); for (const auto& [json, matcher] : cases) { EXPECT_THAT( - jb::FromJson>(json, binder).status(), + jb::FromJson>>(json, binder) + .status(), StatusIs(absl::StatusCode::kInvalidArgument, MatchesRegex(matcher))) << "json=" << json; } From 187f42452a359bca712a64050176b93e5ce9b145 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 22:57:11 +0000 Subject: [PATCH 02/20] Updates to have proper reads --- tensorstore/driver/zarr3/chunk_cache.cc | 74 ++++++++++++++---- tensorstore/driver/zarr3/chunk_cache.h | 11 ++- tensorstore/driver/zarr3/driver.cc | 74 ++++++++++++------ tensorstore/driver/zarr3/dtype.cc | 64 +++++++++++---- tensorstore/driver/zarr3/metadata.cc | 100 ++++++++++++++++-------- 5 files changed, 239 insertions(+), 84 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index ee1cba9c1..6bfa8c039 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include #include @@ -73,15 +75,17 @@ ZarrChunkCache::~ZarrChunkCache() = default; 
ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - internal::CachePool::WeakPtr /*data_cache_pool*/) - : Base(std::move(store)), codec_state_(std::move(codec_state)) {} + ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/) + : Base(std::move(store)), + codec_state_(std::move(codec_state)), + dtype_(std::move(dtype)) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver>&& receiver) { return internal::ChunkCache::Read( {static_cast(request), - /*component_index=*/0, request.staleness_bound, + request.component_index, request.staleness_bound, request.fill_missing_data_reads}, std::move(receiver)); } @@ -92,7 +96,7 @@ void ZarrLeafChunkCache::Write( receiver) { return internal::ChunkCache::Write( {static_cast(request), - /*component_index=*/0, request.store_data_equal_to_fill_value}, + request.component_index, request.store_data_equal_to_fill_value}, std::move(receiver)); } @@ -149,12 +153,52 @@ std::string ZarrLeafChunkCache::GetChunkStorageKey( Result, 1>> ZarrLeafChunkCache::DecodeChunk(span chunk_indices, absl::Cord data) { + const size_t num_fields = dtype_.fields.size(); + absl::InlinedVector, 1> field_arrays(num_fields); + + + // For single non-structured field, decode directly + if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) { + TENSORSTORE_ASSIGN_OR_RETURN( + field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), + std::move(data))); + return field_arrays; + } + + // For structured types, decode byte array then extract fields + // Build decode shape: [chunk_dims..., bytes_per_outer_element] + const auto& chunk_shape = grid().chunk_shape; + std::vector decode_shape(chunk_shape.begin(), chunk_shape.end()); + decode_shape.push_back(dtype_.bytes_per_outer_element); + TENSORSTORE_ASSIGN_OR_RETURN( - auto array, - codec_state_->DecodeArray(grid().components[0].shape(), std::move(data))); - absl::InlinedVector, 1> 
components; - components.push_back(std::move(array)); - return components; + auto byte_array, codec_state_->DecodeArray(decode_shape, std::move(data))); + + // Extract each field from the byte array + const Index num_elements = byte_array.num_elements() / + dtype_.bytes_per_outer_element; + const auto* src_bytes = static_cast(byte_array.data()); + + for (size_t field_i = 0; field_i < num_fields; ++field_i) { + const auto& field = dtype_.fields[field_i]; + // Use the component's shape (from the grid) for the result array + const auto& component_shape = grid().components[field_i].shape(); + auto result_array = + AllocateArray(component_shape, c_order, default_init, field.dtype); + auto* dst = static_cast(result_array.data()); + const Index field_size = field.dtype->size; + + // Copy field data from each struct element + for (Index i = 0; i < num_elements; ++i) { + std::memcpy(dst + i * field_size, + src_bytes + i * dtype_.bytes_per_outer_element + + field.byte_offset, + field_size); + } + field_arrays[field_i] = std::move(result_array); + } + + return field_arrays; } Result ZarrLeafChunkCache::EncodeChunk( @@ -170,9 +214,10 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), + dtype_(std::move(dtype)), data_cache_pool_(std::move(data_cache_pool)) {} Result> TranslateCellToSourceTransformForShard( @@ -326,6 +371,7 @@ void ZarrShardedChunkCache::Read( *this, std::move(request.transform), std::move(receiver), [transaction = std::move(request.transaction), batch = std::move(request.batch), + component_index = request.component_index, staleness_bound = request.staleness_bound, fill_missing_data_reads = request.fill_missing_data_reads](auto entry) { Batch 
shard_batch = batch; @@ -339,8 +385,7 @@ void ZarrShardedChunkCache::Read( IndexTransform<>>&& receiver) { entry->sub_chunk_cache.get()->Read( {{transaction, std::move(transform), shard_batch}, - staleness_bound, - fill_missing_data_reads}, + component_index, staleness_bound, fill_missing_data_reads}, std::move(receiver)); }; }); @@ -354,6 +399,7 @@ void ZarrShardedChunkCache::Write( &ZarrArrayToArrayCodec::PreparedState::Write>( *this, std::move(request.transform), std::move(receiver), [transaction = std::move(request.transaction), + component_index = request.component_index, store_data_equal_to_fill_value = request.store_data_equal_to_fill_value](auto entry) { internal::OpenTransactionPtr shard_transaction = transaction; @@ -366,7 +412,7 @@ void ZarrShardedChunkCache::Write( AnyFlowReceiver>&& receiver) { entry->sub_chunk_cache.get()->Write( - {{shard_transaction, std::move(transform)}, + {{shard_transaction, std::move(transform)}, component_index, store_data_equal_to_fill_value}, std::move(receiver)); }; @@ -481,7 +527,7 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { *sharding_state.sub_chunk_codec_chain, std::move(sharding_kvstore), cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), - cache.data_cache_pool_); + cache.dtype_, cache.data_cache_pool_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index dd40e43ac..5933115d7 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -31,6 +31,7 @@ #include "tensorstore/driver/read_request.h" #include "tensorstore/driver/write_request.h" #include "tensorstore/driver/zarr3/codec/codec.h" +#include "tensorstore/driver/zarr3/dtype.h" #include "tensorstore/index.h" #include "tensorstore/index_space/index_transform.h" #include "tensorstore/internal/cache/cache.h" @@ -72,6 +73,7 @@ class ZarrChunkCache { virtual 
const Executor& executor() const = 0; struct ReadRequest : internal::DriverReadRequest { + size_t component_index = 0; absl::Time staleness_bound; bool fill_missing_data_reads; }; @@ -81,6 +83,7 @@ class ZarrChunkCache { IndexTransform<>>&& receiver) = 0; struct WriteRequest : internal::DriverWriteRequest { + size_t component_index = 0; bool store_data_equal_to_fill_value; }; @@ -154,6 +157,7 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, explicit ZarrLeafChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool); void Read(ZarrChunkCache::ReadRequest request, @@ -181,6 +185,7 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, kvstore::Driver* GetKvStoreDriver() override; ZarrCodecChain::PreparedState::Ptr codec_state_; + ZarrDType dtype_; }; /// Chunk cache for a Zarr array where each chunk is a shard. @@ -190,6 +195,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { public: explicit ZarrShardedChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool); const ZarrShardingCodec::PreparedState& sharding_codec_state() const { @@ -239,6 +245,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { kvstore::DriverPtr base_kvstore_; ZarrCodecChain::PreparedState::Ptr codec_state_; + ZarrDType dtype_; // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). 
@@ -253,11 +260,11 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { explicit ZarrShardSubChunkCache( kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, - internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), - std::move(data_cache_pool)), + std::move(dtype), std::move(data_cache_pool)), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 15faced0a..1674a1c6d 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -103,9 +103,11 @@ class ZarrDriverSpec /*Parent=*/KvsDriverSpec>; ZarrMetadataConstraints metadata_constraints; + std::string selected_field; constexpr static auto ApplyMembers = [](auto& x, auto f) { - return f(internal::BaseCast(x), x.metadata_constraints); + return f(internal::BaseCast(x), x.metadata_constraints, + x.selected_field); }; static inline const auto default_json_binder = jb::Sequence( @@ -139,7 +141,10 @@ class ZarrDriverSpec return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( - jb::DefaultInitializedValue())))); + jb::DefaultInitializedValue()))), + jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( + jb::DefaultValue( + [](auto* obj) { *obj = std::string{}; })))); absl::Status ApplyOptions(SpecOptions&& options) override { if (options.minimal_spec) { @@ -286,21 +291,33 @@ class DataCacheBase static internal::ChunkGridSpecification GetChunkGridSpecification( const ZarrMetadata& metadata) { assert(!metadata.fill_value.empty()); - auto fill_value = BroadcastArray(metadata.fill_value[0], - BoxView<>(metadata.rank)) - .value(); internal::ChunkGridSpecification::ComponentList components; - auto& component = 
components.emplace_back( - internal::AsyncWriteArray::Spec{ - std::move(fill_value), - // Since all dimensions are resizable, just - // specify unbounded `valid_data_bounds`. - Box<>(metadata.rank), - ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank))}, - metadata.chunk_shape); - component.array_spec.fill_value_comparison_kind = - EqualityComparisonKind::identical; + + // Create one component per field (like zarr v2) + for (size_t field_i = 0; field_i < metadata.data_type.fields.size(); + ++field_i) { + const auto& field = metadata.data_type.fields[field_i]; + auto fill_value = metadata.fill_value[field_i]; + if (!fill_value.valid()) { + // Use value-initialized rank-0 fill value (like zarr v2) + fill_value = AllocateArray(span{}, c_order, value_init, + field.dtype); + } + auto chunk_fill_value = + BroadcastArray(fill_value, BoxView<>(metadata.rank)).value(); + + auto& component = components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), + // Since all dimensions are resizable, just + // specify unbounded `valid_data_bounds`. 
+ Box<>(metadata.rank), + ContiguousLayoutPermutation<>( + span(metadata.inner_order.data(), metadata.rank))}, + metadata.chunk_shape); + component.array_spec.fill_value_comparison_kind = + EqualityComparisonKind::identical; + } return internal::ChunkGridSpecification(std::move(components)); } @@ -381,7 +398,7 @@ class DataCacheBase Result> GetExternalToInternalTransform( const void* metadata_ptr, size_t component_index) override { - assert(component_index == 0); + // component_index corresponds to the selected field index const auto& metadata = *static_cast(metadata_ptr); const DimensionIndex rank = metadata.rank; std::string_view normalized_dimension_names[kMaxRank]; @@ -404,10 +421,16 @@ class DataCacheBase absl::Status GetBoundSpecData(KvsDriverSpec& spec_base, const void* metadata_ptr, size_t component_index) override { - assert(component_index == 0); auto& spec = static_cast(spec_base); const auto& metadata = *static_cast(metadata_ptr); spec.metadata_constraints = ZarrMetadataConstraints(metadata); + // Encode selected_field from component_index + if (metadata.data_type.has_fields && + component_index < metadata.data_type.fields.size()) { + spec.selected_field = metadata.data_type.fields[component_index].name; + } else { + spec.selected_field.clear(); + } return absl::OkStatus(); } @@ -513,7 +536,8 @@ class ZarrDriver : public ZarrDriverBase { AnyFlowReceiver> receiver) override { return cache()->zarr_chunk_cache().Read( - {std::move(request), GetCurrentDataStalenessBound(), + {std::move(request), this->component_index(), + GetCurrentDataStalenessBound(), this->fill_value_mode_.fill_missing_data_reads}, std::move(receiver)); } @@ -523,7 +547,7 @@ class ZarrDriver : public ZarrDriverBase { AnyFlowReceiver> receiver) override { return cache()->zarr_chunk_cache().Write( - {std::move(request), + {std::move(request), this->component_index(), this->fill_value_mode_.store_data_equal_to_fill_value}, std::move(receiver)); } @@ -621,7 +645,8 @@ class 
ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { *static_cast(initializer.metadata.get()); return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, - metadata.codec_state, /*data_cache_pool=*/*cache_pool()); + metadata.codec_state, metadata.data_type, + /*data_cache_pool=*/*cache_pool()); } Result GetComponentIndex(const void* metadata_ptr, @@ -629,9 +654,12 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { const auto& metadata = *static_cast(metadata_ptr); TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().metadata_constraints)); + TENSORSTORE_ASSIGN_OR_RETURN( + auto field_index, + GetFieldIndex(metadata.data_type, spec().selected_field)); TENSORSTORE_RETURN_IF_ERROR( - ValidateMetadataSchema(metadata, spec().schema)); - return 0; + ValidateMetadataSchema(metadata, field_index, spec().schema)); + return field_index; } }; diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 8d1c9d49e..281b9c98b 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -76,20 +76,12 @@ namespace { /// \param value The zarr metadata "dtype" JSON specification. /// \param out[out] Must be non-null. Filled with the parsed dtype on success. /// \error `absl::StatusCode::kInvalidArgument' if `value` is invalid. -Result ParseDTypeNoDerived(const nlohmann::json& value) { - ZarrDType out; - if (value.is_string()) { - // Single field. 
- out.has_fields = false; - out.fields.resize(1); - TENSORSTORE_ASSIGN_OR_RETURN( - static_cast(out.fields[0]), - ParseBaseDType(value.get())); - return out; - } +// Helper to parse fields array (used by both array format and object format) +absl::Status ParseFieldsArray(const nlohmann::json& fields_json, + ZarrDType& out) { out.has_fields = true; - auto parse_result = internal_json::JsonParseArray( - value, + return internal_json::JsonParseArray( + fields_json, [&](ptrdiff_t size) { out.fields.resize(size); return absl::OkStatus(); @@ -140,7 +132,51 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { } }); }); - if (!parse_result.ok()) return parse_result; +} + +Result ParseDTypeNoDerived(const nlohmann::json& value) { + ZarrDType out; + if (value.is_string()) { + // Single field. + out.has_fields = false; + out.fields.resize(1); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(out.fields[0]), + ParseBaseDType(value.get())); + return out; + } + // Handle extended object format: + // {"name": "structured", "configuration": {"fields": [...]}} + if (value.is_object()) { + if (value.contains("name") && value.contains("configuration")) { + std::string type_name; + TENSORSTORE_RETURN_IF_ERROR( + internal_json::JsonRequireValueAs(value["name"], &type_name)); + if (type_name == "structured") { + const auto& config = value["configuration"]; + if (!config.is_object() || !config.contains("fields")) { + return absl::InvalidArgumentError( + "Structured data type requires 'configuration' object with " + "'fields' array"); + } + TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(config["fields"], out)); + return out; + } + // For other named types, try to parse as a base dtype + out.has_fields = false; + out.fields.resize(1); + TENSORSTORE_ASSIGN_OR_RETURN( + static_cast(out.fields[0]), + ParseBaseDType(type_name)); + return out; + } + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected string, array, or object with 'name' and 'configuration', " + "but received: 
", + value.dump())); + } + // Handle array format: [["field1", "type1"], ["field2", "type2"], ...] + TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(value, out)); return out; } diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index c96c31426..880991e8c 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -31,7 +31,10 @@ #include #include +#include + #include "absl/algorithm/container.h" +#include "absl/strings/escaping.h" #include "absl/base/casts.h" #include "absl/base/optimization.h" #include "absl/meta/type_traits.h" @@ -282,16 +285,44 @@ absl::Status FillValueJsonBinder::operator()( TENSORSTORE_RETURN_IF_ERROR( DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); } else { - if (!j->is_array()) { - return internal_json::ExpectedError(*j, "array"); - } - if (j->size() != dtype.fields.size()) { - return internal_json::ExpectedError( - *j, tensorstore::StrCat("array of size ", dtype.fields.size())); - } - for (size_t i = 0; i < dtype.fields.size(); ++i) { - TENSORSTORE_RETURN_IF_ERROR( - DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + // For structured types, handle both array format and base64-encoded string + if (j->is_string()) { + // Decode base64-encoded fill value for entire struct + std::string b64_decoded; + if (!absl::Base64Unescape(j->get(), &b64_decoded)) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected valid base64-encoded fill value, but received: ", + j->dump())); + } + // Verify size matches expected struct size + if (static_cast(b64_decoded.size()) != + dtype.bytes_per_outer_element) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected ", dtype.bytes_per_outer_element, + " base64-encoded bytes for fill_value, but received ", + b64_decoded.size(), " bytes")); + } + // Extract per-field fill values from decoded bytes + for (size_t i = 0; i < dtype.fields.size(); ++i) { + const auto& field = dtype.fields[i]; + auto arr = 
AllocateArray(span{}, c_order, default_init, + field.dtype); + std::memcpy(arr.data(), b64_decoded.data() + field.byte_offset, + field.dtype->size); + (*obj)[i] = std::move(arr); + } + } else if (j->is_array()) { + if (j->size() != dtype.fields.size()) { + return internal_json::ExpectedError( + *j, tensorstore::StrCat("array of size ", dtype.fields.size())); + } + for (size_t i = 0; i < dtype.fields.size(); ++i) { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle((*j)[i], dtype.fields[i].dtype, (*obj)[i])); + } + } else { + return internal_json::ExpectedError(*j, + "array or base64-encoded string"); } } return absl::OkStatus(); @@ -561,28 +592,33 @@ std::string ZarrMetadata::GetCompatibilityKey() const { } absl::Status ValidateMetadata(ZarrMetadata& metadata) { + // Determine if this is a structured type with multiple fields + const bool is_structured = + metadata.data_type.fields.size() > 1 || + (metadata.data_type.fields.size() == 1 && + !metadata.data_type.fields[0].outer_shape.empty()); + + // Build the codec shape - for structured types, include bytes dimension + std::vector codec_shape(metadata.chunk_shape.begin(), + metadata.chunk_shape.end()); + if (is_structured) { + codec_shape.push_back(metadata.data_type.bytes_per_outer_element); + } + if (!metadata.codecs) { ArrayCodecResolveParameters decoded; - if (metadata.data_type.fields.size() == 1 && - metadata.data_type.fields[0].outer_shape.empty()) { + if (!is_structured) { decoded.dtype = metadata.data_type.fields[0].dtype; + decoded.rank = metadata.rank; } else { + // For structured types, use byte dtype with extra dimension decoded.dtype = dtype_v; - // TODO: Verify this works for structured types. - // Zarr2 uses a "scalar" array concept with byte storage for chunks. + decoded.rank = metadata.rank + 1; } - decoded.rank = metadata.rank; // Fill value for codec resolve might be complex. - // Zarr3 codecs usually don't depend on fill value except for some like - // "sharding_indexed"? 
Sharding uses fill_value for missing chunks. - if (metadata.fill_value.size() == 1) { + // For structured types, create a byte fill value + if (metadata.fill_value.size() == 1 && !is_structured) { decoded.fill_value = metadata.fill_value[0]; - } else { - // How to represent structured fill value for codec? - // Sharding expects a single array. - // If we use structured type, the "array" is bytes. - // We might need to encode the fill value to bytes. - // For now, leave empty if multiple fields. } BytesCodecResolveParameters encoded; @@ -593,17 +629,19 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { // Get codec chunk layout info. ArrayDataTypeAndShapeInfo array_info; - // array_info.dtype used here to validate codec compatibility. - if (metadata.data_type.fields.size() == 1 && - metadata.data_type.fields[0].outer_shape.empty()) { + if (!is_structured) { array_info.dtype = metadata.data_type.fields[0].dtype; + array_info.rank = metadata.rank; + std::copy_n(metadata.chunk_shape.begin(), metadata.rank, + array_info.shape.emplace().begin()); } else { array_info.dtype = dtype_v; + array_info.rank = metadata.rank + 1; + auto& shape = array_info.shape.emplace(); + std::copy_n(metadata.chunk_shape.begin(), metadata.rank, shape.begin()); + shape[metadata.rank] = metadata.data_type.bytes_per_outer_element; } - array_info.rank = metadata.rank; - std::copy_n(metadata.chunk_shape.begin(), metadata.rank, - array_info.shape.emplace().begin()); ArrayCodecChunkLayoutInfo layout_info; TENSORSTORE_RETURN_IF_ERROR( metadata.codec_specs.GetDecodedChunkLayout(array_info, layout_info)); @@ -617,7 +655,7 @@ absl::Status ValidateMetadata(ZarrMetadata& metadata) { } TENSORSTORE_ASSIGN_OR_RETURN(metadata.codec_state, - metadata.codecs->Prepare(metadata.chunk_shape)); + metadata.codecs->Prepare(codec_shape)); return absl::OkStatus(); } From c2e73cd6b1a2dcd5499522dce0bacd378af43279 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 24 Nov 2025 22:57:22 +0000 Subject: [PATCH 
03/20] Local testing and examples --- examples/BUILD | 23 +++ examples/CMakeLists.txt | 163 ++++++++++++++++++ examples/read_structured_zarr3.cc | 271 ++++++++++++++++++++++++++++++ 3 files changed, 457 insertions(+) create mode 100644 examples/CMakeLists.txt create mode 100644 examples/read_structured_zarr3.cc diff --git a/examples/BUILD b/examples/BUILD index 94acdba14..4dcb2d604 100644 --- a/examples/BUILD +++ b/examples/BUILD @@ -122,3 +122,26 @@ tensorstore_cc_binary( "@riegeli//riegeli/bytes:writer", ], ) + +tensorstore_cc_binary( + name = "read_structured_zarr3", + srcs = ["read_structured_zarr3.cc"], + deps = [ + "//tensorstore", + "//tensorstore:array", + "//tensorstore:context", + "//tensorstore:data_type", + "//tensorstore:index", + "//tensorstore:open", + "//tensorstore:open_mode", + "//tensorstore:spec", + "//tensorstore/driver/zarr3", + "//tensorstore/kvstore/file", + "//tensorstore/util:result", + "//tensorstore/util:status", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/flags:parse", + "@abseil-cpp//absl/status", + "@nlohmann_json//:json", + ], +) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..92e9857fa --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,163 @@ +# Standalone CMakeLists.txt for read_structured_zarr3 example +# +# Build instructions: +# mkdir -p /home/ubuntu/source/tensorstore/examples/build +# cd /home/ubuntu/source/tensorstore/examples/build +# cmake .. 
+# make +# +# Run: +# ./read_structured_zarr3 --zarr_path=/home/ubuntu/source/tensorstore/filt_mig.mdio/headers + +cmake_minimum_required(VERSION 3.24) +project(read_structured_zarr3 LANGUAGES CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Path to the tensorstore build directory +set(TENSORSTORE_BUILD_DIR "/home/ubuntu/source/tensorstore/build" CACHE PATH "Path to tensorstore build directory") +set(TENSORSTORE_SOURCE_DIR "/home/ubuntu/source/tensorstore" CACHE PATH "Path to tensorstore source directory") +set(DEPS_DIR "${TENSORSTORE_BUILD_DIR}/_deps") + +# Include paths (matching what tensorstore tests use) +include_directories( + ${TENSORSTORE_SOURCE_DIR} + ${DEPS_DIR}/absl-src + ${DEPS_DIR}/re2-src + ${DEPS_DIR}/riegeli-src +) + +include_directories(SYSTEM + ${DEPS_DIR}/half-build/include + ${DEPS_DIR}/half-src/include + ${DEPS_DIR}/nlohmann_json-build/include + ${DEPS_DIR}/nlohmann_json-src/include + ${TENSORSTORE_BUILD_DIR} +) + +# Compiler flags +add_compile_options( + -fPIE + -Wno-deprecated-declarations + -Wno-sign-compare + -Wno-unused-but-set-parameter + -Wno-maybe-uninitialized + -Wno-sequence-point + -Wno-unknown-warning-option + -Wno-stringop-overflow + -fsized-deallocation +) + +# Find all the static libraries we need from the tensorstore build +file(GLOB TENSORSTORE_LIBS "${TENSORSTORE_BUILD_DIR}/libtensorstore*.a") +file(GLOB_RECURSE ABSEIL_LIBS "${DEPS_DIR}/absl-build/absl/*.a") +file(GLOB_RECURSE RIEGELI_LIBS "${DEPS_DIR}/riegeli-build/*.a") + +# Additional dependency libraries - corrected paths +file(GLOB_RECURSE BLOSC_LIBS "${DEPS_DIR}/blosc-build/*.a") +file(GLOB_RECURSE ZSTD_LIBS "${DEPS_DIR}/zstd-build/*.a") +file(GLOB_RECURSE RE2_LIBS "${DEPS_DIR}/re2-build/*.a") +file(GLOB_RECURSE SNAPPY_LIBS "${DEPS_DIR}/snappy-build/*.a") +file(GLOB_RECURSE BROTLI_LIBS "${DEPS_DIR}/brotli-build/*.a") +file(GLOB_RECURSE LZ4_LIBS "${DEPS_DIR}/lz4-build/*.a") +file(GLOB_RECURSE ZLIB_LIBS "${DEPS_DIR}/zlib-build/*.a") 
+file(GLOB_RECURSE PROTOBUF_LIBS "${DEPS_DIR}/protobuf-build/*.a") +file(GLOB_RECURSE GRPC_LIBS "${DEPS_DIR}/grpc-build/*.a") +file(GLOB_RECURSE CARES_LIBS "${DEPS_DIR}/c-ares-build/*.a") +file(GLOB_RECURSE SSL_LIBS "${DEPS_DIR}/boringssl-build/ssl/*.a") +file(GLOB_RECURSE CRYPTO_LIBS "${DEPS_DIR}/boringssl-build/crypto/*.a") +file(GLOB_RECURSE LIBLZMA_LIBS "${DEPS_DIR}/liblzma-build/*.a") +file(GLOB_RECURSE BZIP2_LIBS "${DEPS_DIR}/bzip2-build/*.a") +file(GLOB_RECURSE JPEG_LIBS "${DEPS_DIR}/jpeg-build/*.a") +file(GLOB_RECURSE PNG_LIBS "${DEPS_DIR}/png-build/*.a") +file(GLOB_RECURSE TIFF_LIBS "${DEPS_DIR}/tiff-build/*.a") +file(GLOB_RECURSE AVIF_LIBS "${DEPS_DIR}/avif-build/*.a") +file(GLOB_RECURSE AOM_LIBS "${DEPS_DIR}/aom-build/*.a") +file(GLOB_RECURSE WEBP_LIBS "${DEPS_DIR}/webp-build/*.a") +file(GLOB_RECURSE CURL_LIBS "${DEPS_DIR}/curl-build/*.a") + +# Create executable +add_executable(read_structured_zarr3 read_structured_zarr3.cc) + +# Link libraries - use whole-archive for libraries that use static registration +# These include drivers, codecs, kvstores, and context resource providers +target_link_libraries(read_structured_zarr3 PRIVATE + # Force inclusion of libraries with static registrations + -Wl,--whole-archive + + # Context resource providers + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_data_copy_concurrency_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_file_io_concurrency_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_cache_cache_pool_resource.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_concurrency_resource.a + + # Zarr3 driver and codecs + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_driver.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_blosc.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_bytes.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_crc32c.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_gzip.a + 
${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_transpose.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_zstd.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_sharding_indexed.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_codec_chain_spec.a + + # File kvstore and its resource providers + ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file.a + ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file_file_resource.a + + -Wl,--no-whole-archive + + -Wl,--start-group + + # Tensorstore libs + ${TENSORSTORE_LIBS} + + # Riegeli + ${RIEGELI_LIBS} + + # Abseil + ${ABSEIL_LIBS} + + # Compression libs + ${BLOSC_LIBS} + ${ZSTD_LIBS} + ${LZ4_LIBS} + ${SNAPPY_LIBS} + ${BROTLI_LIBS} + ${ZLIB_LIBS} + ${LIBLZMA_LIBS} + ${BZIP2_LIBS} + + # Regex + ${RE2_LIBS} + + # Protocol buffers and gRPC + ${PROTOBUF_LIBS} + ${GRPC_LIBS} + ${CARES_LIBS} + + # SSL/TLS + ${SSL_LIBS} + ${CRYPTO_LIBS} + + # Image libraries + ${JPEG_LIBS} + ${PNG_LIBS} + ${TIFF_LIBS} + ${AVIF_LIBS} + ${AOM_LIBS} + ${WEBP_LIBS} + + # HTTP + ${CURL_LIBS} + + -Wl,--end-group + + # System libraries + pthread + dl + m + rt +) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc new file mode 100644 index 000000000..1caacd8f5 --- /dev/null +++ b/examples/read_structured_zarr3.cc @@ -0,0 +1,271 @@ +// Copyright 2024 The TensorStore Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Standalone test for reading structured data from a Zarr v3 array. +// +// This test opens an existing zarr3 array with structured data type, +// reads the "inline" field, and prints all values. +// +// Usage: +// bazel run //examples:read_structured_zarr3 -- /path/to/zarr/array +// +// Or with cmake: +// cd examples/build && ./read_structured_zarr3 + +#include + +#include +#include +#include +#include + +#include "absl/flags/flag.h" +#include "absl/flags/parse.h" +#include "absl/status/status.h" +#include +#include "tensorstore/array.h" +#include "tensorstore/context.h" +#include "tensorstore/data_type.h" +#include "tensorstore/index.h" +#include "tensorstore/open.h" +#include "tensorstore/open_mode.h" +#include "tensorstore/spec.h" +#include "tensorstore/tensorstore.h" +#include "tensorstore/util/result.h" +#include "tensorstore/util/status.h" + +ABSL_FLAG(std::string, zarr_path, + "/home/ubuntu/source/tensorstore/filt_mig.mdio/headers", + "Path to the zarr3 array directory"); + +namespace { + +using ::tensorstore::Index; + +// Field layout from the zarr.json metadata: +// The structured dtype has the following fields with their byte offsets: +// trace_seq_num_line: int32 @ 0 +// trace_seq_num_reel: int32 @ 4 +// ... (many more fields) ... 
+// inline: int32 @ 180 +// crossline: int32 @ 184 +// cdp_x: int32 @ 188 +// cdp_y: int32 @ 192 +// +// Total struct size: 196 bytes (matches blosc typesize) + +constexpr size_t kInlineFieldOffset = 180; +constexpr size_t kStructSize = 196; + +// Read and parse the zarr.json metadata to display info about structured type +void PrintZarrMetadata(const std::string& zarr_path) { + std::string metadata_path = zarr_path + "/zarr.json"; + std::ifstream file(metadata_path); + if (!file.is_open()) { + std::cerr << "Could not open " << metadata_path << std::endl; + return; + } + + nlohmann::json metadata; + try { + file >> metadata; + } catch (const nlohmann::json::parse_error& e) { + std::cerr << "Failed to parse zarr.json: " << e.what() << std::endl; + return; + } + + std::cout << "\n=== Zarr Metadata ===" << std::endl; + std::cout << "Shape: " << metadata["shape"].dump() << std::endl; + std::cout << "Dimension names: " << metadata["dimension_names"].dump() + << std::endl; + + if (metadata.contains("data_type")) { + auto& dt = metadata["data_type"]; + std::cout << "\nData type format:" << std::endl; + if (dt.is_object()) { + std::cout << " Type: object with name=\"" << dt["name"].get() + << "\"" << std::endl; + if (dt.contains("configuration") && + dt["configuration"].contains("fields")) { + auto& fields = dt["configuration"]["fields"]; + std::cout << " Number of fields: " << fields.size() << std::endl; + std::cout << " Fields:" << std::endl; + size_t byte_offset = 0; + for (const auto& field : fields) { + std::string name = field[0].get(); + std::string type = field[1].get(); + size_t size = (type == "int32" || type == "uint32" || type == "float32") + ? 
4 + : 2; // int16/uint16 + std::cout << " " << name << ": " << type << " @ byte " << byte_offset + << std::endl; + byte_offset += size; + } + std::cout << " Total struct size: " << byte_offset << " bytes" + << std::endl; + } + } else if (dt.is_string()) { + std::cout << " Type: simple \"" << dt.get() << "\"" + << std::endl; + } else if (dt.is_array()) { + std::cout << " Type: array with " << dt.size() << " fields" << std::endl; + } + } + + if (metadata.contains("codecs")) { + std::cout << "\nCodecs: " << metadata["codecs"].dump(2) << std::endl; + } +} + +absl::Status Run(const std::string& zarr_path) { + std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; + std::cout << "Opening zarr3 array at: " << zarr_path << std::endl; + + // First, display metadata information + PrintZarrMetadata(zarr_path); + + auto context = tensorstore::Context::Default(); + + // Create spec for opening the zarr3 array + // Note: "field" is at the driver level, not inside kvstore (same as zarr v2) + ::nlohmann::json spec_json = { + {"driver", "zarr3"}, + {"kvstore", + { + {"driver", "file"}, + {"path", zarr_path + "/"}, + }}, + {"field", "inline"}, // Field at byte offset 180 + }; + + std::cout << "\n=== Opening TensorStore ===" << std::endl; + std::cout << "Spec: " << spec_json.dump(2) << std::endl; + + // Open the TensorStore + auto open_result = + tensorstore::Open(spec_json, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!open_result.ok()) { + std::cout << "\n=== Open Failed ===" << std::endl; + std::cout << "Status: " << open_result.status() << std::endl; + std::cout << "\nThis error is expected if the zarr3 driver's dtype parsing\n" + << "does not yet support the extended structured data type format:\n" + << " {\"name\": \"structured\", \"configuration\": {\"fields\": [...]}}\n" + << std::endl; + std::cout << "The dtype.cc ParseDTypeNoDerived() function currently handles:\n" + << " 1. 
String format: \"int32\"\n" + << " 2. Array format: [[\"field1\", \"int32\"], ...]\n" + << "\nBut the zarr.json uses the extended object format shown above." + << std::endl; + return open_result.status(); + } + + auto store = std::move(open_result).value(); + + // Get information about the array + auto domain = store.domain(); + std::cout << "\n=== Array Info ===" << std::endl; + std::cout << "Domain: " << domain << std::endl; + std::cout << "Dtype: " << store.dtype() << std::endl; + std::cout << "Rank: " << store.rank() << std::endl; + + auto shape = domain.shape(); + std::cout << "Shape: ["; + for (int i = 0; i < shape.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << shape[i]; + } + std::cout << "]" << std::endl; + + // Read all data + std::cout << "\n=== Reading Data ===" << std::endl; + TENSORSTORE_ASSIGN_OR_RETURN( + auto array, tensorstore::Read(store).result()); + + std::cout << "Read complete. Array size: " << array.num_elements() + << " elements" << std::endl; + std::cout << "Data type: " << array.dtype() << std::endl; + + // Since field="inline" was specified, the array contains just int32 values + // directly - no struct extraction needed! 
+ Index num_inline = shape[0]; + Index num_crossline = shape[1]; + + std::cout << "\n=== Inline field values (shape: " << num_inline << " x " + << num_crossline << ") ===" << std::endl; + + // Cast to int32 pointer since the data is already the inline field values + auto int_ptr = reinterpret_cast(array.data()); + + // Print first 10 rows (or fewer if less data) + Index rows_to_print = std::min(num_inline, Index{10}); + Index cols_to_print = std::min(num_crossline, Index{10}); + + for (Index i = 0; i < rows_to_print; ++i) { + for (Index j = 0; j < cols_to_print; ++j) { + std::cout << int_ptr[i * num_crossline + j]; + if (j < cols_to_print - 1) { + std::cout << "\t"; + } + } + if (num_crossline > cols_to_print) { + std::cout << "\t..."; + } + std::cout << std::endl; + } + if (num_inline > rows_to_print) { + std::cout << "... (" << (num_inline - rows_to_print) << " more rows)" + << std::endl; + } + + std::cout << "\n=== Summary ===" << std::endl; + std::cout << "Successfully read " << (num_inline * num_crossline) + << " inline values" << std::endl; + + // Show some statistics + int32_t min_val = int_ptr[0], max_val = int_ptr[0]; + int64_t sum = 0; + for (Index i = 0; i < num_inline * num_crossline; ++i) { + min_val = std::min(min_val, int_ptr[i]); + max_val = std::max(max_val, int_ptr[i]); + sum += int_ptr[i]; + } + std::cout << "Min value: " << min_val << std::endl; + std::cout << "Max value: " << max_val << std::endl; + std::cout << "Mean value: " << (static_cast(sum) / (num_inline * num_crossline)) << std::endl; + + return absl::OkStatus(); +} + +} // namespace + +int main(int argc, char** argv) { + absl::ParseCommandLine(argc, argv); + + std::string zarr_path = absl::GetFlag(FLAGS_zarr_path); + if (zarr_path.empty()) { + std::cerr << "Error: --zarr_path is required" << std::endl; + return 1; + } + + auto status = Run(zarr_path); + if (!status.ok()) { + std::cerr << "\nFinal status: " << status << std::endl; + return 1; + } + + return 0; +} From 
9e8ed947f5912394ca715d36d6fd1eb630d04e8a Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 18:12:58 +0000 Subject: [PATCH 04/20] Begin adding support for opening struct arrays as void and add support for raw bits dtype --- examples/read_structured_zarr3.cc | 324 +++++++++++++++++++----- tensorstore/driver/zarr3/chunk_cache.cc | 7 + tensorstore/driver/zarr3/driver.cc | 180 +++++++++++-- tensorstore/driver/zarr3/dtype.cc | 52 +++- tensorstore/driver/zarr3/dtype_test.cc | 14 + tensorstore/driver/zarr3/metadata.cc | 89 ++++++- 6 files changed, 565 insertions(+), 101 deletions(-) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index 1caacd8f5..259eade34 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -12,16 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Standalone test for reading structured data from a Zarr v3 array. +// Standalone test for reading structured data from Zarr v3 arrays. // -// This test opens an existing zarr3 array with structured data type, -// reads the "inline" field, and prints all values. +// This test opens two Zarr v3 arrays: +// 1. A structured array with named fields (headers/) +// 2. A raw bytes array containing struct data (raw_headers/) +// +// Both arrays should contain the same data, allowing comparison of: +// - Field-based access vs manual byte extraction +// - Structured dtype parsing vs raw byte handling // // Usage: -// bazel run //examples:read_structured_zarr3 -- /path/to/zarr/array +// bazel run //examples:read_structured_zarr3 -- /path/to/parent/dir // // Or with cmake: -// cd examples/build && ./read_structured_zarr3 +// cd examples/build && ./read_structured_zarr3 --zarr_path=/path/to/parent/dir +// +// Where the parent dir contains both 'headers/' and 'raw_headers/' subdirs. 
#include @@ -45,9 +52,15 @@ #include "tensorstore/util/result.h" #include "tensorstore/util/status.h" +// Internal headers for testing dtype parsing +#include "tensorstore/driver/zarr3/dtype.h" + +// Additional headers for string operations +#include "absl/strings/str_join.h" + ABSL_FLAG(std::string, zarr_path, - "/home/ubuntu/source/tensorstore/filt_mig.mdio/headers", - "Path to the zarr3 array directory"); + "/home/ubuntu/source/tensorstore/filt_mig.mdio", + "Path to the parent .mdio directory containing headers/ and raw_headers/"); namespace { @@ -128,56 +141,13 @@ void PrintZarrMetadata(const std::string& zarr_path) { } } -absl::Status Run(const std::string& zarr_path) { - std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; - std::cout << "Opening zarr3 array at: " << zarr_path << std::endl; - - // First, display metadata information - PrintZarrMetadata(zarr_path); - - auto context = tensorstore::Context::Default(); - - // Create spec for opening the zarr3 array - // Note: "field" is at the driver level, not inside kvstore (same as zarr v2) - ::nlohmann::json spec_json = { - {"driver", "zarr3"}, - {"kvstore", - { - {"driver", "file"}, - {"path", zarr_path + "/"}, - }}, - {"field", "inline"}, // Field at byte offset 180 - }; - - std::cout << "\n=== Opening TensorStore ===" << std::endl; - std::cout << "Spec: " << spec_json.dump(2) << std::endl; - - // Open the TensorStore - auto open_result = - tensorstore::Open(spec_json, context, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!open_result.ok()) { - std::cout << "\n=== Open Failed ===" << std::endl; - std::cout << "Status: " << open_result.status() << std::endl; - std::cout << "\nThis error is expected if the zarr3 driver's dtype parsing\n" - << "does not yet support the extended structured data type format:\n" - << " {\"name\": \"structured\", \"configuration\": {\"fields\": [...]}}\n" - << std::endl; - std::cout << "The dtype.cc ParseDTypeNoDerived() 
function currently handles:\n" - << " 1. String format: \"int32\"\n" - << " 2. Array format: [[\"field1\", \"int32\"], ...]\n" - << "\nBut the zarr.json uses the extended object format shown above." - << std::endl; - return open_result.status(); - } - - auto store = std::move(open_result).value(); - +// Helper function to read and display inline field from an array +absl::Status ReadInlineField(const tensorstore::TensorStore<>& store, + const std::string& array_name, + bool is_raw_bytes = false) { // Get information about the array auto domain = store.domain(); - std::cout << "\n=== Array Info ===" << std::endl; + std::cout << "\n=== " << array_name << " Array Info ===" << std::endl; std::cout << "Domain: " << domain << std::endl; std::cout << "Dtype: " << store.dtype() << std::endl; std::cout << "Rank: " << store.rank() << std::endl; @@ -191,7 +161,7 @@ absl::Status Run(const std::string& zarr_path) { std::cout << "]" << std::endl; // Read all data - std::cout << "\n=== Reading Data ===" << std::endl; + std::cout << "\n=== Reading " << array_name << " Data ===" << std::endl; TENSORSTORE_ASSIGN_OR_RETURN( auto array, tensorstore::Read(store).result()); @@ -199,16 +169,46 @@ absl::Status Run(const std::string& zarr_path) { << " elements" << std::endl; std::cout << "Data type: " << array.dtype() << std::endl; - // Since field="inline" was specified, the array contains just int32 values - // directly - no struct extraction needed! - Index num_inline = shape[0]; - Index num_crossline = shape[1]; + Index num_inline, num_crossline; + const int32_t* int_ptr; + + if (is_raw_bytes) { + // For raw bytes, we need to extract the inline field manually + // Shape is [inline, crossline, struct_size] + num_inline = shape[0]; + num_crossline = shape[1]; + Index struct_size = shape[2]; + if (struct_size != kStructSize) { + std::cout << "Warning: Raw struct size (" << struct_size + << ") differs from expected header struct size (" << kStructSize + << "). Assuming padding." 
<< std::endl; + } - std::cout << "\n=== Inline field values (shape: " << num_inline << " x " - << num_crossline << ") ===" << std::endl; + // Extract inline field (4 bytes starting at offset 180) + auto byte_ptr = reinterpret_cast(array.data()); + std::vector inline_values(num_inline * num_crossline); - // Cast to int32 pointer since the data is already the inline field values - auto int_ptr = reinterpret_cast(array.data()); + for (Index i = 0; i < num_inline; ++i) { + for (Index j = 0; j < num_crossline; ++j) { + Index struct_offset = (i * num_crossline + j) * struct_size; + Index field_offset = struct_offset + kInlineFieldOffset; + std::memcpy(&inline_values[i * num_crossline + j], + byte_ptr + field_offset, 4); + } + } + + std::cout << "Extracted inline field from raw bytes at offset " + << kInlineFieldOffset << std::endl; + int_ptr = inline_values.data(); + } else { + // For structured array, field access already gave us int32 values + num_inline = shape[0]; + num_crossline = shape[1]; + int_ptr = reinterpret_cast(array.data()); + } + + std::cout << "\n=== Inline field values from " << array_name + << " (shape: " << num_inline << " x " << num_crossline << ") ===" << std::endl; // Print first 10 rows (or fewer if less data) Index rows_to_print = std::min(num_inline, Index{10}); @@ -231,10 +231,10 @@ absl::Status Run(const std::string& zarr_path) { << std::endl; } - std::cout << "\n=== Summary ===" << std::endl; + std::cout << "\n=== " << array_name << " Summary ===" << std::endl; std::cout << "Successfully read " << (num_inline * num_crossline) << " inline values" << std::endl; - + // Show some statistics int32_t min_val = int_ptr[0], max_val = int_ptr[0]; int64_t sum = 0; @@ -250,6 +250,189 @@ absl::Status Run(const std::string& zarr_path) { return absl::OkStatus(); } +absl::Status Run(const std::string& zarr_path) { + std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; + std::cout << "Opening zarr3 arrays in: " << zarr_path << std::endl; + + 
auto context = tensorstore::Context::Default(); + + // First, display metadata information for structured array + std::string headers_path = zarr_path + "/headers"; + PrintZarrMetadata(headers_path); + + // Test raw_bytes parsing by reading and parsing the raw_headers zarr.json + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TESTING RAW_BYTES PARSING" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + std::string raw_metadata_path = zarr_path + "/raw_headers/zarr.json"; + std::ifstream raw_file(raw_metadata_path); + if (!raw_file.is_open()) { + std::cout << "Could not open " << raw_metadata_path << std::endl; + return absl::NotFoundError("Raw headers metadata not found"); + } + + nlohmann::json raw_metadata; + try { + raw_file >> raw_metadata; + } catch (const nlohmann::json::parse_error& e) { + std::cout << "Failed to parse raw zarr.json: " << e.what() << std::endl; + return absl::DataLossError("Invalid raw metadata JSON"); + } + + std::cout << "Raw headers data_type: " << raw_metadata["data_type"].dump(2) << std::endl; + + // Test parsing the raw_bytes data type + std::cout << "Testing raw_bytes dtype parsing..." 
<< std::endl; + + // For now, just verify the JSON structure is what we expect + if (!raw_metadata.contains("data_type")) { + std::cout << "FAILED: No data_type in metadata" << std::endl; + return absl::NotFoundError("Missing data_type"); + } + + auto& dt = raw_metadata["data_type"]; + if (!dt.is_object() || !dt.contains("name") || dt["name"] != "raw_bytes") { + std::cout << "FAILED: data_type is not raw_bytes extension" << std::endl; + return absl::InvalidArgumentError("Not raw_bytes extension"); + } + + if (!dt.contains("configuration") || !dt["configuration"].contains("length_bytes")) { + std::cout << "FAILED: Missing length_bytes in configuration" << std::endl; + return absl::InvalidArgumentError("Missing length_bytes"); + } + + int length_bytes = dt["configuration"]["length_bytes"]; + std::cout << "SUCCESS: Found raw_bytes extension with length_bytes = " << length_bytes << std::endl; + std::cout << "This should parse to:" << std::endl; + std::cout << " - Single field with byte_t dtype" << std::endl; + std::cout << " - Field shape: [" << length_bytes << "]" << std::endl; + std::cout << " - Bytes per outer element: " << length_bytes << std::endl; + + // Now actually test the parsing implementation + std::cout << "\n=== Testing ParseDType Implementation ===" << std::endl; + auto dtype_result = tensorstore::internal_zarr3::ParseDType(dt); + if (!dtype_result.ok()) { + std::cout << "FAILED: Could not parse raw_bytes data type: " << dtype_result.status() << std::endl; + return dtype_result.status(); + } + + auto dtype = std::move(dtype_result).value(); + std::cout << "SUCCESS: ParseDType worked!" 
<< std::endl; + std::cout << " Fields: " << dtype.fields.size() << std::endl; + std::cout << " Has fields: " << dtype.has_fields << std::endl; + std::cout << " Bytes per outer element: " << dtype.bytes_per_outer_element << std::endl; + + if (!dtype.fields.empty()) { + const auto& field = dtype.fields[0]; + std::cout << " Field name: '" << field.name << "'" << std::endl; + std::cout << " Field dtype: " << field.dtype << std::endl; + std::cout << " Field shape: [" << absl::StrJoin(field.field_shape, ", ") << "]" << std::endl; + std::cout << " Field num_inner_elements: " << field.num_inner_elements << std::endl; + std::cout << " Field num_bytes: " << field.num_bytes << std::endl; + } + + // Verify the parsing is correct + bool parsing_correct = true; + if (dtype.fields.size() != 1) { + std::cout << "ERROR: Expected 1 field, got " << dtype.fields.size() << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].name != "") { + std::cout << "ERROR: Expected empty field name, got '" << dtype.fields[0].name << "'" << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].dtype != tensorstore::dtype_v) { + std::cout << "ERROR: Expected byte_t dtype, got " << dtype.fields[0].dtype << std::endl; + parsing_correct = false; + } + if (dtype.fields[0].field_shape != std::vector{length_bytes}) { + std::cout << "ERROR: Expected field shape [" << length_bytes << "], got [" + << absl::StrJoin(dtype.fields[0].field_shape, ", ") << "]" << std::endl; + parsing_correct = false; + } + if (dtype.bytes_per_outer_element != length_bytes) { + std::cout << "ERROR: Expected " << length_bytes << " bytes per element, got " + << dtype.bytes_per_outer_element << std::endl; + parsing_correct = false; + } + + if (parsing_correct) { + std::cout << "\n✅ PARSING VERIFICATION: All checks passed!" << std::endl; + std::cout << "The raw_bytes extension is correctly parsed." << std::endl; + } else { + std::cout << "\n❌ PARSING VERIFICATION: Some checks failed!" 
<< std::endl; + return absl::InternalError("Parsing verification failed"); + } + + // Test 1: Read from structured array using field access + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TEST 1: Reading from structured 'headers' array" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + ::nlohmann::json headers_spec = ::nlohmann::json::object(); + headers_spec["driver"] = "zarr3"; + headers_spec["kvstore"] = ::nlohmann::json::object(); + headers_spec["kvstore"]["driver"] = "file"; + headers_spec["kvstore"]["path"] = headers_path + "/"; + headers_spec["field"] = "inline"; // Extract inline field (int32 at byte offset 180) + + std::cout << "Spec: " << headers_spec.dump(2) << std::endl; + + auto headers_open_result = + tensorstore::Open(headers_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!headers_open_result.ok()) { + std::cout << "\n=== Headers Open Failed ===" << std::endl; + std::cout << "Status: " << headers_open_result.status() << std::endl; + return headers_open_result.status(); + } + + auto headers_store = std::move(headers_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_store, "headers")); + + // Test 2: Read from raw bytes array (no special void access needed) + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "TEST 2: Reading from raw 'raw_headers' array" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + std::string raw_headers_path = zarr_path + "/raw_headers"; + ::nlohmann::json raw_spec = ::nlohmann::json::object(); + raw_spec["driver"] = "zarr3"; + raw_spec["kvstore"] = ::nlohmann::json::object(); + raw_spec["kvstore"]["driver"] = "file"; + raw_spec["kvstore"]["path"] = raw_headers_path + "/"; + // No field specified - raw_bytes has a single anonymous field + + std::cout << "Spec: " << raw_spec.dump(2) << std::endl; + + auto raw_open_result = + tensorstore::Open(raw_spec, context, 
tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!raw_open_result.ok()) { + std::cout << "\n=== Raw Headers Open Failed ===" << std::endl; + std::cout << "Status: " << raw_open_result.status() << std::endl; + return raw_open_result.status(); + } + + auto raw_store = std::move(raw_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); + + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "COMPARISON: Both methods should give identical inline field values" << std::endl; + std::cout << std::string(60, '=') << std::endl; + std::cout << "The structured 'headers' array provides field access convenience,\n" + << "while the raw 'raw_headers' array provides direct byte access.\n" + << "Both extract the inline field from byte offset " << kInlineFieldOffset + << " in " << kStructSize << "-byte structs." << std::endl; + + return absl::OkStatus(); +} + } // namespace int main(int argc, char** argv) { @@ -261,6 +444,15 @@ int main(int argc, char** argv) { return 1; } + // Verify the path structure + std::string headers_path = zarr_path + "/headers"; + std::string raw_headers_path = zarr_path + "/raw_headers"; + + std::cout << "Expecting arrays at:" << std::endl; + std::cout << " Structured: " << headers_path << std::endl; + std::cout << " Raw bytes: " << raw_headers_path << std::endl; + std::cout << std::endl; + auto status = Run(zarr_path); if (!status.ok()) { std::cerr << "\nFinal status: " << status << std::endl; diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 6bfa8c039..64b6d69fd 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -156,6 +156,13 @@ ZarrLeafChunkCache::DecodeChunk(span chunk_indices, const size_t num_fields = dtype_.fields.size(); absl::InlinedVector, 1> field_arrays(num_fields); + // Special case: void access - return raw bytes directly 
+ if (num_fields == 1 && dtype_.fields[0].name == "") { + TENSORSTORE_ASSIGN_OR_RETURN( + field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), + std::move(data))); + return field_arrays; + } // For single non-structured field, decode directly if (num_fields == 1 && dtype_.fields[0].outer_shape.empty()) { diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 1674a1c6d..b4d96da1f 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -79,6 +80,8 @@ namespace tensorstore { namespace internal_zarr3 { +constexpr size_t kVoidFieldIndex = size_t(-1); + // Avoid anonymous namespace to workaround MSVC bug. // // https://developercommunity.visualstudio.com/t/Bug-involving-virtual-functions-templat/10424129 @@ -263,12 +266,29 @@ class DataCacheBase DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) override { const auto& metadata = *static_cast(metadata_ptr); - assert(bounds.rank() == static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); + assert(bounds.rank() >= static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), + bounds.origin().begin() + metadata.shape.size(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); implicit_lower_bounds = false; - implicit_upper_bounds = true; + implicit_upper_bounds = false; + for (DimensionIndex i = 0; + i < static_cast(metadata.shape.size()); ++i) { + implicit_upper_bounds[i] = true; + } + if (bounds.rank() > static_cast(metadata.shape.size()) && + metadata.data_type.fields.size() == 1) { + const auto& field = metadata.data_type.fields[0]; + if (static_cast(metadata.shape.size() + + field.field_shape.size()) == + bounds.rank()) { + for (size_t i = 0; i < field.field_shape.size(); ++i) { + bounds.shape()[metadata.shape.size() + i] 
= field.field_shape[i]; + bounds.origin()[metadata.shape.size() + i] = 0; + } + } + } } Result> GetResizedMetadata( @@ -289,10 +309,47 @@ class DataCacheBase } static internal::ChunkGridSpecification GetChunkGridSpecification( - const ZarrMetadata& metadata) { + const ZarrMetadata& metadata, size_t field_index = 0) { assert(!metadata.fill_value.empty()); internal::ChunkGridSpecification::ComponentList components; + // Special case: void access - create single component for entire struct + if (field_index == kVoidFieldIndex) { + // For void access, use the fill_value from the single raw_bytes field + auto& fill_value = metadata.fill_value[0]; + std::cout << "[DEBUG] Void access fill_value: shape=" << fill_value.shape() + << ", dtype=" << fill_value.dtype() << std::endl; + + // Broadcast to shape [unbounded, unbounded, ..., struct_size] + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.push_back(metadata.data_type.bytes_per_outer_element); + std::cout << "[DEBUG] Void access target_shape: ["; + for (size_t i = 0; i < target_shape.size(); ++i) { + if (i > 0) std::cout << ", "; + std::cout << target_shape[i]; + } + std::cout << "]" << std::endl; + auto chunk_fill_value = + BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + + // Add extra dimension for struct size in bytes + std::vector chunk_shape_with_bytes = metadata.chunk_shape; + chunk_shape_with_bytes.push_back(metadata.data_type.bytes_per_outer_element); + + auto& component = components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), + // Since all dimensions are resizable, just + // specify unbounded `valid_data_bounds`. 
+ Box<>(metadata.rank + 1), + ContiguousLayoutPermutation<>( + span(metadata.inner_order.data(), metadata.rank + 1))}, + chunk_shape_with_bytes); + component.array_spec.fill_value_comparison_kind = + EqualityComparisonKind::identical; + return internal::ChunkGridSpecification(std::move(components)); + } + // Create one component per field (like zarr v2) for (size_t field_i = 0; field_i < metadata.data_type.fields.size(); ++field_i) { @@ -303,18 +360,47 @@ class DataCacheBase fill_value = AllocateArray(span{}, c_order, value_init, field.dtype); } + + // Handle fields with shape (e.g. raw_bytes) + const size_t field_rank = field.field_shape.size(); + + // 1. Construct target shape for broadcasting + std::vector target_shape(metadata.rank, kInfIndex); + target_shape.insert(target_shape.end(), field.field_shape.begin(), + field.field_shape.end()); + auto chunk_fill_value = - BroadcastArray(fill_value, BoxView<>(metadata.rank)).value(); + BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + + // 2. Construct component chunk shape + std::vector component_chunk_shape = metadata.chunk_shape; + component_chunk_shape.insert(component_chunk_shape.end(), + field.field_shape.begin(), + field.field_shape.end()); + + // 3. Construct permutation + std::vector component_permutation(metadata.rank + + field_rank); + std::copy_n(metadata.inner_order.data(), metadata.rank, + component_permutation.begin()); + std::iota(component_permutation.begin() + metadata.rank, + component_permutation.end(), metadata.rank); + + // 4. Construct bounds + Box<> valid_data_bounds(metadata.rank + field_rank); + for (size_t i = 0; i < field_rank; ++i) { + valid_data_bounds[metadata.rank + i] = + IndexInterval::UncheckedSized(0, field.field_shape[i]); + } auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ std::move(chunk_fill_value), // Since all dimensions are resizable, just // specify unbounded `valid_data_bounds`. 
- Box<>(metadata.rank), - ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank))}, - metadata.chunk_shape); + std::move(valid_data_bounds), + ContiguousLayoutPermutation<>(component_permutation)}, + component_chunk_shape); component.array_spec.fill_value_comparison_kind = EqualityComparisonKind::identical; } @@ -342,7 +428,7 @@ class DataCacheBase [](std::string& out, DimensionIndex dim, Index grid_index) { absl::StrAppend(&out, grid_index); }, - rank, grid_indices); + rank, grid_indices.subspan(0, rank)); return key; } @@ -355,17 +441,21 @@ class DataCacheBase key_prefix_.size() + (metadata.chunk_key_encoding.kind == ChunkKeyEncoding::kDefault ? 2 : 0)); - return internal::ParseGridIndexKeyWithDimensionSeparator( - metadata.chunk_key_encoding.separator, - [](std::string_view part, DimensionIndex dim, Index& grid_index) { - if (part.empty() || !absl::ascii_isdigit(part.front()) || - !absl::ascii_isdigit(part.back()) || - !absl::SimpleAtoi(part, &grid_index)) { - return false; - } - return true; - }, - key, grid_indices); + if (!internal::ParseGridIndexKeyWithDimensionSeparator( + metadata.chunk_key_encoding.separator, + [](std::string_view part, DimensionIndex dim, Index& grid_index) { + if (part.empty() || !absl::ascii_isdigit(part.front()) || + !absl::ascii_isdigit(part.back()) || + !absl::SimpleAtoi(part, &grid_index)) { + return false; + } + return true; + }, + key, grid_indices.subspan(0, metadata.rank))) { + return false; + } + std::fill(grid_indices.begin() + metadata.rank, grid_indices.end(), 0); + return true; } Index MinGridIndexForLexicographicalOrder( @@ -378,7 +468,7 @@ class DataCacheBase *static_cast(initial_metadata().get()); if (metadata.chunk_key_encoding.kind == ChunkKeyEncoding::kDefault) { std::string key = tensorstore::StrCat(key_prefix_, "c"); - for (DimensionIndex i = 0; i < cell_indices.size(); ++i) { + for (DimensionIndex i = 0; i < metadata.rank; ++i) { tensorstore::StrAppend( &key, 
std::string_view(&metadata.chunk_key_encoding.separator, 1), cell_indices[i]); @@ -388,7 +478,7 @@ class DataCacheBase // Use "0" for rank 0 as a special case. std::string key = tensorstore::StrCat( key_prefix_, cell_indices.empty() ? 0 : cell_indices[0]); - for (DimensionIndex i = 1; i < cell_indices.size(); ++i) { + for (DimensionIndex i = 1; i < metadata.rank; ++i) { tensorstore::StrAppend( &key, std::string_view(&metadata.chunk_key_encoding.separator, 1), cell_indices[i]); @@ -400,7 +490,11 @@ class DataCacheBase const void* metadata_ptr, size_t component_index) override { // component_index corresponds to the selected field index const auto& metadata = *static_cast(metadata_ptr); + const auto& field = metadata.data_type.fields[component_index]; const DimensionIndex rank = metadata.rank; + const DimensionIndex field_rank = field.field_shape.size(); + const DimensionIndex total_rank = rank + field_rank; + std::string_view normalized_dimension_names[kMaxRank]; for (DimensionIndex i = 0; i < rank; ++i) { if (const auto& name = metadata.dimension_names[i]; name.has_value()) { @@ -408,11 +502,20 @@ class DataCacheBase } } auto builder = - tensorstore::IndexTransformBuilder<>(rank, rank) - .input_shape(metadata.shape) - .input_labels(span(&normalized_dimension_names[0], rank)); - builder.implicit_upper_bounds(true); + tensorstore::IndexTransformBuilder<>(total_rank, total_rank); + std::vector full_shape = metadata.shape; + full_shape.insert(full_shape.end(), field.field_shape.begin(), + field.field_shape.end()); + builder.input_shape(full_shape); + builder.input_labels(span(&normalized_dimension_names[0], total_rank)); + + DimensionSet implicit_upper_bounds(false); for (DimensionIndex i = 0; i < rank; ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + + for (DimensionIndex i = 0; i < total_rank; ++i) { builder.output_single_input_dimension(i, i); } return builder.Finalize(); @@ -643,9 +746,26 @@ class 
ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { DataCacheInitializer&& initializer) override { const auto& metadata = *static_cast(initializer.metadata.get()); + // For void access, modify the dtype to indicate special handling + ZarrDType dtype = metadata.data_type; + if (spec().selected_field == "") { + // Create a synthetic dtype for void access + dtype = ZarrDType{ + /*.has_fields=*/false, + /*.fields=*/{ZarrDType::Field{ + ZarrDType::BaseDType{"", dtype_v, + {metadata.data_type.bytes_per_outer_element}}, + /*.outer_shape=*/{}, + /*.name=*/"", + /*.field_shape=*/{metadata.data_type.bytes_per_outer_element}, + /*.num_inner_elements=*/metadata.data_type.bytes_per_outer_element, + /*.byte_offset=*/0, + /*.num_bytes=*/metadata.data_type.bytes_per_outer_element}}, + /*.bytes_per_outer_element=*/metadata.data_type.bytes_per_outer_element}; + } return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, std::move(initializer), spec().store.path, - metadata.codec_state, metadata.data_type, + metadata.codec_state, dtype, /*data_cache_pool=*/*cache_pool()); } @@ -657,6 +777,10 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto field_index, GetFieldIndex(metadata.data_type, spec().selected_field)); + // For void access, map to component index 0 + if (field_index == kVoidFieldIndex) { + field_index = 0; + } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 281b9c98b..116712d70 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -19,6 +19,7 @@ #include #include "absl/base/optimization.h" +#include "absl/strings/ascii.h" #include "tensorstore/data_type.h" #include "tensorstore/internal/json_binding/json_binding.h" #include "tensorstore/util/endian.h" @@ -57,9 +58,26 @@ Result ParseBaseDType(std::string_view dtype) { if (dtype 
== "complex128") return make_dtype(dtype_v<::tensorstore::dtypes::complex128_t>); + // Handle r raw bits type where N is number of bits (must be multiple of 8) + if (dtype.size() > 1 && dtype[0] == 'r' && absl::ascii_isdigit(dtype[1])) { + std::string_view suffix = dtype.substr(1); + Index num_bits = 0; + if (!absl::SimpleAtoi(suffix, &num_bits) || + num_bits == 0 || + num_bits % 8 != 0) { + return absl::InvalidArgumentError(tensorstore::StrCat( + dtype, " data type is invalid; expected r where N is a positive " + "multiple of 8")); + } + Index num_bytes = num_bits / 8; + return ZarrDType::BaseDType{std::string(dtype), + dtype_v<::tensorstore::dtypes::byte_t>, + {num_bytes}}; + } + constexpr std::string_view kSupported = "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " - "bfloat16, float16, float32, float64, complex64, complex128"; + "bfloat16, float16, float32, float64, complex64, complex128, r"; return absl::InvalidArgumentError( tensorstore::StrCat(dtype, " data type is not one of the supported " "data types: ", @@ -162,6 +180,34 @@ Result ParseDTypeNoDerived(const nlohmann::json& value) { TENSORSTORE_RETURN_IF_ERROR(ParseFieldsArray(config["fields"], out)); return out; } + if (type_name == "raw_bytes") { + const auto& config = value["configuration"]; + if (!config.is_object() || !config.contains("length_bytes")) { + return absl::InvalidArgumentError( + "raw_bytes data type requires 'configuration' object with " + "'length_bytes' field"); + } + Index length_bytes; + TENSORSTORE_RETURN_IF_ERROR( + internal_json::JsonRequireValueAs(config["length_bytes"], &length_bytes)); + if (length_bytes <= 0) { + return absl::InvalidArgumentError( + "raw_bytes length_bytes must be positive"); + } + out.has_fields = false; + out.fields.resize(1); + out.fields[0].encoded_dtype = "raw_bytes"; + out.fields[0].dtype = dtype_v; + out.fields[0].flexible_shape = {length_bytes}; + out.fields[0].outer_shape = {}; + out.fields[0].name = ""; + out.fields[0].field_shape = 
{length_bytes}; + out.fields[0].num_inner_elements = length_bytes; + out.fields[0].byte_offset = 0; + out.fields[0].num_bytes = length_bytes; + out.bytes_per_outer_element = length_bytes; + return out; + } // For other named types, try to parse as a base dtype out.has_fields = false; out.fields.resize(1); @@ -326,6 +372,10 @@ Result ChooseBaseDType(DataType dtype) { return MakeBaseDType("complex64", dtype); if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) return MakeBaseDType("complex128", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) + return MakeBaseDType("r8", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::char_t>) + return MakeBaseDType("r8", dtype); return absl::InvalidArgumentError( tensorstore::StrCat("Data type not supported: ", dtype)); } diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index cbb7acbfb..e1c5b444c 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -68,6 +68,9 @@ TEST(ParseBaseDType, Success) { CheckBaseDType("float64", dtype_v, {}); CheckBaseDType("complex64", dtype_v, {}); CheckBaseDType("complex128", dtype_v, {}); + CheckBaseDType("r8", dtype_v, {1}); + CheckBaseDType("r16", dtype_v, {2}); + CheckBaseDType("r64", dtype_v, {8}); } TEST(ParseBaseDType, Failure) { @@ -81,6 +84,15 @@ TEST(ParseBaseDType, Failure) { StatusIs(absl::StatusCode::kInvalidArgument)); EXPECT_THAT(ParseBaseDType(""))); + EXPECT_THAT(ParseBaseDType("r7"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is invalid; expected r"))); + EXPECT_THAT(ParseBaseDType("r0"), + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("data type is invalid; expected r"))); } void CheckDType(const ::nlohmann::json& json, const ZarrDType& expected) { @@ -266,6 +278,8 @@ TEST(ChooseBaseDTypeTest, RoundTrip) { dtype_v, dtype_v, dtype_v, + dtype_v, + dtype_v, }; for (auto dtype : kSupportedDataTypes) { 
SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 880991e8c..6a83cdbec 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -250,6 +250,10 @@ constexpr std::array FillValueDataTypeFunctions::Make<::tensorstore::dtypes::T>(); \ /**/ TENSORSTORE_ZARR3_FOR_EACH_DATA_TYPE(TENSORSTORE_INTERNAL_DO_DEF) + // Add char_t support for string data types + functions[static_cast(DataTypeId::char_t)] = + FillValueDataTypeFunctions::Make<::tensorstore::dtypes::char_t>(); + // byte_t is handled specially to use uint8_t functions #undef TENSORSTORE_INTERNAL_DO_DEF return functions; }(); @@ -282,8 +286,39 @@ absl::Status FillValueJsonBinder::operator()( std::vector>* obj, ::nlohmann::json* j) const { obj->resize(dtype.fields.size()); if (dtype.fields.size() == 1) { - TENSORSTORE_RETURN_IF_ERROR( - DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + // Special case: raw_bytes (single field with byte_t and flexible shape) + if (dtype.fields[0].dtype.id() == DataTypeId::byte_t && + !dtype.fields[0].flexible_shape.empty()) { + // Handle base64-encoded fill value for raw_bytes + if (!j->is_string()) { + return absl::InvalidArgumentError( + "Expected base64-encoded string for raw_bytes fill_value"); + } + std::string b64_decoded; + if (!absl::Base64Unescape(j->get(), &b64_decoded)) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected valid base64-encoded fill value, but received: ", + j->dump())); + } + // Verify size matches expected byte array size + Index expected_size = dtype.fields[0].num_inner_elements; + if (static_cast(b64_decoded.size()) != expected_size) { + return absl::InvalidArgumentError(tensorstore::StrCat( + "Expected ", expected_size, + " base64-encoded bytes for fill_value, but received ", + b64_decoded.size(), " bytes")); + } + // Create fill value array + auto fill_arr = 
AllocateArray(dtype.fields[0].field_shape, c_order, + default_init, dtype.fields[0].dtype); + std::memcpy(fill_arr.data(), b64_decoded.data(), b64_decoded.size()); + std::cout << "[DEBUG] Raw bytes fill_value parsed: shape=" << fill_arr.shape() + << ", dtype=" << dtype.fields[0].dtype << std::endl; + (*obj)[0] = std::move(fill_arr); + } else { + TENSORSTORE_RETURN_IF_ERROR( + DecodeSingle(*j, dtype.fields[0].dtype, (*obj)[0])); + } } else { // For structured types, handle both array format and base64-encoded string if (j->is_string()) { @@ -361,8 +396,14 @@ absl::Status FillValueJsonBinder::DecodeSingle(::nlohmann::json& j, AllocateArray(span{}, c_order, default_init, data_type); void* data = arr.data(); out = std::move(arr); + // Special handling for byte_t: use uint8_t functions since they're binary compatible + auto type_id = data_type.id(); + if (type_id == DataTypeId::byte_t) { + type_id = DataTypeId::uint8_t; + } + const auto& functions = - kFillValueDataTypeFunctions[static_cast(data_type.id())]; + kFillValueDataTypeFunctions[static_cast(type_id)]; if (!functions.decode) { if (allow_missing_dtype) { out = SharedArray(); @@ -381,8 +422,14 @@ absl::Status FillValueJsonBinder::EncodeSingle( return absl::InvalidArgumentError( "data_type must be specified before fill_value"); } + // Special handling for byte_t: use uint8_t functions since they're binary compatible + auto type_id = data_type.id(); + if (type_id == DataTypeId::byte_t) { + type_id = DataTypeId::uint8_t; + } + const auto& functions = - kFillValueDataTypeFunctions[static_cast(data_type.id())]; + kFillValueDataTypeFunctions[static_cast(type_id)]; if (!functions.encode) { return absl::FailedPreconditionError( "fill_value unsupported for specified data_type"); @@ -751,8 +798,19 @@ std::string GetFieldNames(const ZarrDType& dtype) { } } // namespace +constexpr size_t kVoidFieldIndex = size_t(-1); + Result GetFieldIndex(const ZarrDType& dtype, std::string_view selected_field) { + // Special case: "" 
requests raw byte access (works for any dtype) + if (selected_field == "") { + if (dtype.fields.empty()) { + return absl::FailedPreconditionError( + "Requested field \"\" but dtype has no fields"); + } + return kVoidFieldIndex; + } + if (selected_field.empty()) { if (dtype.fields.size() != 1) { return absl::FailedPreconditionError(tensorstore::StrCat( @@ -779,6 +837,9 @@ SpecRankAndFieldInfo GetSpecRankAndFieldInfo(const ZarrMetadata& metadata, SpecRankAndFieldInfo info; info.chunked_rank = metadata.rank; info.field = &metadata.data_type.fields[field_index]; + if (!info.field->field_shape.empty()) { + info.chunked_rank += info.field->field_shape.size(); + } return info; } @@ -798,8 +859,24 @@ Result> GetEffectiveDomain( assert(RankConstraint::EqualOrUnspecified(schema.rank(), rank)); IndexDomainBuilder builder(std::max(schema.rank().rank, rank)); if (metadata_shape) { - builder.shape(*metadata_shape); - builder.implicit_upper_bounds(true); + if (static_cast(metadata_shape->size()) < rank && + info.field && !info.field->field_shape.empty() && + static_cast(metadata_shape->size() + + info.field->field_shape.size()) == rank) { + std::vector full_shape(metadata_shape->begin(), + metadata_shape->end()); + full_shape.insert(full_shape.end(), info.field->field_shape.begin(), + info.field->field_shape.end()); + builder.shape(full_shape); + DimensionSet implicit_upper_bounds(false); + for (size_t i = 0; i < metadata_shape->size(); ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + } else { + builder.shape(*metadata_shape); + builder.implicit_upper_bounds(true); + } } else { builder.origin(GetConstantVector(builder.rank())); } From 44c765ec04e0492cd8ba9aa9f5b43cf97834359b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 18:28:09 +0000 Subject: [PATCH 05/20] Fix failing tests --- tensorstore/driver/zarr3/dtype.cc | 26 ++++++++++++++++++++++---- tensorstore/driver/zarr3/dtype_test.cc | 9 +++++++-- 2 files 
changed, 29 insertions(+), 6 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 116712d70..5b3261812 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -75,6 +75,13 @@ Result ParseBaseDType(std::string_view dtype) { {num_bytes}}; } + // Handle bare "r" - must have a number after it + if (dtype.size() >= 1 && dtype[0] == 'r') { + return absl::InvalidArgumentError(tensorstore::StrCat( + dtype, " data type is invalid; expected r where N is a positive " + "multiple of 8")); + } + constexpr std::string_view kSupported = "bool, uint8, uint16, uint32, uint64, int8, int16, int32, int64, " "bfloat16, float16, float32, float64, complex64, complex128, r"; @@ -372,10 +379,21 @@ Result ChooseBaseDType(DataType dtype) { return MakeBaseDType("complex64", dtype); if (dtype == dtype_v<::tensorstore::dtypes::complex128_t>) return MakeBaseDType("complex128", dtype); - if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) - return MakeBaseDType("r8", dtype); - if (dtype == dtype_v<::tensorstore::dtypes::char_t>) - return MakeBaseDType("r8", dtype); + if (dtype == dtype_v<::tensorstore::dtypes::byte_t>) { + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype; + base_dtype.encoded_dtype = "r8"; + base_dtype.flexible_shape = {1}; + return base_dtype; + } + if (dtype == dtype_v<::tensorstore::dtypes::char_t>) { + // char_t encodes as r8, which parses back to byte_t + ZarrDType::BaseDType base_dtype; + base_dtype.dtype = dtype_v<::tensorstore::dtypes::byte_t>; + base_dtype.encoded_dtype = "r8"; + base_dtype.flexible_shape = {1}; + return base_dtype; + } return absl::InvalidArgumentError( tensorstore::StrCat("Data type not supported: ", dtype)); } diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index e1c5b444c..ef55aba09 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -285,10 +285,15 @@ 
TEST(ChooseBaseDTypeTest, RoundTrip) { SCOPED_TRACE(tensorstore::StrCat("dtype=", dtype)); TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto base_zarr_dtype, ChooseBaseDType(dtype)); - EXPECT_EQ(dtype, base_zarr_dtype.dtype); + // byte_t and char_t both encode as r8, which parses back to byte_t + DataType expected_dtype = dtype; + if (dtype == dtype_v) { + expected_dtype = dtype_v; + } + EXPECT_EQ(expected_dtype, base_zarr_dtype.dtype); TENSORSTORE_ASSERT_OK_AND_ASSIGN( auto parsed, ParseBaseDType(base_zarr_dtype.encoded_dtype)); - EXPECT_EQ(dtype, parsed.dtype); + EXPECT_EQ(expected_dtype, parsed.dtype); EXPECT_EQ(base_zarr_dtype.flexible_shape, parsed.flexible_shape); EXPECT_EQ(base_zarr_dtype.encoded_dtype, parsed.encoded_dtype); } From 547642d819aa5ac878300530e9d049018de27db8 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 25 Nov 2025 20:10:09 +0000 Subject: [PATCH 06/20] Resolve issues with opening struct as void --- examples/read_structured_zarr3.cc | 40 ++++++++++++-- tensorstore/driver/zarr3/driver.cc | 83 ++++++++++++++++++++++++------ 2 files changed, 104 insertions(+), 19 deletions(-) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index 259eade34..bf12ced1b 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -422,12 +422,44 @@ absl::Status Run(const std::string& zarr_path) { auto raw_store = std::move(raw_open_result).value(); TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); + // Test 3: Read from headers array as void (field="") + // Use a fresh context to avoid cache sharing with Test 1 std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "COMPARISON: Both methods should give identical inline field values" << std::endl; + std::cout << "TEST 3: Reading from 'headers' array as void (field=\"\")" << std::endl; std::cout << std::string(60, '=') << std::endl; - std::cout << "The structured 'headers' array provides field access 
convenience,\n" - << "while the raw 'raw_headers' array provides direct byte access.\n" - << "Both extract the inline field from byte offset " << kInlineFieldOffset + + auto context_void = tensorstore::Context::Default(); + + ::nlohmann::json headers_void_spec = ::nlohmann::json::object(); + headers_void_spec["driver"] = "zarr3"; + headers_void_spec["kvstore"] = ::nlohmann::json::object(); + headers_void_spec["kvstore"]["driver"] = "file"; + headers_void_spec["kvstore"]["path"] = headers_path + "/"; + headers_void_spec["field"] = ""; // Special field for raw byte access + + std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; + + auto headers_void_open_result = + tensorstore::Open(headers_void_spec, context_void, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result(); + + if (!headers_void_open_result.ok()) { + std::cout << "\n=== Headers (void) Open Failed ===" << std::endl; + std::cout << "Status: " << headers_void_open_result.status() << std::endl; + return headers_void_open_result.status(); + } + + auto headers_void_store = std::move(headers_void_open_result).value(); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (void)", /*is_raw_bytes=*/true)); + + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "COMPARISON: All three methods should give identical inline field values" << std::endl; + std::cout << std::string(60, '=') << std::endl; + std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" + << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" + << "- Test 3: 'headers' with field=\"\" provides raw byte access to structured data\n" + << "All three extract the inline field from byte offset " << kInlineFieldOffset << " in " << kStructSize << "-byte structs." 
<< std::endl; return absl::OkStatus(); diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index b4d96da1f..bed1171d2 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -315,26 +315,27 @@ class DataCacheBase // Special case: void access - create single component for entire struct if (field_index == kVoidFieldIndex) { - // For void access, use the fill_value from the single raw_bytes field - auto& fill_value = metadata.fill_value[0]; - std::cout << "[DEBUG] Void access fill_value: shape=" << fill_value.shape() - << ", dtype=" << fill_value.dtype() << std::endl; + // For void access, create a zero-filled byte array as the fill value + const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; + auto base_fill_value = AllocateArray( + span({bytes_per_element}), c_order, value_init, + dtype_v); // Broadcast to shape [unbounded, unbounded, ..., struct_size] std::vector target_shape(metadata.rank, kInfIndex); - target_shape.push_back(metadata.data_type.bytes_per_outer_element); - std::cout << "[DEBUG] Void access target_shape: ["; - for (size_t i = 0; i < target_shape.size(); ++i) { - if (i > 0) std::cout << ", "; - std::cout << target_shape[i]; - } - std::cout << "]" << std::endl; + target_shape.push_back(bytes_per_element); auto chunk_fill_value = - BroadcastArray(fill_value, BoxView<>(target_shape)).value(); + BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); // Add extra dimension for struct size in bytes std::vector chunk_shape_with_bytes = metadata.chunk_shape; - chunk_shape_with_bytes.push_back(metadata.data_type.bytes_per_outer_element); + chunk_shape_with_bytes.push_back(bytes_per_element); + + // Create permutation: copy existing inner_order and add the new dimension + std::vector void_permutation(metadata.rank + 1); + std::copy_n(metadata.inner_order.data(), metadata.rank, + void_permutation.begin()); + void_permutation[metadata.rank] = metadata.rank; // 
Add the bytes dimension auto& component = components.emplace_back( internal::AsyncWriteArray::Spec{ @@ -343,7 +344,7 @@ class DataCacheBase // specify unbounded `valid_data_bounds`. Box<>(metadata.rank + 1), ContiguousLayoutPermutation<>( - span(metadata.inner_order.data(), metadata.rank + 1))}, + span(void_permutation.data(), metadata.rank + 1))}, chunk_shape_with_bytes); component.array_spec.fill_value_comparison_kind = EqualityComparisonKind::identical; @@ -570,7 +571,13 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { std::string key_prefix, U&&... arg) : ChunkCacheImpl(std::move(initializer.store), std::forward(arg)...), DataCacheBase(std::move(initializer), std::move(key_prefix)), - grid_(DataCacheBase::GetChunkGridSpecification(metadata())) {} + grid_(DataCacheBase::GetChunkGridSpecification( + metadata(), + // Check if this is void access by examining the dtype + (ChunkCacheImpl::dtype_.fields.size() == 1 && + ChunkCacheImpl::dtype_.fields[0].name == "") + ? 
kVoidFieldIndex + : 0)) {} const internal::LexicographicalGridIndexKeyParser& GetChunkStorageKeyParser() final { @@ -596,6 +603,52 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { return DataCacheBase::executor(); } + // Override to handle void access - check the dtype to see if this is void + Result> GetExternalToInternalTransform( + const void* metadata_ptr, size_t component_index) override { + const auto& metadata = *static_cast(metadata_ptr); + + // Check if this is void access by examining the cache's dtype + const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && + ChunkCacheImpl::dtype_.fields[0].name == ""); + + if (is_void_access) { + // For void access, create transform with extra bytes dimension + const DimensionIndex rank = metadata.rank; + const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; + const DimensionIndex total_rank = rank + 1; + + std::string_view normalized_dimension_names[kMaxRank]; + for (DimensionIndex i = 0; i < rank; ++i) { + if (const auto& name = metadata.dimension_names[i]; name.has_value()) { + normalized_dimension_names[i] = *name; + } + } + + auto builder = + tensorstore::IndexTransformBuilder<>(total_rank, total_rank); + std::vector full_shape = metadata.shape; + full_shape.push_back(bytes_per_element); + builder.input_shape(full_shape); + builder.input_labels(span(&normalized_dimension_names[0], total_rank)); + + DimensionSet implicit_upper_bounds(false); + for (DimensionIndex i = 0; i < rank; ++i) { + implicit_upper_bounds[i] = true; + } + builder.implicit_upper_bounds(implicit_upper_bounds); + + for (DimensionIndex i = 0; i < total_rank; ++i) { + builder.output_single_input_dimension(i, i); + } + return builder.Finalize(); + } + + // Not void access - delegate to base implementation + return DataCacheBase::GetExternalToInternalTransform(metadata_ptr, + component_index); + } + internal::ChunkGridSpecification grid_; }; From 2a4c3d852e0f38b5601dd43482ae878d86a6d7b6 
Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 26 Nov 2025 15:03:55 +0000 Subject: [PATCH 07/20] Remove debug print --- tensorstore/driver/zarr3/metadata.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 6a83cdbec..9aef7bd0b 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -312,8 +312,6 @@ absl::Status FillValueJsonBinder::operator()( auto fill_arr = AllocateArray(dtype.fields[0].field_shape, c_order, default_init, dtype.fields[0].dtype); std::memcpy(fill_arr.data(), b64_decoded.data(), b64_decoded.size()); - std::cout << "[DEBUG] Raw bytes fill_value parsed: shape=" << fill_arr.shape() - << ", dtype=" << dtype.fields[0].dtype << std::endl; (*obj)[0] = std::move(fill_arr); } else { TENSORSTORE_RETURN_IF_ERROR( From b0abb94070f7be7337e7a30b90802ee8617801dd Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 2 Dec 2025 22:01:10 +0000 Subject: [PATCH 08/20] Add field for open as void --- .gitignore | 5 +++++ examples/read_structured_zarr3.cc | 11 ++++++----- tensorstore/driver/zarr3/driver.cc | 31 +++++++++++++++++++----------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/.gitignore b/.gitignore index e4737363c..7c75044c5 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,8 @@ __pycache__ *.pyc /python/tensorstore/*.so /python/tensorstore/*.pyd + +build/ +bootstrap.sh +filt_mig.mdio +generate_test.py \ No newline at end of file diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc index bf12ced1b..720ef1330 100644 --- a/examples/read_structured_zarr3.cc +++ b/examples/read_structured_zarr3.cc @@ -21,6 +21,7 @@ // Both arrays should contain the same data, allowing comparison of: // - Field-based access vs manual byte extraction // - Structured dtype parsing vs raw byte handling +// - New open_as_void option for raw byte access to structured data // // Usage: // bazel run 
//examples:read_structured_zarr3 -- /path/to/parent/dir @@ -422,10 +423,10 @@ absl::Status Run(const std::string& zarr_path) { auto raw_store = std::move(raw_open_result).value(); TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); - // Test 3: Read from headers array as void (field="") + // Test 3: Read from headers array as void (open_as_void=true) // Use a fresh context to avoid cache sharing with Test 1 std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 3: Reading from 'headers' array as void (field=\"\")" << std::endl; + std::cout << "TEST 3: Reading from 'headers' array as void (open_as_void=true)" << std::endl; std::cout << std::string(60, '=') << std::endl; auto context_void = tensorstore::Context::Default(); @@ -435,7 +436,7 @@ absl::Status Run(const std::string& zarr_path) { headers_void_spec["kvstore"] = ::nlohmann::json::object(); headers_void_spec["kvstore"]["driver"] = "file"; headers_void_spec["kvstore"]["path"] = headers_path + "/"; - headers_void_spec["field"] = ""; // Special field for raw byte access + headers_void_spec["open_as_void"] = true; // New option for raw byte access std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; @@ -451,14 +452,14 @@ absl::Status Run(const std::string& zarr_path) { } auto headers_void_store = std::move(headers_void_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (void)", /*is_raw_bytes=*/true)); + TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (open_as_void)", /*is_raw_bytes=*/true)); std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "COMPARISON: All three methods should give identical inline field values" << std::endl; std::cout << std::string(60, '=') << std::endl; std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" - << "- 
Test 3: 'headers' with field=\"\" provides raw byte access to structured data\n" + << "- Test 3: 'headers' with open_as_void=true provides raw byte access to structured data\n" << "All three extract the inline field from byte offset " << kInlineFieldOffset << " in " << kStructSize << "-byte structs." << std::endl; diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index bed1171d2..f4aad10d7 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -107,10 +107,11 @@ class ZarrDriverSpec ZarrMetadataConstraints metadata_constraints; std::string selected_field; + bool open_as_void; constexpr static auto ApplyMembers = [](auto& x, auto f) { return f(internal::BaseCast(x), x.metadata_constraints, - x.selected_field); + x.selected_field, x.open_as_void); }; static inline const auto default_json_binder = jb::Sequence( @@ -145,9 +146,17 @@ class ZarrDriverSpec }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( jb::DefaultInitializedValue()))), - jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( - jb::DefaultValue( - [](auto* obj) { *obj = std::string{}; })))); + jb::Member( + "field", + jb::Projection<&ZarrDriverSpec::selected_field>( + jb::DefaultValue( + [](auto* obj) { *obj = std::string{}; }))), + jb::Member( + "open_as_void", + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; /*selected_field = "";*/ })))); + absl::Status ApplyOptions(SpecOptions&& options) override { if (options.minimal_spec) { @@ -607,43 +616,43 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { Result> GetExternalToInternalTransform( const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); - + // Check if this is void access by examining the cache's dtype const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && ChunkCacheImpl::dtype_.fields[0].name == ""); - + if 
(is_void_access) { // For void access, create transform with extra bytes dimension const DimensionIndex rank = metadata.rank; const Index bytes_per_element = metadata.data_type.bytes_per_outer_element; const DimensionIndex total_rank = rank + 1; - + std::string_view normalized_dimension_names[kMaxRank]; for (DimensionIndex i = 0; i < rank; ++i) { if (const auto& name = metadata.dimension_names[i]; name.has_value()) { normalized_dimension_names[i] = *name; } } - + auto builder = tensorstore::IndexTransformBuilder<>(total_rank, total_rank); std::vector full_shape = metadata.shape; full_shape.push_back(bytes_per_element); builder.input_shape(full_shape); builder.input_labels(span(&normalized_dimension_names[0], total_rank)); - + DimensionSet implicit_upper_bounds(false); for (DimensionIndex i = 0; i < rank; ++i) { implicit_upper_bounds[i] = true; } builder.implicit_upper_bounds(implicit_upper_bounds); - + for (DimensionIndex i = 0; i < total_rank; ++i) { builder.output_single_input_dimension(i, i); } return builder.Finalize(); } - + // Not void access - delegate to base implementation return DataCacheBase::GetExternalToInternalTransform(metadata_ptr, component_index); From fff0a5be9ce8fa1baed0a2db5503b852f3fb5184 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:38:36 +0000 Subject: [PATCH 09/20] Add a shim for new open_as_void flag open option --- tensorstore/driver/zarr3/driver.cc | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f4aad10d7..18c8f3a77 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -140,8 +140,9 @@ class ZarrDriverSpec // at metadata level only. 
} } - TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( - RankConstraint{obj->metadata_constraints.rank})); + TENSORSTORE_RETURN_IF_ERROR( + obj->schema.Set( + RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( @@ -151,11 +152,23 @@ class ZarrDriverSpec jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), + + // NEW: wrap the open_as_void projection in a Validate jb::Member( "open_as_void", - jb::Projection<&ZarrDriverSpec::open_as_void>( - jb::DefaultValue( - [](auto* v) { *v = false; /*selected_field = "";*/ })))); + jb::Validate( + [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { + // At this point, Projection has already set obj->open_as_void + if (obj->open_as_void) { + obj->selected_field = ""; + } + return absl::OkStatus(); + }, + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; }))))); + + absl::Status ApplyOptions(SpecOptions&& options) override { From b6c24f96289a523d14cd6dc9a173f70e10690e15 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:55:02 +0000 Subject: [PATCH 10/20] Revert some formatting changes --- tensorstore/driver/zarr3/driver.cc | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 18c8f3a77..dd95c711b 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -140,22 +140,18 @@ class ZarrDriverSpec // at metadata level only. 
} } - TENSORSTORE_RETURN_IF_ERROR( - obj->schema.Set( - RankConstraint{obj->metadata_constraints.rank})); + TENSORSTORE_RETURN_IF_ERROR(obj->schema.Set( + RankConstraint{obj->metadata_constraints.rank})); return absl::OkStatus(); }, jb::Projection<&ZarrDriverSpec::metadata_constraints>( jb::DefaultInitializedValue()))), - jb::Member( - "field", - jb::Projection<&ZarrDriverSpec::selected_field>( + jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), // NEW: wrap the open_as_void projection in a Validate - jb::Member( - "open_as_void", + jb::Member("open_as_void", jb::Validate( [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { // At this point, Projection has already set obj->open_as_void From 488b1605c1f15f322e4b39f03b02d6cd8b29900b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 15:56:34 +0000 Subject: [PATCH 11/20] revert gitignore changes --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 7c75044c5..e4737363c 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,3 @@ __pycache__ *.pyc /python/tensorstore/*.so /python/tensorstore/*.pyd - -build/ -bootstrap.sh -filt_mig.mdio -generate_test.py \ No newline at end of file From 54941a09cf5e057e9c32d20512c0bb114b6f9b83 Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Wed, 3 Dec 2025 13:06:22 -0600 Subject: [PATCH 12/20] V3 structs remove shim (#2) * Begin removing void field shim * Fully removed void string shim * Cleanup debug prints * Remove shimmed validation * Remove unnecessary comment * Prefer false over zero for ternary clarity --- tensorstore/driver/zarr3/chunk_cache.cc | 16 ++++++---- tensorstore/driver/zarr3/chunk_cache.h | 14 ++++++--- tensorstore/driver/zarr3/driver.cc | 38 +++++++---------------- tensorstore/driver/zarr3/metadata.cc | 14 +++++---- tensorstore/driver/zarr3/metadata.h | 6 ++-- tensorstore/driver/zarr3/metadata_test.cc | 2 +- 6 files 
changed, 45 insertions(+), 45 deletions(-) diff --git a/tensorstore/driver/zarr3/chunk_cache.cc b/tensorstore/driver/zarr3/chunk_cache.cc index 64b6d69fd..f14efd607 100644 --- a/tensorstore/driver/zarr3/chunk_cache.cc +++ b/tensorstore/driver/zarr3/chunk_cache.cc @@ -75,10 +75,12 @@ ZarrChunkCache::~ZarrChunkCache() = default; ZarrLeafChunkCache::ZarrLeafChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/) + ZarrDType dtype, internal::CachePool::WeakPtr /*data_cache_pool*/, + bool open_as_void) : Base(std::move(store)), codec_state_(std::move(codec_state)), - dtype_(std::move(dtype)) {} + dtype_(std::move(dtype)), + open_as_void_(open_as_void) {} void ZarrLeafChunkCache::Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver chunk_indices, absl::InlinedVector, 1> field_arrays(num_fields); // Special case: void access - return raw bytes directly - if (num_fields == 1 && dtype_.fields[0].name == "") { + if (open_as_void_) { TENSORSTORE_ASSIGN_OR_RETURN( field_arrays[0], codec_state_->DecodeArray(grid().components[0].shape(), std::move(data))); @@ -221,11 +223,13 @@ kvstore::Driver* ZarrLeafChunkCache::GetKvStoreDriver() { ZarrShardedChunkCache::ZarrShardedChunkCache( kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, - ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void) : base_kvstore_(std::move(store)), codec_state_(std::move(codec_state)), dtype_(std::move(dtype)), - data_cache_pool_(std::move(data_cache_pool)) {} + data_cache_pool_(std::move(data_cache_pool)), + open_as_void_(open_as_void) {} Result> TranslateCellToSourceTransformForShard( IndexTransform<> transform, span grid_cell_indices, @@ -534,7 +538,7 @@ void ZarrShardedChunkCache::Entry::DoInitialize() { *sharding_state.sub_chunk_codec_chain, std::move(sharding_kvstore), 
cache.executor(), ZarrShardingCodec::PreparedState::Ptr(&sharding_state), - cache.dtype_, cache.data_cache_pool_); + cache.dtype_, cache.data_cache_pool_, cache.open_as_void_); zarr_chunk_cache = new_cache.release(); return std::unique_ptr(&zarr_chunk_cache->cache()); }) diff --git a/tensorstore/driver/zarr3/chunk_cache.h b/tensorstore/driver/zarr3/chunk_cache.h index 5933115d7..a39eb1dc8 100644 --- a/tensorstore/driver/zarr3/chunk_cache.h +++ b/tensorstore/driver/zarr3/chunk_cache.h @@ -158,7 +158,8 @@ class ZarrLeafChunkCache : public internal::KvsBackedChunkCache, explicit ZarrLeafChunkCache(kvstore::DriverPtr store, ZarrCodecChain::PreparedState::Ptr codec_state, ZarrDType dtype, - internal::CachePool::WeakPtr data_cache_pool); + internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void = false); void Read(ZarrChunkCache::ReadRequest request, AnyFlowReceiver( @@ -246,6 +249,7 @@ class ZarrShardedChunkCache : public internal::Cache, public ZarrChunkCache { kvstore::DriverPtr base_kvstore_; ZarrCodecChain::PreparedState::Ptr codec_state_; ZarrDType dtype_; + bool open_as_void_; // Data cache pool, if it differs from `this->pool()` (which is equal to the // metadata cache pool). 
@@ -260,11 +264,13 @@ class ZarrShardSubChunkCache : public ChunkCacheImpl { explicit ZarrShardSubChunkCache( kvstore::DriverPtr store, Executor executor, ZarrShardingCodec::PreparedState::Ptr sharding_state, - ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool) + ZarrDType dtype, internal::CachePool::WeakPtr data_cache_pool, + bool open_as_void = false) : ChunkCacheImpl(std::move(store), ZarrCodecChain::PreparedState::Ptr( sharding_state->sub_chunk_codec_state), - std::move(dtype), std::move(data_cache_pool)), + std::move(dtype), std::move(data_cache_pool), + open_as_void), sharding_state_(std::move(sharding_state)), executor_(std::move(executor)) {} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index dd95c711b..f4c0ad9d7 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -149,20 +149,9 @@ class ZarrDriverSpec jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), - - // NEW: wrap the open_as_void projection in a Validate - jb::Member("open_as_void", - jb::Validate( - [](const auto& options, ZarrDriverSpec* obj) -> absl::Status { - // At this point, Projection has already set obj->open_as_void - if (obj->open_as_void) { - obj->selected_field = ""; - } - return absl::OkStatus(); - }, - jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::Member("open_as_void", jb::Projection<&ZarrDriverSpec::open_as_void>( jb::DefaultValue( - [](auto* v) { *v = false; }))))); + [](auto* v) { *v = false; })))); @@ -592,10 +581,7 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { grid_(DataCacheBase::GetChunkGridSpecification( metadata(), // Check if this is void access by examining the dtype - (ChunkCacheImpl::dtype_.fields.size() == 1 && - ChunkCacheImpl::dtype_.fields[0].name == "") - ? kVoidFieldIndex - : 0)) {} + ChunkCacheImpl::open_as_void_ ? 
kVoidFieldIndex : false)) {} const internal::LexicographicalGridIndexKeyParser& GetChunkStorageKeyParser() final { @@ -626,9 +612,8 @@ class ZarrDataCache : public ChunkCacheImpl, public DataCacheBase { const void* metadata_ptr, size_t component_index) override { const auto& metadata = *static_cast(metadata_ptr); - // Check if this is void access by examining the cache's dtype - const bool is_void_access = (ChunkCacheImpl::dtype_.fields.size() == 1 && - ChunkCacheImpl::dtype_.fields[0].name == ""); + // Check if this is void access by examining the stored flag + const bool is_void_access = ChunkCacheImpl::open_as_void_; if (is_void_access) { // For void access, create transform with extra bytes dimension @@ -802,7 +787,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto metadata, internal_zarr3::GetNewMetadata(spec().metadata_constraints, - spec().schema), + spec().schema, spec().selected_field, spec().open_as_void), tensorstore::MaybeAnnotateStatus( _, "Cannot create using specified \"metadata\" and schema")); return metadata; @@ -819,15 +804,15 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { *static_cast(initializer.metadata.get()); // For void access, modify the dtype to indicate special handling ZarrDType dtype = metadata.data_type; - if (spec().selected_field == "") { + if (spec().open_as_void) { // Create a synthetic dtype for void access dtype = ZarrDType{ /*.has_fields=*/false, /*.fields=*/{ZarrDType::Field{ - ZarrDType::BaseDType{"", dtype_v, + ZarrDType::BaseDType{"", dtype_v, {metadata.data_type.bytes_per_outer_element}}, /*.outer_shape=*/{}, - /*.name=*/"", + /*.name=*/"", /*.field_shape=*/{metadata.data_type.bytes_per_outer_element}, /*.num_inner_elements=*/metadata.data_type.bytes_per_outer_element, /*.byte_offset=*/0, @@ -837,7 +822,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { return internal_zarr3::MakeZarrChunkCache( *metadata.codecs, 
std::move(initializer), spec().store.path, metadata.codec_state, dtype, - /*data_cache_pool=*/*cache_pool()); + /*data_cache_pool=*/*cache_pool(), + spec().open_as_void); } Result GetComponentIndex(const void* metadata_ptr, @@ -847,7 +833,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { ValidateMetadata(metadata, spec().metadata_constraints)); TENSORSTORE_ASSIGN_OR_RETURN( auto field_index, - GetFieldIndex(metadata.data_type, spec().selected_field)); + GetFieldIndex(metadata.data_type, spec().selected_field, spec().open_as_void)); // For void access, map to component index 0 if (field_index == kVoidFieldIndex) { field_index = 0; diff --git a/tensorstore/driver/zarr3/metadata.cc b/tensorstore/driver/zarr3/metadata.cc index 9aef7bd0b..ba4454de4 100644 --- a/tensorstore/driver/zarr3/metadata.cc +++ b/tensorstore/driver/zarr3/metadata.cc @@ -799,12 +799,14 @@ std::string GetFieldNames(const ZarrDType& dtype) { constexpr size_t kVoidFieldIndex = size_t(-1); Result GetFieldIndex(const ZarrDType& dtype, - std::string_view selected_field) { - // Special case: "" requests raw byte access (works for any dtype) - if (selected_field == "") { + std::string_view selected_field, + bool open_as_void) { + // Special case: open_as_void requests raw byte access (works for any dtype) + + if (open_as_void) { if (dtype.fields.empty()) { return absl::FailedPreconditionError( - "Requested field \"\" but dtype has no fields"); + "Requested void access but dtype has no fields"); } return kVoidFieldIndex; } @@ -1138,7 +1140,7 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, Result> GetNewMetadata( const ZarrMetadataConstraints& metadata_constraints, const Schema& schema, - std::string_view selected_field) { + std::string_view selected_field, bool open_as_void) { auto metadata = std::make_shared(); metadata->zarr_format = metadata_constraints.zarr_format.value_or(3); @@ -1165,7 +1167,7 @@ Result> GetNewMetadata( } TENSORSTORE_ASSIGN_OR_RETURN( - size_t 
field_index, GetFieldIndex(metadata->data_type, selected_field)); + size_t field_index, GetFieldIndex(metadata->data_type, selected_field, open_as_void)); SpecRankAndFieldInfo info; info.field = &metadata->data_type.fields[field_index]; info.chunked_rank = metadata_constraints.rank; diff --git a/tensorstore/driver/zarr3/metadata.h b/tensorstore/driver/zarr3/metadata.h index 4c7871b0d..857210546 100644 --- a/tensorstore/driver/zarr3/metadata.h +++ b/tensorstore/driver/zarr3/metadata.h @@ -230,12 +230,14 @@ absl::Status ValidateMetadataSchema(const ZarrMetadata& metadata, /// unspecified. Result> GetNewMetadata( const ZarrMetadataConstraints& metadata_constraints, - const Schema& schema, std::string_view selected_field = {}); + const Schema& schema, std::string_view selected_field = {}, + bool open_as_void = false); absl::Status ValidateDataType(DataType dtype); Result GetFieldIndex(const ZarrDType& dtype, - std::string_view selected_field); + std::string_view selected_field, + bool open_as_void = false); struct SpecRankAndFieldInfo { DimensionIndex chunked_rank = dynamic_rank; diff --git a/tensorstore/driver/zarr3/metadata_test.cc b/tensorstore/driver/zarr3/metadata_test.cc index 11c97619f..ba7a26593 100644 --- a/tensorstore/driver/zarr3/metadata_test.cc +++ b/tensorstore/driver/zarr3/metadata_test.cc @@ -438,7 +438,7 @@ Result> TestGetNewMetadata( TENSORSTORE_RETURN_IF_ERROR(status); TENSORSTORE_ASSIGN_OR_RETURN( auto constraints, ZarrMetadataConstraints::FromJson(constraints_json)); - return GetNewMetadata(constraints, schema); + return GetNewMetadata(constraints, schema, /*selected_field=*/{}, /*open_as_void=*/false); } TEST(GetNewMetadataTest, DuplicateDimensionNames) { From c9f58f9eae12c236c1398619c0c43a298fc58dfc Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 3 Dec 2025 19:38:40 +0000 Subject: [PATCH 13/20] Fix structured fill value population --- tensorstore/driver/zarr3/driver.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git 
a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f4c0ad9d7..51cc17f42 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -675,7 +675,13 @@ class ZarrDriver : public ZarrDriverBase { if (metadata.fill_value.empty()) { return SharedArray(); } - return metadata.fill_value[0]; + // return metadata.fill_value[0]; + // TODO: Doe we actually need to validate this or can we trust that component_index will return a valid index? + size_t index = this->component_index(); + if (index >= metadata.fill_value.size()) { + return absl::OutOfRangeError("Component index out of bounds"); + } + return metadata.fill_value[index]; } Future GetStorageStatistics( From 7655cfd4cf435e90a1b468929c344de1300a0aa1 Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 10:03:47 -0600 Subject: [PATCH 14/20] V3 examples merge (#3) * Implement a more general and portable example set * Fix driver cache bug * Update example for template * Cleanup example * Remove testing examples from source --- examples/CMakeLists.txt | 163 ---------- examples/read_structured_zarr3.cc | 496 ----------------------------- tensorstore/driver/zarr3/driver.cc | 8 +- 3 files changed, 6 insertions(+), 661 deletions(-) delete mode 100644 examples/CMakeLists.txt delete mode 100644 examples/read_structured_zarr3.cc diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index 92e9857fa..000000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,163 +0,0 @@ -# Standalone CMakeLists.txt for read_structured_zarr3 example -# -# Build instructions: -# mkdir -p /home/ubuntu/source/tensorstore/examples/build -# cd /home/ubuntu/source/tensorstore/examples/build -# cmake .. 
-# make -# -# Run: -# ./read_structured_zarr3 --zarr_path=/home/ubuntu/source/tensorstore/filt_mig.mdio/headers - -cmake_minimum_required(VERSION 3.24) -project(read_structured_zarr3 LANGUAGES CXX) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# Path to the tensorstore build directory -set(TENSORSTORE_BUILD_DIR "/home/ubuntu/source/tensorstore/build" CACHE PATH "Path to tensorstore build directory") -set(TENSORSTORE_SOURCE_DIR "/home/ubuntu/source/tensorstore" CACHE PATH "Path to tensorstore source directory") -set(DEPS_DIR "${TENSORSTORE_BUILD_DIR}/_deps") - -# Include paths (matching what tensorstore tests use) -include_directories( - ${TENSORSTORE_SOURCE_DIR} - ${DEPS_DIR}/absl-src - ${DEPS_DIR}/re2-src - ${DEPS_DIR}/riegeli-src -) - -include_directories(SYSTEM - ${DEPS_DIR}/half-build/include - ${DEPS_DIR}/half-src/include - ${DEPS_DIR}/nlohmann_json-build/include - ${DEPS_DIR}/nlohmann_json-src/include - ${TENSORSTORE_BUILD_DIR} -) - -# Compiler flags -add_compile_options( - -fPIE - -Wno-deprecated-declarations - -Wno-sign-compare - -Wno-unused-but-set-parameter - -Wno-maybe-uninitialized - -Wno-sequence-point - -Wno-unknown-warning-option - -Wno-stringop-overflow - -fsized-deallocation -) - -# Find all the static libraries we need from the tensorstore build -file(GLOB TENSORSTORE_LIBS "${TENSORSTORE_BUILD_DIR}/libtensorstore*.a") -file(GLOB_RECURSE ABSEIL_LIBS "${DEPS_DIR}/absl-build/absl/*.a") -file(GLOB_RECURSE RIEGELI_LIBS "${DEPS_DIR}/riegeli-build/*.a") - -# Additional dependency libraries - corrected paths -file(GLOB_RECURSE BLOSC_LIBS "${DEPS_DIR}/blosc-build/*.a") -file(GLOB_RECURSE ZSTD_LIBS "${DEPS_DIR}/zstd-build/*.a") -file(GLOB_RECURSE RE2_LIBS "${DEPS_DIR}/re2-build/*.a") -file(GLOB_RECURSE SNAPPY_LIBS "${DEPS_DIR}/snappy-build/*.a") -file(GLOB_RECURSE BROTLI_LIBS "${DEPS_DIR}/brotli-build/*.a") -file(GLOB_RECURSE LZ4_LIBS "${DEPS_DIR}/lz4-build/*.a") -file(GLOB_RECURSE ZLIB_LIBS "${DEPS_DIR}/zlib-build/*.a") 
-file(GLOB_RECURSE PROTOBUF_LIBS "${DEPS_DIR}/protobuf-build/*.a") -file(GLOB_RECURSE GRPC_LIBS "${DEPS_DIR}/grpc-build/*.a") -file(GLOB_RECURSE CARES_LIBS "${DEPS_DIR}/c-ares-build/*.a") -file(GLOB_RECURSE SSL_LIBS "${DEPS_DIR}/boringssl-build/ssl/*.a") -file(GLOB_RECURSE CRYPTO_LIBS "${DEPS_DIR}/boringssl-build/crypto/*.a") -file(GLOB_RECURSE LIBLZMA_LIBS "${DEPS_DIR}/liblzma-build/*.a") -file(GLOB_RECURSE BZIP2_LIBS "${DEPS_DIR}/bzip2-build/*.a") -file(GLOB_RECURSE JPEG_LIBS "${DEPS_DIR}/jpeg-build/*.a") -file(GLOB_RECURSE PNG_LIBS "${DEPS_DIR}/png-build/*.a") -file(GLOB_RECURSE TIFF_LIBS "${DEPS_DIR}/tiff-build/*.a") -file(GLOB_RECURSE AVIF_LIBS "${DEPS_DIR}/avif-build/*.a") -file(GLOB_RECURSE AOM_LIBS "${DEPS_DIR}/aom-build/*.a") -file(GLOB_RECURSE WEBP_LIBS "${DEPS_DIR}/webp-build/*.a") -file(GLOB_RECURSE CURL_LIBS "${DEPS_DIR}/curl-build/*.a") - -# Create executable -add_executable(read_structured_zarr3 read_structured_zarr3.cc) - -# Link libraries - use whole-archive for libraries that use static registration -# These include drivers, codecs, kvstores, and context resource providers -target_link_libraries(read_structured_zarr3 PRIVATE - # Force inclusion of libraries with static registrations - -Wl,--whole-archive - - # Context resource providers - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_data_copy_concurrency_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_file_io_concurrency_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_cache_cache_pool_resource.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_internal_concurrency_resource.a - - # Zarr3 driver and codecs - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_driver.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_blosc.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_bytes.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_crc32c.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_gzip.a - 
${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_transpose.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_zstd.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_sharding_indexed.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_driver_zarr3_codec_codec_chain_spec.a - - # File kvstore and its resource providers - ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file.a - ${TENSORSTORE_BUILD_DIR}/libtensorstore_kvstore_file_file_resource.a - - -Wl,--no-whole-archive - - -Wl,--start-group - - # Tensorstore libs - ${TENSORSTORE_LIBS} - - # Riegeli - ${RIEGELI_LIBS} - - # Abseil - ${ABSEIL_LIBS} - - # Compression libs - ${BLOSC_LIBS} - ${ZSTD_LIBS} - ${LZ4_LIBS} - ${SNAPPY_LIBS} - ${BROTLI_LIBS} - ${ZLIB_LIBS} - ${LIBLZMA_LIBS} - ${BZIP2_LIBS} - - # Regex - ${RE2_LIBS} - - # Protocol buffers and gRPC - ${PROTOBUF_LIBS} - ${GRPC_LIBS} - ${CARES_LIBS} - - # SSL/TLS - ${SSL_LIBS} - ${CRYPTO_LIBS} - - # Image libraries - ${JPEG_LIBS} - ${PNG_LIBS} - ${TIFF_LIBS} - ${AVIF_LIBS} - ${AOM_LIBS} - ${WEBP_LIBS} - - # HTTP - ${CURL_LIBS} - - -Wl,--end-group - - # System libraries - pthread - dl - m - rt -) diff --git a/examples/read_structured_zarr3.cc b/examples/read_structured_zarr3.cc deleted file mode 100644 index 720ef1330..000000000 --- a/examples/read_structured_zarr3.cc +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright 2024 The TensorStore Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// Standalone test for reading structured data from Zarr v3 arrays. -// -// This test opens two Zarr v3 arrays: -// 1. A structured array with named fields (headers/) -// 2. A raw bytes array containing struct data (raw_headers/) -// -// Both arrays should contain the same data, allowing comparison of: -// - Field-based access vs manual byte extraction -// - Structured dtype parsing vs raw byte handling -// - New open_as_void option for raw byte access to structured data -// -// Usage: -// bazel run //examples:read_structured_zarr3 -- /path/to/parent/dir -// -// Or with cmake: -// cd examples/build && ./read_structured_zarr3 --zarr_path=/path/to/parent/dir -// -// Where the parent dir contains both 'headers/' and 'raw_headers/' subdirs. - -#include - -#include -#include -#include -#include - -#include "absl/flags/flag.h" -#include "absl/flags/parse.h" -#include "absl/status/status.h" -#include -#include "tensorstore/array.h" -#include "tensorstore/context.h" -#include "tensorstore/data_type.h" -#include "tensorstore/index.h" -#include "tensorstore/open.h" -#include "tensorstore/open_mode.h" -#include "tensorstore/spec.h" -#include "tensorstore/tensorstore.h" -#include "tensorstore/util/result.h" -#include "tensorstore/util/status.h" - -// Internal headers for testing dtype parsing -#include "tensorstore/driver/zarr3/dtype.h" - -// Additional headers for string operations -#include "absl/strings/str_join.h" - -ABSL_FLAG(std::string, zarr_path, - "/home/ubuntu/source/tensorstore/filt_mig.mdio", - "Path to the parent .mdio directory containing headers/ and raw_headers/"); - -namespace { - -using ::tensorstore::Index; - -// Field layout from the zarr.json metadata: -// The structured dtype has the following fields with their byte offsets: -// trace_seq_num_line: int32 @ 0 -// trace_seq_num_reel: int32 @ 4 -// ... (many more fields) ... 
-// inline: int32 @ 180 -// crossline: int32 @ 184 -// cdp_x: int32 @ 188 -// cdp_y: int32 @ 192 -// -// Total struct size: 196 bytes (matches blosc typesize) - -constexpr size_t kInlineFieldOffset = 180; -constexpr size_t kStructSize = 196; - -// Read and parse the zarr.json metadata to display info about structured type -void PrintZarrMetadata(const std::string& zarr_path) { - std::string metadata_path = zarr_path + "/zarr.json"; - std::ifstream file(metadata_path); - if (!file.is_open()) { - std::cerr << "Could not open " << metadata_path << std::endl; - return; - } - - nlohmann::json metadata; - try { - file >> metadata; - } catch (const nlohmann::json::parse_error& e) { - std::cerr << "Failed to parse zarr.json: " << e.what() << std::endl; - return; - } - - std::cout << "\n=== Zarr Metadata ===" << std::endl; - std::cout << "Shape: " << metadata["shape"].dump() << std::endl; - std::cout << "Dimension names: " << metadata["dimension_names"].dump() - << std::endl; - - if (metadata.contains("data_type")) { - auto& dt = metadata["data_type"]; - std::cout << "\nData type format:" << std::endl; - if (dt.is_object()) { - std::cout << " Type: object with name=\"" << dt["name"].get() - << "\"" << std::endl; - if (dt.contains("configuration") && - dt["configuration"].contains("fields")) { - auto& fields = dt["configuration"]["fields"]; - std::cout << " Number of fields: " << fields.size() << std::endl; - std::cout << " Fields:" << std::endl; - size_t byte_offset = 0; - for (const auto& field : fields) { - std::string name = field[0].get(); - std::string type = field[1].get(); - size_t size = (type == "int32" || type == "uint32" || type == "float32") - ? 
4 - : 2; // int16/uint16 - std::cout << " " << name << ": " << type << " @ byte " << byte_offset - << std::endl; - byte_offset += size; - } - std::cout << " Total struct size: " << byte_offset << " bytes" - << std::endl; - } - } else if (dt.is_string()) { - std::cout << " Type: simple \"" << dt.get() << "\"" - << std::endl; - } else if (dt.is_array()) { - std::cout << " Type: array with " << dt.size() << " fields" << std::endl; - } - } - - if (metadata.contains("codecs")) { - std::cout << "\nCodecs: " << metadata["codecs"].dump(2) << std::endl; - } -} - -// Helper function to read and display inline field from an array -absl::Status ReadInlineField(const tensorstore::TensorStore<>& store, - const std::string& array_name, - bool is_raw_bytes = false) { - // Get information about the array - auto domain = store.domain(); - std::cout << "\n=== " << array_name << " Array Info ===" << std::endl; - std::cout << "Domain: " << domain << std::endl; - std::cout << "Dtype: " << store.dtype() << std::endl; - std::cout << "Rank: " << store.rank() << std::endl; - - auto shape = domain.shape(); - std::cout << "Shape: ["; - for (int i = 0; i < shape.size(); ++i) { - if (i > 0) std::cout << ", "; - std::cout << shape[i]; - } - std::cout << "]" << std::endl; - - // Read all data - std::cout << "\n=== Reading " << array_name << " Data ===" << std::endl; - TENSORSTORE_ASSIGN_OR_RETURN( - auto array, tensorstore::Read(store).result()); - - std::cout << "Read complete. 
Array size: " << array.num_elements() - << " elements" << std::endl; - std::cout << "Data type: " << array.dtype() << std::endl; - - Index num_inline, num_crossline; - const int32_t* int_ptr; - - if (is_raw_bytes) { - // For raw bytes, we need to extract the inline field manually - // Shape is [inline, crossline, struct_size] - num_inline = shape[0]; - num_crossline = shape[1]; - Index struct_size = shape[2]; - if (struct_size != kStructSize) { - std::cout << "Warning: Raw struct size (" << struct_size - << ") differs from expected header struct size (" << kStructSize - << "). Assuming padding." << std::endl; - } - - // Extract inline field (4 bytes starting at offset 180) - auto byte_ptr = reinterpret_cast(array.data()); - std::vector inline_values(num_inline * num_crossline); - - for (Index i = 0; i < num_inline; ++i) { - for (Index j = 0; j < num_crossline; ++j) { - Index struct_offset = (i * num_crossline + j) * struct_size; - Index field_offset = struct_offset + kInlineFieldOffset; - std::memcpy(&inline_values[i * num_crossline + j], - byte_ptr + field_offset, 4); - } - } - - std::cout << "Extracted inline field from raw bytes at offset " - << kInlineFieldOffset << std::endl; - int_ptr = inline_values.data(); - } else { - // For structured array, field access already gave us int32 values - num_inline = shape[0]; - num_crossline = shape[1]; - int_ptr = reinterpret_cast(array.data()); - } - - std::cout << "\n=== Inline field values from " << array_name - << " (shape: " << num_inline << " x " << num_crossline << ") ===" << std::endl; - - // Print first 10 rows (or fewer if less data) - Index rows_to_print = std::min(num_inline, Index{10}); - Index cols_to_print = std::min(num_crossline, Index{10}); - - for (Index i = 0; i < rows_to_print; ++i) { - for (Index j = 0; j < cols_to_print; ++j) { - std::cout << int_ptr[i * num_crossline + j]; - if (j < cols_to_print - 1) { - std::cout << "\t"; - } - } - if (num_crossline > cols_to_print) { - std::cout << "\t..."; - } - 
std::cout << std::endl; - } - if (num_inline > rows_to_print) { - std::cout << "... (" << (num_inline - rows_to_print) << " more rows)" - << std::endl; - } - - std::cout << "\n=== " << array_name << " Summary ===" << std::endl; - std::cout << "Successfully read " << (num_inline * num_crossline) - << " inline values" << std::endl; - - // Show some statistics - int32_t min_val = int_ptr[0], max_val = int_ptr[0]; - int64_t sum = 0; - for (Index i = 0; i < num_inline * num_crossline; ++i) { - min_val = std::min(min_val, int_ptr[i]); - max_val = std::max(max_val, int_ptr[i]); - sum += int_ptr[i]; - } - std::cout << "Min value: " << min_val << std::endl; - std::cout << "Max value: " << max_val << std::endl; - std::cout << "Mean value: " << (static_cast(sum) / (num_inline * num_crossline)) << std::endl; - - return absl::OkStatus(); -} - -absl::Status Run(const std::string& zarr_path) { - std::cout << "=== Zarr v3 Structured Data Type Test ===" << std::endl; - std::cout << "Opening zarr3 arrays in: " << zarr_path << std::endl; - - auto context = tensorstore::Context::Default(); - - // First, display metadata information for structured array - std::string headers_path = zarr_path + "/headers"; - PrintZarrMetadata(headers_path); - - // Test raw_bytes parsing by reading and parsing the raw_headers zarr.json - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TESTING RAW_BYTES PARSING" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::string raw_metadata_path = zarr_path + "/raw_headers/zarr.json"; - std::ifstream raw_file(raw_metadata_path); - if (!raw_file.is_open()) { - std::cout << "Could not open " << raw_metadata_path << std::endl; - return absl::NotFoundError("Raw headers metadata not found"); - } - - nlohmann::json raw_metadata; - try { - raw_file >> raw_metadata; - } catch (const nlohmann::json::parse_error& e) { - std::cout << "Failed to parse raw zarr.json: " << e.what() << std::endl; - return absl::DataLossError("Invalid 
raw metadata JSON"); - } - - std::cout << "Raw headers data_type: " << raw_metadata["data_type"].dump(2) << std::endl; - - // Test parsing the raw_bytes data type - std::cout << "Testing raw_bytes dtype parsing..." << std::endl; - - // For now, just verify the JSON structure is what we expect - if (!raw_metadata.contains("data_type")) { - std::cout << "FAILED: No data_type in metadata" << std::endl; - return absl::NotFoundError("Missing data_type"); - } - - auto& dt = raw_metadata["data_type"]; - if (!dt.is_object() || !dt.contains("name") || dt["name"] != "raw_bytes") { - std::cout << "FAILED: data_type is not raw_bytes extension" << std::endl; - return absl::InvalidArgumentError("Not raw_bytes extension"); - } - - if (!dt.contains("configuration") || !dt["configuration"].contains("length_bytes")) { - std::cout << "FAILED: Missing length_bytes in configuration" << std::endl; - return absl::InvalidArgumentError("Missing length_bytes"); - } - - int length_bytes = dt["configuration"]["length_bytes"]; - std::cout << "SUCCESS: Found raw_bytes extension with length_bytes = " << length_bytes << std::endl; - std::cout << "This should parse to:" << std::endl; - std::cout << " - Single field with byte_t dtype" << std::endl; - std::cout << " - Field shape: [" << length_bytes << "]" << std::endl; - std::cout << " - Bytes per outer element: " << length_bytes << std::endl; - - // Now actually test the parsing implementation - std::cout << "\n=== Testing ParseDType Implementation ===" << std::endl; - auto dtype_result = tensorstore::internal_zarr3::ParseDType(dt); - if (!dtype_result.ok()) { - std::cout << "FAILED: Could not parse raw_bytes data type: " << dtype_result.status() << std::endl; - return dtype_result.status(); - } - - auto dtype = std::move(dtype_result).value(); - std::cout << "SUCCESS: ParseDType worked!" 
<< std::endl; - std::cout << " Fields: " << dtype.fields.size() << std::endl; - std::cout << " Has fields: " << dtype.has_fields << std::endl; - std::cout << " Bytes per outer element: " << dtype.bytes_per_outer_element << std::endl; - - if (!dtype.fields.empty()) { - const auto& field = dtype.fields[0]; - std::cout << " Field name: '" << field.name << "'" << std::endl; - std::cout << " Field dtype: " << field.dtype << std::endl; - std::cout << " Field shape: [" << absl::StrJoin(field.field_shape, ", ") << "]" << std::endl; - std::cout << " Field num_inner_elements: " << field.num_inner_elements << std::endl; - std::cout << " Field num_bytes: " << field.num_bytes << std::endl; - } - - // Verify the parsing is correct - bool parsing_correct = true; - if (dtype.fields.size() != 1) { - std::cout << "ERROR: Expected 1 field, got " << dtype.fields.size() << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].name != "") { - std::cout << "ERROR: Expected empty field name, got '" << dtype.fields[0].name << "'" << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].dtype != tensorstore::dtype_v) { - std::cout << "ERROR: Expected byte_t dtype, got " << dtype.fields[0].dtype << std::endl; - parsing_correct = false; - } - if (dtype.fields[0].field_shape != std::vector{length_bytes}) { - std::cout << "ERROR: Expected field shape [" << length_bytes << "], got [" - << absl::StrJoin(dtype.fields[0].field_shape, ", ") << "]" << std::endl; - parsing_correct = false; - } - if (dtype.bytes_per_outer_element != length_bytes) { - std::cout << "ERROR: Expected " << length_bytes << " bytes per element, got " - << dtype.bytes_per_outer_element << std::endl; - parsing_correct = false; - } - - if (parsing_correct) { - std::cout << "\n✅ PARSING VERIFICATION: All checks passed!" << std::endl; - std::cout << "The raw_bytes extension is correctly parsed." << std::endl; - } else { - std::cout << "\n❌ PARSING VERIFICATION: Some checks failed!" 
<< std::endl; - return absl::InternalError("Parsing verification failed"); - } - - // Test 1: Read from structured array using field access - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 1: Reading from structured 'headers' array" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - ::nlohmann::json headers_spec = ::nlohmann::json::object(); - headers_spec["driver"] = "zarr3"; - headers_spec["kvstore"] = ::nlohmann::json::object(); - headers_spec["kvstore"]["driver"] = "file"; - headers_spec["kvstore"]["path"] = headers_path + "/"; - headers_spec["field"] = "inline"; // Extract inline field (int32 at byte offset 180) - - std::cout << "Spec: " << headers_spec.dump(2) << std::endl; - - auto headers_open_result = - tensorstore::Open(headers_spec, context, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!headers_open_result.ok()) { - std::cout << "\n=== Headers Open Failed ===" << std::endl; - std::cout << "Status: " << headers_open_result.status() << std::endl; - return headers_open_result.status(); - } - - auto headers_store = std::move(headers_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_store, "headers")); - - // Test 2: Read from raw bytes array (no special void access needed) - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 2: Reading from raw 'raw_headers' array" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - std::string raw_headers_path = zarr_path + "/raw_headers"; - ::nlohmann::json raw_spec = ::nlohmann::json::object(); - raw_spec["driver"] = "zarr3"; - raw_spec["kvstore"] = ::nlohmann::json::object(); - raw_spec["kvstore"]["driver"] = "file"; - raw_spec["kvstore"]["path"] = raw_headers_path + "/"; - // No field specified - raw_bytes has a single anonymous field - - std::cout << "Spec: " << raw_spec.dump(2) << std::endl; - - auto raw_open_result = - tensorstore::Open(raw_spec, context, 
tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!raw_open_result.ok()) { - std::cout << "\n=== Raw Headers Open Failed ===" << std::endl; - std::cout << "Status: " << raw_open_result.status() << std::endl; - return raw_open_result.status(); - } - - auto raw_store = std::move(raw_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(raw_store, "raw_headers", /*is_raw_bytes=*/true)); - - // Test 3: Read from headers array as void (open_as_void=true) - // Use a fresh context to avoid cache sharing with Test 1 - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "TEST 3: Reading from 'headers' array as void (open_as_void=true)" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - auto context_void = tensorstore::Context::Default(); - - ::nlohmann::json headers_void_spec = ::nlohmann::json::object(); - headers_void_spec["driver"] = "zarr3"; - headers_void_spec["kvstore"] = ::nlohmann::json::object(); - headers_void_spec["kvstore"]["driver"] = "file"; - headers_void_spec["kvstore"]["path"] = headers_path + "/"; - headers_void_spec["open_as_void"] = true; // New option for raw byte access - - std::cout << "Spec: " << headers_void_spec.dump(2) << std::endl; - - auto headers_void_open_result = - tensorstore::Open(headers_void_spec, context_void, tensorstore::OpenMode::open, - tensorstore::ReadWriteMode::read) - .result(); - - if (!headers_void_open_result.ok()) { - std::cout << "\n=== Headers (void) Open Failed ===" << std::endl; - std::cout << "Status: " << headers_void_open_result.status() << std::endl; - return headers_void_open_result.status(); - } - - auto headers_void_store = std::move(headers_void_open_result).value(); - TENSORSTORE_RETURN_IF_ERROR(ReadInlineField(headers_void_store, "headers (open_as_void)", /*is_raw_bytes=*/true)); - - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "COMPARISON: All three methods should give identical inline field values" 
<< std::endl; - std::cout << std::string(60, '=') << std::endl; - std::cout << "- Test 1: 'headers' with field=\"inline\" provides field access convenience\n" - << "- Test 2: 'raw_headers' (raw_bytes type) provides direct byte access\n" - << "- Test 3: 'headers' with open_as_void=true provides raw byte access to structured data\n" - << "All three extract the inline field from byte offset " << kInlineFieldOffset - << " in " << kStructSize << "-byte structs." << std::endl; - - return absl::OkStatus(); -} - -} // namespace - -int main(int argc, char** argv) { - absl::ParseCommandLine(argc, argv); - - std::string zarr_path = absl::GetFlag(FLAGS_zarr_path); - if (zarr_path.empty()) { - std::cerr << "Error: --zarr_path is required" << std::endl; - return 1; - } - - // Verify the path structure - std::string headers_path = zarr_path + "/headers"; - std::string raw_headers_path = zarr_path + "/raw_headers"; - - std::cout << "Expecting arrays at:" << std::endl; - std::cout << " Structured: " << headers_path << std::endl; - std::cout << " Raw bytes: " << raw_headers_path << std::endl; - std::cout << std::endl; - - auto status = Run(zarr_path); - if (!status.ok()) { - std::cerr << "\nFinal status: " << status << std::endl; - return 1; - } - - return 0; -} diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index 51cc17f42..ec30edd82 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -779,12 +779,16 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { std::string GetDataCacheKey(const void* metadata) override { std::string result; + const auto& zarr_metadata = *static_cast(metadata); internal::EncodeCacheKey( - &result, spec().store.path, - static_cast(metadata)->GetCompatibilityKey()); + &result, + spec().store.path, + zarr_metadata.GetCompatibilityKey(), + spec().open_as_void ? 
"void" : "normal"); return result; } + Result> Create(const void* existing_metadata, CreateOptions options) override { if (existing_metadata) { From 8c4c4cafe2b33df06131d985c2574c973f817b3d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 16:07:26 +0000 Subject: [PATCH 15/20] Remove vestigial example build --- examples/BUILD | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/examples/BUILD b/examples/BUILD index 4dcb2d604..94acdba14 100644 --- a/examples/BUILD +++ b/examples/BUILD @@ -122,26 +122,3 @@ tensorstore_cc_binary( "@riegeli//riegeli/bytes:writer", ], ) - -tensorstore_cc_binary( - name = "read_structured_zarr3", - srcs = ["read_structured_zarr3.cc"], - deps = [ - "//tensorstore", - "//tensorstore:array", - "//tensorstore:context", - "//tensorstore:data_type", - "//tensorstore:index", - "//tensorstore:open", - "//tensorstore:open_mode", - "//tensorstore:spec", - "//tensorstore/driver/zarr3", - "//tensorstore/kvstore/file", - "//tensorstore/util:result", - "//tensorstore/util:status", - "@abseil-cpp//absl/flags:flag", - "@abseil-cpp//absl/flags:parse", - "@abseil-cpp//absl/status", - "@nlohmann_json//:json", - ], -) From 4b590f855adc963fe20940bd704693d81190483a Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 11:11:14 -0600 Subject: [PATCH 16/20] V3 structs fix fills (#4) * Use the appropriate fill value for open_as_void structured data * Cleanup --- tensorstore/driver/zarr3/driver.cc | 70 ++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 4 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index ec30edd82..f86e4ad88 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -171,12 +171,74 @@ class ZarrDriverSpec IndexTransformView<> transform) const override { SharedArray fill_value{schema.fill_value()}; - const auto& metadata = metadata_constraints; - if (metadata.fill_value && 
!metadata.fill_value->empty()) { - fill_value = (*metadata.fill_value)[0]; + const auto& constraints = metadata_constraints; + + // If constraints don't specify a fill value, just use the schema's. + if (!constraints.fill_value || constraints.fill_value->empty()) { + return fill_value; + } + + const auto& vec = *constraints.fill_value; + + // If we don't have dtype information, we can't do field-aware logic. + if (!constraints.data_type) { + if (!vec.empty()) return vec[0]; + return fill_value; + } + + const ZarrDType& dtype = *constraints.data_type; + + // Determine which field this spec refers to (or void access). + TENSORSTORE_ASSIGN_OR_RETURN( + size_t field_index, + GetFieldIndex(dtype, selected_field, open_as_void)); + + // ── Normal field access: just return that field's fill_value ─────────────── + if (field_index != kVoidFieldIndex) { + if (field_index < vec.size()) { + return vec[field_index]; + } + // Fallback to "no fill". + return SharedArray(); + } + + // ── Void access: synthesize a byte-level fill value ──────────────────────── + // + // We want a 1D byte array of length bytes_per_outer_element whose contents + // are exactly the Zarr-defined struct layout built from per-field fills. + + // Special case: "raw bytes" field (single byte_t field with flexible shape). + // In that case the existing fill array already has the correct bytes. + if (dtype.fields.size() == 1 && + dtype.fields[0].dtype.id() == DataTypeId::byte_t && + !dtype.fields[0].flexible_shape.empty()) { + // vec[0] should be a byte array of size bytes_per_outer_element. + return vec[0]; + } + + const Index nbytes = dtype.bytes_per_outer_element; + + auto byte_arr = AllocateArray( + span({nbytes}), c_order, default_init, + dtype_v); + auto* dst = static_cast(byte_arr.data()); + std::memset(dst, 0, static_cast(nbytes)); + + // Pack each field's scalar fill into its byte_offset region. 
+ for (size_t i = 0; i < dtype.fields.size() && i < vec.size(); ++i) { + const auto& field = dtype.fields[i]; + const auto& field_fill = vec[i]; + if (!field_fill.valid()) continue; + + // We assume a single outer element per field here (which is exactly how + // FillValueJsonBinder constructs per-field fill values). + std::memcpy( + dst + field.byte_offset, + static_cast(field_fill.data()), + static_cast(field.num_bytes)); } - return fill_value; + return byte_arr; } Result GetDimensionUnits() const override { From c0082a0f09c4537bed65aaaf17939f8825204985 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 17:22:51 +0000 Subject: [PATCH 17/20] Add new options to schema --- tensorstore/driver/zarr3/schema.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/tensorstore/driver/zarr3/schema.yml b/tensorstore/driver/zarr3/schema.yml index 4f9733415..9491027b1 100644 --- a/tensorstore/driver/zarr3/schema.yml +++ b/tensorstore/driver/zarr3/schema.yml @@ -17,6 +17,31 @@ allOf: automatically. When creating a new array, the new metadata is obtained by combining these metadata constraints with any `Schema` constraints. $ref: driver/zarr3/Metadata + field: + type: string + title: Field selection for structured arrays. + description: | + Name of the field to select from a structured array. When specified, + the tensorstore will provide access to only the specified field of + each element in the structured array. + open_as_void: + type: boolean + default: false + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. 
+ # NOTE(review): the previous "oneOf" here was unsatisfiable — both of + # its branches reduced to "neither key present", so a spec with + # neither key matched two branches and a spec with either key matched + # none. "field" and "open_as_void" are mutually exclusive ("field" + # selects one struct member, "open_as_void" exposes the raw bytes of + # the whole struct), so forbid only the combination of both keys. + not: + allOf: + - required: ["field"] + - required: ["open_as_void"] examples: - driver: zarr3 kvstore: From 9a46c82968f1e70e1cb14e3b827dcf627b80463 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 4 Dec 2025 17:31:17 +0000 Subject: [PATCH 18/20] Fix copyright header date --- tensorstore/driver/zarr3/dtype.cc | 2 +- tensorstore/driver/zarr3/dtype.h | 2 +- tensorstore/driver/zarr3/dtype_test.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorstore/driver/zarr3/dtype.cc b/tensorstore/driver/zarr3/dtype.cc index 5b3261812..b8aacaa68 100644 --- a/tensorstore/driver/zarr3/dtype.cc +++ b/tensorstore/driver/zarr3/dtype.cc @@ -1,4 +1,4 @@ -// Copyright 2020 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tensorstore/driver/zarr3/dtype.h b/tensorstore/driver/zarr3/dtype.h index 430dd8849..73a6b0961 100644 --- a/tensorstore/driver/zarr3/dtype.h +++ b/tensorstore/driver/zarr3/dtype.h @@ -1,4 +1,4 @@ -// Copyright 2020 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index ef55aba09..709178bc3 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -1,4 +1,4 @@ -// Copyright 2023 The TensorStore Authors +// Copyright 2025 The TensorStore Authors // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
From b9b5e41db3266155aa47323249f18687a1e2e45b Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 12:52:30 -0600 Subject: [PATCH 19/20] Cleanup (#5) --- tensorstore/driver/zarr3/driver.cc | 2 -- tensorstore/driver/zarr3/dtype_test.cc | 1 - 2 files changed, 3 deletions(-) diff --git a/tensorstore/driver/zarr3/driver.cc b/tensorstore/driver/zarr3/driver.cc index f86e4ad88..f65533197 100644 --- a/tensorstore/driver/zarr3/driver.cc +++ b/tensorstore/driver/zarr3/driver.cc @@ -737,8 +737,6 @@ class ZarrDriver : public ZarrDriverBase { if (metadata.fill_value.empty()) { return SharedArray(); } - // return metadata.fill_value[0]; - // TODO: Doe we actually need to validate this or can we trust that component_index will return a valid index? size_t index = this->component_index(); if (index >= metadata.fill_value.size()) { return absl::OutOfRangeError("Component index out of bounds"); diff --git a/tensorstore/driver/zarr3/dtype_test.cc b/tensorstore/driver/zarr3/dtype_test.cc index 709178bc3..a41830069 100644 --- a/tensorstore/driver/zarr3/dtype_test.cc +++ b/tensorstore/driver/zarr3/dtype_test.cc @@ -17,7 +17,6 @@ #include #include -#include // for std::byte #include #include From 4e12b633eb5624f209f7bd3d4e441fb468ef677c Mon Sep 17 00:00:00 2001 From: Brian Michell Date: Thu, 4 Dec 2025 14:46:38 -0600 Subject: [PATCH 20/20] Add open_as_void option to zarr v2 driver (#6) --- tensorstore/driver/zarr/driver.cc | 211 ++++++++++++++-- tensorstore/driver/zarr/driver_impl.h | 8 +- tensorstore/driver/zarr/driver_test.cc | 322 +++++++++++++++++++++++++ tensorstore/driver/zarr/schema.yml | 8 + tensorstore/driver/zarr/spec.cc | 22 +- tensorstore/driver/zarr/spec.h | 13 +- 6 files changed, 561 insertions(+), 23 deletions(-) diff --git a/tensorstore/driver/zarr/driver.cc b/tensorstore/driver/zarr/driver.cc index 69164648e..8a0943ae5 100644 --- a/tensorstore/driver/zarr/driver.cc +++ b/tensorstore/driver/zarr/driver.cc @@ -29,6 +29,10 @@ #include "absl/status/status.h" 
#include "absl/strings/cord.h" #include +#include "riegeli/bytes/cord_reader.h" +#include "riegeli/bytes/cord_writer.h" +#include "riegeli/bytes/read_all.h" +#include "riegeli/bytes/write.h" #include "tensorstore/array.h" #include "tensorstore/array_storage_statistics.h" #include "tensorstore/box.h" @@ -137,6 +141,20 @@ absl::Status ZarrDriverSpec::ApplyOptions(SpecOptions&& options) { } Result ZarrDriverSpec::GetSpecInfo() const { + // For open_as_void, we don't use normal field resolution + // Note: When opening an existing array, dtype may not be known yet, + // so we can't determine the exact rank until metadata is loaded. + if (open_as_void && partial_metadata.dtype) { + SpecRankAndFieldInfo info; + info.full_rank = schema.rank(); + info.chunked_rank = partial_metadata.rank; + // For void access, add one dimension for the bytes + info.field_rank = 1; // The bytes dimension + if (info.chunked_rank != dynamic_rank) { + info.full_rank = info.chunked_rank + 1; + } + return info; + } return GetSpecRankAndFieldInfo(partial_metadata, selected_field, schema); } @@ -171,6 +189,10 @@ TENSORSTORE_DEFINE_JSON_DEFAULT_BINDER( jb::Member("field", jb::Projection<&ZarrDriverSpec::selected_field>( jb::DefaultValue( [](auto* obj) { *obj = std::string{}; }))), + jb::Member("open_as_void", + jb::Projection<&ZarrDriverSpec::open_as_void>( + jb::DefaultValue( + [](auto* v) { *v = false; }))), jb::Initialize([](auto* obj) { TENSORSTORE_ASSIGN_OR_RETURN(auto info, obj->GetSpecInfo()); if (info.full_rank != dynamic_rank) { @@ -210,8 +232,19 @@ Result> ZarrDriverSpec::GetFillValue( const auto& metadata = partial_metadata; if (metadata.dtype && metadata.fill_value) { TENSORSTORE_ASSIGN_OR_RETURN( - size_t field_index, GetFieldIndex(*metadata.dtype, selected_field)); - fill_value = (*metadata.fill_value)[field_index]; + size_t field_index, + GetFieldIndex(*metadata.dtype, selected_field, open_as_void)); + + // For void access, synthesize a byte-level fill value + if (field_index == 
kVoidFieldIndex) { + const Index nbytes = metadata.dtype->bytes_per_outer_element; + auto byte_arr = AllocateArray( + span({nbytes}), c_order, value_init, + dtype_v); + fill_value = byte_arr; + } else { + fill_value = (*metadata.fill_value)[field_index]; + } } if (!fill_value.valid() || !transform.valid()) { @@ -238,13 +271,15 @@ Result> ZarrDriverSpec::GetFillValue( DataCache::DataCache(Initializer&& initializer, std::string key_prefix, DimensionSeparator dimension_separator, - std::string metadata_key) + std::string metadata_key, bool open_as_void) : Base(std::move(initializer), GetChunkGridSpecification( - *static_cast(initializer.metadata.get()))), + *static_cast(initializer.metadata.get()), + open_as_void)), key_prefix_(std::move(key_prefix)), dimension_separator_(dimension_separator), - metadata_key_(std::move(metadata_key)) {} + metadata_key_(std::move(metadata_key)), + open_as_void_(open_as_void) {} absl::Status DataCache::ValidateMetadataCompatibility( const void* existing_metadata_ptr, const void* new_metadata_ptr) { @@ -268,12 +303,40 @@ void DataCache::GetChunkGridBounds(const void* metadata_ptr, DimensionSet& implicit_lower_bounds, DimensionSet& implicit_upper_bounds) { const auto& metadata = *static_cast(metadata_ptr); - assert(bounds.rank() == static_cast(metadata.shape.size())); - std::fill(bounds.origin().begin(), bounds.origin().end(), Index(0)); + // Use >= assertion like zarr3 to allow for extra dimensions + assert(bounds.rank() >= static_cast(metadata.shape.size())); + std::fill(bounds.origin().begin(), + bounds.origin().begin() + metadata.shape.size(), Index(0)); std::copy(metadata.shape.begin(), metadata.shape.end(), bounds.shape().begin()); implicit_lower_bounds = false; - implicit_upper_bounds = true; + implicit_upper_bounds = false; + for (DimensionIndex i = 0; + i < static_cast(metadata.shape.size()); ++i) { + implicit_upper_bounds[i] = true; + } + // Handle extra dimensions for void access or field shapes + if (bounds.rank() > 
static_cast(metadata.shape.size())) { + if (open_as_void_) { + // For void access, the extra dimension is the bytes_per_outer_element + if (static_cast(metadata.shape.size() + 1) == + bounds.rank()) { + bounds.shape()[metadata.rank] = metadata.dtype.bytes_per_outer_element; + bounds.origin()[metadata.rank] = 0; + } + } else if (metadata.dtype.fields.size() == 1) { + // Handle single field with field_shape (like zarr3) + const auto& field = metadata.dtype.fields[0]; + if (static_cast(metadata.shape.size() + + field.field_shape.size()) == + bounds.rank()) { + for (size_t i = 0; i < field.field_shape.size(); ++i) { + bounds.shape()[metadata.shape.size() + i] = field.field_shape[i]; + bounds.origin()[metadata.shape.size() + i] = 0; + } + } + } + } } Result> DataCache::GetResizedMetadata( @@ -294,13 +357,61 @@ Result> DataCache::GetResizedMetadata( } internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( - const ZarrMetadata& metadata) { + const ZarrMetadata& metadata, bool open_as_void) { internal::ChunkGridSpecification::ComponentList components; - components.reserve(metadata.dtype.fields.size()); std::vector chunked_to_cell_dimensions( metadata.chunks.size()); std::iota(chunked_to_cell_dimensions.begin(), chunked_to_cell_dimensions.end(), static_cast(0)); + + // Special case: void access - create single component for raw bytes + if (open_as_void) { + const Index bytes_per_element = metadata.dtype.bytes_per_outer_element; + + // Create a zero-filled byte array as the fill value + auto base_fill_value = AllocateArray( + span({bytes_per_element}), c_order, value_init, + dtype_v); + + // The full chunk shape includes the extra bytes dimension + std::vector chunk_shape_with_bytes = metadata.chunks; + chunk_shape_with_bytes.push_back(bytes_per_element); + + const DimensionIndex cell_rank = metadata.rank + 1; + + // Broadcast fill value to target shape [unbounded, ..., bytes_per_element] + // like zarr3 does + std::vector target_shape(metadata.rank, 
kInfIndex); + target_shape.push_back(bytes_per_element); + auto chunk_fill_value = + BroadcastArray(base_fill_value, BoxView<>(target_shape)).value(); + + // Create valid data bounds - unbounded for chunked dimensions, + // explicit for bytes dimension + Box<> valid_data_bounds(cell_rank); + for (DimensionIndex i = 0; i < metadata.rank; ++i) { + valid_data_bounds[i] = IndexInterval::Infinite(); + } + valid_data_bounds[metadata.rank] = + IndexInterval::UncheckedSized(0, bytes_per_element); + + // Create permutation: copy existing order and add the bytes dimension + DimensionIndex layout_order_buffer[kMaxRank]; + GetChunkInnerOrder(metadata.rank, metadata.order, + span(layout_order_buffer, metadata.rank)); + layout_order_buffer[metadata.rank] = metadata.rank; // Add bytes dimension + + components.emplace_back( + internal::AsyncWriteArray::Spec{ + std::move(chunk_fill_value), std::move(valid_data_bounds), + ContiguousLayoutPermutation<>(span(layout_order_buffer, cell_rank))}, + std::move(chunk_shape_with_bytes), chunked_to_cell_dimensions); + + return internal::ChunkGridSpecification{std::move(components)}; + } + + // Normal field-based access + components.reserve(metadata.dtype.fields.size()); for (size_t field_i = 0; field_i < metadata.dtype.fields.size(); ++field_i) { const auto& field = metadata.dtype.fields[field_i]; const auto& field_layout = metadata.chunk_layout.fields[field_i]; @@ -335,12 +446,70 @@ internal::ChunkGridSpecification DataCache::GetChunkGridSpecification( Result, 1>> DataCache::DecodeChunk( span chunk_indices, absl::Cord data) { + if (open_as_void_) { + // For void access, return raw bytes as a single component + const auto& md = metadata(); + + // Decompress the data first (if compressed) + absl::Cord decompressed = std::move(data); + if (md.compressor) { + riegeli::CordReader base_reader(std::move(decompressed)); + auto compressed_reader = md.compressor->GetReader( + base_reader, md.dtype.bytes_per_outer_element); + absl::Cord uncompressed; + 
TENSORSTORE_RETURN_IF_ERROR( + riegeli::ReadAll(std::move(compressed_reader), uncompressed)); + if (!base_reader.VerifyEndAndClose()) return base_reader.status(); + decompressed = std::move(uncompressed); + } + + // Build the shape: chunk_shape + bytes_per_element + std::vector shape = md.chunks; + shape.push_back(md.dtype.bytes_per_outer_element); + + // Create a byte array from the decompressed data + auto flat_data = decompressed.Flatten(); + auto byte_array = AllocateArray(shape, c_order, default_init, + dtype_v); + std::memcpy(byte_array.data(), flat_data.data(), + std::min(static_cast(byte_array.num_elements()), + flat_data.size())); + + absl::InlinedVector, 1> result; + result.push_back(std::move(byte_array)); + return result; + } return internal_zarr::DecodeChunk(metadata(), std::move(data)); } Result DataCache::EncodeChunk( span chunk_indices, span> component_arrays) { + if (open_as_void_) { + // For void access, encode raw bytes directly + const auto& md = metadata(); + if (component_arrays.size() != 1) { + return absl::InvalidArgumentError( + "Expected exactly one component array for void access"); + } + const auto& byte_array = component_arrays[0]; + absl::Cord uncompressed( + std::string_view(static_cast(byte_array.data()), + byte_array.num_elements())); + + // Compress if needed + if (md.compressor) { + absl::Cord encoded; + riegeli::CordWriter base_writer(&encoded); + auto writer = md.compressor->GetWriter( + base_writer, md.dtype.bytes_per_outer_element); + TENSORSTORE_RETURN_IF_ERROR( + riegeli::Write(std::move(uncompressed), std::move(writer))); + if (!base_writer.Close()) return base_writer.status(); + return encoded; + } + return uncompressed; + } return internal_zarr::EncodeChunk(metadata(), component_arrays); } @@ -356,6 +525,7 @@ absl::Status DataCache::GetBoundSpecData( const auto& metadata = *static_cast(metadata_ptr); spec.selected_field = EncodeSelectedField(component_index, metadata.dtype); spec.metadata_key = metadata_key_; + 
spec.open_as_void = open_as_void_; auto& pm = spec.partial_metadata; pm.rank = metadata.rank; pm.zarr_format = metadata.zarr_format; @@ -416,6 +586,10 @@ Result ZarrDriverSpec::ToUrl() const { return absl::InvalidArgumentError( "zarr2 URL syntax not supported with selected_field specified"); } + if (open_as_void) { + return absl::InvalidArgumentError( + "zarr2 URL syntax not supported with open_as_void specified"); + } TENSORSTORE_ASSIGN_OR_RETURN(auto base_url, store.ToUrl()); return tensorstore::StrCat(base_url, "|", kUrlScheme, ":"); } @@ -483,7 +657,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_ASSIGN_OR_RETURN( auto metadata, internal_zarr::GetNewMetadata(spec().partial_metadata, - spec().selected_field, spec().schema), + spec().selected_field, spec().schema, + spec().open_as_void), tensorstore::MaybeAnnotateStatus( _, "Cannot create using specified \"metadata\" and schema")); return metadata; @@ -496,7 +671,8 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { internal::EncodeCacheKey( &result, spec.store.path, GetDimensionSeparator(spec.partial_metadata, zarr_metadata), - zarr_metadata, spec.metadata_key); + zarr_metadata, spec.metadata_key, + spec.open_as_void ? 
"void" : "normal"); return result; } @@ -507,7 +683,7 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { return std::make_unique( std::move(initializer), spec().store.path, GetDimensionSeparator(spec().partial_metadata, metadata), - spec().metadata_key); + spec().metadata_key, spec().open_as_void); } Result GetComponentIndex(const void* metadata_ptr, @@ -516,7 +692,14 @@ class ZarrDriver::OpenState : public ZarrDriver::OpenStateBase { TENSORSTORE_RETURN_IF_ERROR( ValidateMetadata(metadata, spec().partial_metadata)); TENSORSTORE_ASSIGN_OR_RETURN( - auto field_index, GetFieldIndex(metadata.dtype, spec().selected_field)); + auto field_index, + GetFieldIndex(metadata.dtype, spec().selected_field, + spec().open_as_void)); + // For void access, map to component index 0 since we create a special + // component for raw byte access + if (field_index == kVoidFieldIndex) { + field_index = 0; + } TENSORSTORE_RETURN_IF_ERROR( ValidateMetadataSchema(metadata, field_index, spec().schema)); return field_index; diff --git a/tensorstore/driver/zarr/driver_impl.h b/tensorstore/driver/zarr/driver_impl.h index df3c3930f..c2933dd90 100644 --- a/tensorstore/driver/zarr/driver_impl.h +++ b/tensorstore/driver/zarr/driver_impl.h @@ -63,10 +63,11 @@ class ZarrDriverSpec ZarrPartialMetadata partial_metadata; SelectedField selected_field; std::string metadata_key; + bool open_as_void = false; constexpr static auto ApplyMembers = [](auto& x, auto f) { return f(internal::BaseCast(x), x.partial_metadata, - x.selected_field, x.metadata_key); + x.selected_field, x.metadata_key, x.open_as_void); }; absl::Status ApplyOptions(SpecOptions&& options) override; @@ -98,7 +99,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { public: explicit DataCache(Initializer&& initializer, std::string key_prefix, DimensionSeparator dimension_separator, - std::string metadata_key); + std::string metadata_key, bool open_as_void = false); const ZarrMetadata& metadata() { return 
*static_cast(initial_metadata().get()); @@ -117,7 +118,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { /// Returns the ChunkCache grid to use for the given metadata. static internal::ChunkGridSpecification GetChunkGridSpecification( - const ZarrMetadata& metadata); + const ZarrMetadata& metadata, bool open_as_void = false); Result, 1>> DecodeChunk( span chunk_indices, absl::Cord data) override; @@ -140,6 +141,7 @@ class DataCache : public internal_kvs_backed_chunk_driver::DataCache { std::string key_prefix_; DimensionSeparator dimension_separator_; std::string metadata_key_; + bool open_as_void_; }; class ZarrDriver; diff --git a/tensorstore/driver/zarr/driver_test.cc b/tensorstore/driver/zarr/driver_test.cc index 92c5be48a..a5014987d 100644 --- a/tensorstore/driver/zarr/driver_test.cc +++ b/tensorstore/driver/zarr/driver_test.cc @@ -3499,4 +3499,326 @@ TEST(DriverTest, UrlSchemeRoundtrip) { {"kvstore", {{"driver", "memory"}, {"path", "abc.zarr/def/"}}}}); } +// Tests for open_as_void functionality + +TEST(ZarrDriverTest, OpenAsVoidSimpleType) { + // Test open_as_void with a simple data type (int16) + auto context = Context::Default(); + + // First create a normal array + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", "({{1, 2}, {3, 4}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, 
void_store.rank()); + + // The last dimension should be the size of the data type (2 bytes for int16) + EXPECT_EQ(2, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidStructuredType) { + // Test open_as_void with a structured data type + auto context = Context::Default(); + + // Create an array with a structured dtype + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"field", "y"}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", ::nlohmann::json::array_t{{"x", "|u1"}, {"y", "({{100, 200}, {300, 400}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true - this should give raw access to the entire + // struct + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 3 bytes (1 byte for u1 + 2 bytes for i2) + EXPECT_EQ(3, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidWithCompression) { + // Test open_as_void with compression enabled + auto context = Context::Default(); + + // Create an array with blosc compression + ::nlohmann::json create_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"metadata", + { + {"compressor", {{"id", "blosc"}}}, + {"dtype", "({{0x01020304, 0x05060708}, + {0x090a0b0c, 
0x0d0e0f10}}); + TENSORSTORE_EXPECT_OK( + tensorstore::Write(data, store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + + // Now open with open_as_void=true + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read) + .result()); + + // The void store should have rank = original_rank + 1 (for bytes dimension) + EXPECT_EQ(3, void_store.rank()); + + // The last dimension should be 4 bytes for int32 + EXPECT_EQ(4, void_store.domain().shape()[2]); + + // The data type should be byte + EXPECT_EQ(tensorstore::dtype_v, + void_store.dtype()); + + // Read the raw bytes and verify decompression works + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto read_result, + tensorstore::Read(void_store | tensorstore::Dims(0, 1).SizedInterval( + {0, 0}, {2, 2})) + .result()); + EXPECT_EQ(read_result.shape()[0], 2); + EXPECT_EQ(read_result.shape()[1], 2); + EXPECT_EQ(read_result.shape()[2], 4); +} + +TEST(ZarrDriverTest, OpenAsVoidSpecRoundtrip) { + // Test that open_as_void is properly preserved in spec round-trips + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"compressor", nullptr}, + {"dtype", ", + void_store.dtype()); +} + +TEST(ZarrDriverTest, OpenAsVoidUrlNotSupported) { + // Test that open_as_void is not supported with URL syntax + ::nlohmann::json json_spec{ + {"driver", "zarr"}, + {"kvstore", {{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + {"metadata", + { + {"dtype", "({{0x0102, 0x0304}, + {0x0506, 0x0708}}); + TENSORSTORE_EXPECT_OK(tensorstore::Write(data, store).result()); + + // Open as void and read + ::nlohmann::json void_spec{ + {"driver", "zarr"}, + {"kvstore", 
{{"driver", "memory"}, {"path", "prefix/"}}}, + {"open_as_void", true}, + }; + + TENSORSTORE_ASSERT_OK_AND_ASSIGN( + auto void_store, + tensorstore::Open(void_spec, context, tensorstore::OpenMode::open, + tensorstore::ReadWriteMode::read_write) + .result()); + + // Read the raw bytes + TENSORSTORE_ASSERT_OK_AND_ASSIGN(auto bytes_read, + tensorstore::Read(void_store).result()); + + // Verify shape: [2, 2, 2] where last dim is 2 bytes per uint16 + EXPECT_EQ(bytes_read.shape()[0], 2); + EXPECT_EQ(bytes_read.shape()[1], 2); + EXPECT_EQ(bytes_read.shape()[2], 2); + + // Verify the raw bytes (little endian) + auto bytes_ptr = static_cast(bytes_read.data()); + // First element: 0x0102 -> bytes 0x02, 0x01 (little endian) + EXPECT_EQ(bytes_ptr[0], 0x02); + EXPECT_EQ(bytes_ptr[1], 0x01); +} + } // namespace diff --git a/tensorstore/driver/zarr/schema.yml b/tensorstore/driver/zarr/schema.yml index 45711648c..a90fb7e3a 100644 --- a/tensorstore/driver/zarr/schema.yml +++ b/tensorstore/driver/zarr/schema.yml @@ -17,6 +17,14 @@ allOf: Must be specified if the `.metadata.dtype` specified in the array metadata has more than one field. default: null + open_as_void: + type: boolean + default: false + title: Raw byte access mode. + description: | + When true, opens the array as raw bytes instead of interpreting it + as structured data. The resulting array will have an additional + dimension representing the byte layout of each element. metadata: title: Zarr array metadata. 
description: | diff --git a/tensorstore/driver/zarr/spec.cc b/tensorstore/driver/zarr/spec.cc index 34a2825f9..4857d045b 100644 --- a/tensorstore/driver/zarr/spec.cc +++ b/tensorstore/driver/zarr/spec.cc @@ -151,7 +151,8 @@ absl::Status ValidateMetadata(const ZarrMetadata& metadata, Result GetNewMetadata( const ZarrPartialMetadata& partial_metadata, - const SelectedField& selected_field, const Schema& schema) { + const SelectedField& selected_field, const Schema& schema, + bool open_as_void) { ZarrMetadataPtr metadata = std::make_shared(); metadata->zarr_format = partial_metadata.zarr_format.value_or(2); metadata->dimension_separator = partial_metadata.dimension_separator.value_or( @@ -172,7 +173,12 @@ Result GetNewMetadata( // multi-field zarr dtype is desired, it must be specified explicitly. TENSORSTORE_ASSIGN_OR_RETURN( selected_field_index, - GetFieldIndex(*partial_metadata.dtype, selected_field)); + GetFieldIndex(*partial_metadata.dtype, selected_field, open_as_void)); + // For void access, use field 0 for metadata creation since we use all + // fields as raw bytes + if (selected_field_index == kVoidFieldIndex) { + selected_field_index = 0; + } metadata->dtype = *partial_metadata.dtype; } else { if (!selected_field.empty()) { @@ -527,7 +533,17 @@ std::string GetFieldNames(const ZarrDType& dtype) { } // namespace Result GetFieldIndex(const ZarrDType& dtype, - const SelectedField& selected_field) { + const SelectedField& selected_field, + bool open_as_void) { + // Special case: open_as_void requests raw byte access (works for any dtype) + if (open_as_void) { + if (dtype.fields.empty()) { + return absl::FailedPreconditionError( + "Requested void access but dtype has no fields"); + } + return kVoidFieldIndex; + } + if (selected_field.empty()) { if (dtype.fields.size() != 1) { return absl::FailedPreconditionError(tensorstore::StrCat( diff --git a/tensorstore/driver/zarr/spec.h b/tensorstore/driver/zarr/spec.h index 0ef3ab9d3..597fc32f0 100644 --- 
a/tensorstore/driver/zarr/spec.h +++ b/tensorstore/driver/zarr/spec.h @@ -70,9 +70,11 @@ using SelectedField = std::string; /// \param partial_metadata Constraints in the form of partial zarr metadata. /// \param selected_field The field to which `schema` applies. /// \param schema Schema constraints for the `selected_field`. +/// \param open_as_void If true, opens the array as raw bytes. Result GetNewMetadata( const ZarrPartialMetadata& partial_metadata, - const SelectedField& selected_field, const Schema& schema); + const SelectedField& selected_field, const Schema& schema, + bool open_as_void = false); struct SpecRankAndFieldInfo { /// Full rank of the TensorStore, if known. Equal to the chunked rank plus @@ -134,11 +136,16 @@ Result ParseSelectedField(const ::nlohmann::json& value); /// \param dtype The parsed zarr "dtype" specification. /// \param selected_field The label of the field, or an empty string to indicate /// that the zarr array must have only a single field. -/// \returns The field index. +/// \param open_as_void If true, returns kVoidFieldIndex for raw byte access. +/// \returns The field index, or kVoidFieldIndex if open_as_void is true. /// \error `absl::StatusCode::kFailedPrecondition` if `selected_field` is not /// valid. Result GetFieldIndex(const ZarrDType& dtype, - const SelectedField& selected_field); + const SelectedField& selected_field, + bool open_as_void = false); + +/// Special field index indicating void (raw byte) access. +constexpr size_t kVoidFieldIndex = size_t(-1); /// Encodes a field index as a `SelectedField` JSON specification. ///