diff --git a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index b126396e3c..59e688c70f 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -1127,7 +1127,7 @@ mod tests { .file_size_in_bytes(1024) .record_count(100) .partition_spec_id(1) - .partition(Struct::empty()) + .partition(Struct::from_iter([None::])) .column_sizes(HashMap::from([(1, 512), (2, 1024)])) .value_counts(HashMap::from([(1, 100), (2, 500)])) .null_value_counts(HashMap::from([(1, 0), (2, 1)])) @@ -1140,7 +1140,7 @@ mod tests { .file_size_in_bytes(2048) .record_count(200) .partition_spec_id(1) - .partition(Struct::empty()) + .partition(Struct::from_iter([None::])) .column_sizes(HashMap::from([(1, 1024), (2, 2048)])) .value_counts(HashMap::from([(1, 200), (2, 600)])) .null_value_counts(HashMap::from([(1, 10), (2, 999)])) @@ -1163,7 +1163,7 @@ mod tests { "content": 0, "file_path": "path/to/file1.parquet", "file_format": "PARQUET", - "partition": {}, + "partition": { "id_partition": null }, "record_count": 100, "file_size_in_bytes": 1024, "column_sizes": [ @@ -1194,7 +1194,7 @@ mod tests { "content": 0, "file_path": "path/to/file2.parquet", "file_format": "PARQUET", - "partition": {}, + "partition": { "id_partition": null }, "record_count": 200, "file_size_in_bytes": 2048, "column_sizes": [ diff --git a/crates/iceberg/src/spec/values/serde.rs b/crates/iceberg/src/spec/values/serde.rs index 053acca8b0..a9f1d77f41 100644 --- a/crates/iceberg/src/spec/values/serde.rs +++ b/crates/iceberg/src/spec/values/serde.rs @@ -18,6 +18,8 @@ //\! Serialization and deserialization support for Iceberg values pub(crate) mod _serde { + use std::collections::HashMap; + use serde::de::Visitor; use serde::ser::{SerializeMap, SerializeSeq, SerializeStruct}; use serde::{Deserialize, Serialize}; @@ -668,15 +670,13 @@ pub(crate) mod _serde { } _ => Err(invalid_err("list")), }, - RawLiteralEnum::Record(Record { - required, - optional: _, - }) => match ty { + RawLiteralEnum::Record(Record { required, optional }) => match ty { Type::Struct(struct_ty) => { - let iters: Vec> = required - .into_iter() - .map(|(field_name, value)| { - let field = struct_ty + let mut value_by_name = HashMap::new(); + + for (field_name, value) in required.into_iter() { + let field = + struct_ty .field_by_name(field_name.as_str()) .ok_or_else(|| { invalid_err_with_reason( @@ -684,10 +684,49 @@ pub(crate) mod _serde { &format!("field {} is not exist", &field_name), ) })?; - let value = value.try_into(&field.field_type)?; - Ok(value) - }) - .collect::>()?; + let value = value.try_into(&field.field_type)?; + if value.is_none() && field.required { + return Err(invalid_err_with_reason( + "record", + &format!("required field {} is null", &field_name), + )); + } + value_by_name.insert(field.name.clone(), value); + } + + for (field_name, value) in optional.into_iter() { + let field = + struct_ty + .field_by_name(field_name.as_str()) + .ok_or_else(|| { + invalid_err_with_reason( + "record", + &format!("field {} is not exist", &field_name), + ) + })?; + let value = match value { + Some(v) => v.try_into(&field.field_type)?, + None => None, + }; + value_by_name.insert(field.name.clone(), value); + } + + let mut iters = Vec::with_capacity(struct_ty.fields().len()); + for field in struct_ty.fields() { + match value_by_name.remove(&field.name) { + Some(value) => iters.push(value), + None => { + if field.required { + return Err(invalid_err_with_reason( + "record", + &format!("required field {} is missing", field.name), + )); + } + iters.push(None); + } + } + } + Ok(Some(Literal::Struct(Struct::from_iter(iters)))) } Type::Map(map_ty) => { diff --git a/crates/iceberg/src/spec/values/tests.rs b/crates/iceberg/src/spec/values/tests.rs index 73343a9a1a..9a154379ec 100644 --- a/crates/iceberg/src/spec/values/tests.rs +++ b/crates/iceberg/src/spec/values/tests.rs @@ -272,6 +272,27 @@ fn json_struct() { ); } +#[test] +fn json_struct_preserves_schema_order() { + // struct fields are deliberately ordered as b, then a to detect ordering drift + let struct_type = StructType::new(vec![ + NestedField::required(2, "b", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(1, "a", Type::Primitive(PrimitiveType::Long)).into(), + ]); + let literal = Literal::Struct(Struct::from_iter(vec![ + Some(Literal::Primitive(PrimitiveLiteral::Int(5))), + Some(Literal::Primitive(PrimitiveLiteral::Long(10))), + ])); + + let raw = RawLiteral::try_from(literal.clone(), &Type::Struct(struct_type.clone())).unwrap(); + // serde_json::Value uses BTreeMap (sorted keys), which mimics the RW metadata path. + let value = serde_json::to_value(&raw).unwrap(); + let deser: RawLiteral = serde_json::from_value(value).unwrap(); + let roundtrip = deser.try_into(&Type::Struct(struct_type)).unwrap().unwrap(); + + assert_eq!(roundtrip, literal); +} + #[test] fn json_list() { let record = r#"[1, 2, 3, null]"#;