From 748b34ebb27bb1273bb153f9820313c3cbef98ce Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Tue, 18 Feb 2025 09:57:10 +0100 Subject: [PATCH 01/15] Proposed API interfaces - squashed for rebase --- .../arrow/vectorized/ArrowFormatModels.java | 39 ++ .../iceberg/arrow/vectorized/ArrowReader.java | 33 +- .../java/org/apache/iceberg/avro/Avro.java | 7 +- .../apache/iceberg/avro/AvroFormatModel.java | 260 ++++++++++++++ .../iceberg/formats/CommonWriteBuilder.java | 122 +++++++ .../formats/CommonWriteBuilderImpl.java | 337 ++++++++++++++++++ .../iceberg/formats/DataWriteBuilder.java | 59 +++ .../formats/EqualityDeleteWriteBuilder.java | 74 ++++ .../apache/iceberg/formats/FormatModel.java | 91 +++++ .../iceberg/formats/FormatModelRegistry.java | 223 ++++++++++++ .../formats/PositionDeleteWriteBuilder.java | 47 +++ .../apache/iceberg/formats/ReadBuilder.java | 110 ++++++ .../apache/iceberg/formats/WriteBuilder.java | 120 +++++++ .../formats/TestFormatModelRegistry.java | 125 +++++++ .../apache/iceberg/data/BaseDeleteLoader.java | 53 +-- .../iceberg/data/BaseFileWriterFactory.java | 15 +- .../data/GenericFileWriterFactory.java | 173 +++++++-- .../iceberg/data/GenericFormatModels.java | 99 +++++ .../apache/iceberg/data/GenericReader.java | 74 +--- .../data/RegistryBasedFileWriterFactory.java | 184 ++++++++++ .../iceberg/flink/data/FlinkFormatModels.java | 54 +++ .../flink/data/FlinkSchemaVisitor.java | 39 +- .../flink/sink/FlinkFileWriterFactory.java | 98 +---- .../source/RowDataFileScanTaskReader.java | 112 +----- .../flink/sink/TestCompressionSettings.java | 4 +- .../flink/sink/TestIcebergStreamWriter.java | 93 ++++- .../flink/sink/dynamic/TestDynamicWriter.java | 4 +- .../mr/mapreduce/IcebergInputFormat.java | 134 ++----- .../main/java/org/apache/iceberg/orc/ORC.java | 23 +- .../apache/iceberg/orc/ORCFormatModel.java | 317 ++++++++++++++++ .../org/apache/iceberg/parquet/Parquet.java | 5 +- .../iceberg/parquet/ParquetFormatModel.java | 334 +++++++++++++++++ .../parquet/VectorizedParquetReader.java | 8 +- .../apache/iceberg/parquet/TestParquet.java | 2 +- .../actions/RewriteTablePathSparkAction.java | 96 +++-- .../VectorizedSparkParquetReaders.java | 16 + .../iceberg/spark/source/BaseBatchReader.java | 106 ++---- .../iceberg/spark/source/BaseRowReader.java | 73 +--- .../spark/source/SparkFileWriterFactory.java | 214 ++++++----- .../spark/source/SparkFormatModels.java | 71 ++++ 40 files changed, 3268 insertions(+), 780 deletions(-) create mode 100644 arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java create mode 100644 core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/FormatModel.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java create mode 100644 core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java create mode 100644 core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java create mode 
100644 data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java create mode 100644 data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java create mode 100644 flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java create mode 100644 orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java create mode 100644 parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java create mode 100644 spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java new file mode 100644 index 000000000000..2036ce6f4d9b --- /dev/null +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized; + +import org.apache.arrow.vector.NullCheckingForGet; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.parquet.ParquetFormatModel; + +public class ArrowFormatModels { + public static void register() { + FormatModelRegistry.register( + new ParquetFormatModel<>( + ColumnarBatch.class, + Object.class, + (schema, messageType, constantValues, properties) -> + ArrowReader.VectorizedCombinedScanIterator.buildReader( + schema, + messageType, /* setArrowValidityVector */ + NullCheckingForGet.NULL_CHECKING_ENABLED))); + } + + private ArrowFormatModels() {} +} diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java index 06b7baec27d5..68a27bdfb8eb 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowReader.java @@ -29,7 +29,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; -import org.apache.arrow.vector.NullCheckingForGet; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.iceberg.CombinedScanTask; @@ -40,13 +39,14 @@ import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.encryption.EncryptedInputFile; import org.apache.iceberg.encryption.EncryptionManager; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableGroup; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; import 
org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; @@ -189,8 +189,7 @@ public void close() throws IOException { * Reads the data file and returns an iterator of {@link VectorSchemaRoot}. Only Parquet data file * format is supported. */ - private static final class VectorizedCombinedScanIterator - implements CloseableIterator { + static final class VectorizedCombinedScanIterator implements CloseableIterator { private final Iterator fileItr; private final Map inputFiles; @@ -324,19 +323,8 @@ CloseableIterator open(FileScanTask task) { InputFile location = getInputFile(task); Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask"); if (task.file().format() == FileFormat.PARQUET) { - Parquet.ReadBuilder builder = - Parquet.read(location) - .project(expectedSchema) - .split(task.start(), task.length()) - .createBatchedReaderFunc( - fileSchema -> - buildReader( - expectedSchema, - fileSchema, /* setArrowValidityVector */ - NullCheckingForGet.NULL_CHECKING_ENABLED)) - .recordsPerBatch(batchSize) - .filter(task.residual()) - .caseSensitive(caseSensitive); + ReadBuilder builder = + FormatModelRegistry.readBuilder(FileFormat.PARQUET, ColumnarBatch.class, location); if (reuseContainers) { builder.reuseContainers(); @@ -345,7 +333,14 @@ CloseableIterator open(FileScanTask task) { builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); } - iter = builder.build(); + iter = + builder + .project(expectedSchema) + .split(task.start(), task.length()) + .recordsPerBatch(batchSize) + .caseSensitive(caseSensitive) + .filter(task.residual()) + .build(); } else { throw new UnsupportedOperationException( "Format: " + task.file().format() + " not supported for batched reads"); @@ -376,7 +371,7 @@ private InputFile getInputFile(FileScanTask task) { * @param fileSchema Schema of the data file. * @param setArrowValidityVector Indicates whether to set the validity vector in Arrow vectors. */ - private static ArrowBatchReader buildReader( + static ArrowBatchReader buildReader( Schema expectedSchema, MessageType fileSchema, boolean setArrowValidityVector) { return (ArrowBatchReader) TypeWithSchemaVisitor.visit( diff --git a/core/src/main/java/org/apache/iceberg/avro/Avro.java b/core/src/main/java/org/apache/iceberg/avro/Avro.java index 6c7edc25b691..4a5136f58e71 100644 --- a/core/src/main/java/org/apache/iceberg/avro/Avro.java +++ b/core/src/main/java/org/apache/iceberg/avro/Avro.java @@ -182,8 +182,7 @@ public WriteBuilder overwrite(boolean enabled) { } // supposed to always be a private method used strictly by data and delete write builders - private WriteBuilder createContextFunc( - Function, Context> newCreateContextFunc) { + WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; return this; } @@ -217,7 +216,7 @@ public FileAppender build() throws IOException { overwrite); } - private static class Context { + static class Context { private final CodecFactory codec; private Context(CodecFactory codec) { @@ -568,7 +567,7 @@ public PositionDeleteWriter buildPositionWriter() throws IOException { } /** A {@link DatumWriter} implementation that wraps another to produce position deletes. 
*/ - private static class PositionDatumWriter implements MetricsAwareDatumWriter> { + static class PositionDatumWriter implements MetricsAwareDatumWriter> { private static final ValueWriter PATH_WRITER = ValueWriters.strings(); private static final ValueWriter POS_WRITER = ValueWriters.longs(); diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java new file mode 100644 index 000000000000..3b6355a5104b --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -0,0 +1,260 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.avro; + +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.function.BiFunction; +import org.apache.avro.io.DatumReader; +import org.apache.avro.io.DatumWriter; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.ReadBuilder; +import org.apache.iceberg.formats.WriteBuilder; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; + +public class AvroFormatModel implements FormatModel { + private final Class type; + private final Class schemaType; + private final BiFunction, DatumReader> readerFunction; + private final BiFunction> writerFunction; + + public AvroFormatModel(Class type) { + this(type, null, null, null); + } + + public AvroFormatModel( + Class type, + Class schemaType, + BiFunction, DatumReader> readerFunction, + BiFunction> writerFunction) { + this.type = type; + this.schemaType = schemaType; + this.readerFunction = readerFunction; + this.writerFunction = writerFunction; + } + + @Override + public FileFormat format() { + return FileFormat.AVRO; + } + + @Override + public Class type() { + return type; + } + + @Override + public Class schemaType() { + return schemaType; + } + + @Override + public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + return new WriteBuilderWrapper<>(outputFile, writerFunction); + } + + @Override + public ReadBuilder readBuilder(InputFile inputFile) { + return new ReadBuilderWrapper<>(inputFile, readerFunction); + } + + private static class ReadBuilderWrapper implements ReadBuilder { + private final Avro.ReadBuilder internal; + private final BiFunction, DatumReader> readerFunction; + private Schema icebergSchema; + private Map idToConstant = ImmutableMap.of(); 
+ + private ReadBuilderWrapper( + InputFile inputFile, BiFunction, DatumReader> readerFunction) { + this.internal = Avro.read(inputFile); + this.readerFunction = readerFunction; + } + + @Override + public ReadBuilder split(long newStart, long newLength) { + internal.split(newStart, newLength); + return this; + } + + @Override + public ReadBuilder project(Schema schema) { + this.icebergSchema = schema; + internal.project(schema); + return this; + } + + @Override + public ReadBuilder caseSensitive(boolean caseSensitive) { + // Filtering is not supported in Avro reader, so case sensitivity does not matter + return this; + } + + @Override + public ReadBuilder filter(Expression filter) { + // Filtering is not supported in Avro reader + return this; + } + + @Override + public ReadBuilder set(String key, String value) { + // Configuration is not used for Avro reader creation + return this; + } + + @Override + public ReadBuilder reuseContainers() { + internal.reuseContainers(); + return this; + } + + @Override + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + throw new UnsupportedOperationException("Batch reading is not supported in Avro reader"); + } + + @Override + public ReadBuilder idToConstant(Map newIdToConstant) { + this.idToConstant = newIdToConstant; + return this; + } + + @Override + public ReadBuilder withNameMapping(org.apache.iceberg.mapping.NameMapping nameMapping) { + internal.withNameMapping(nameMapping); + return this; + } + + @Override + public CloseableIterable build() { + return internal + .createResolvingReader(unused -> readerFunction.apply(icebergSchema, idToConstant)) + .build(); + } + } + + private static class WriteBuilderWrapper implements WriteBuilder { + private final Avro.WriteBuilder internal; + private final BiFunction> writerFunction; + private S inputSchema; + private FileContent content; + + private WriteBuilderWrapper( + EncryptedOutputFile outputFile, + BiFunction> writerFunction) { + this.internal = Avro.write(outputFile.encryptingOutputFile()); + this.writerFunction = writerFunction; + } + + @Override + public WriteBuilder schema(Schema schema) { + internal.schema(schema); + return this; + } + + @Override + public WriteBuilder inputSchema(S schema) { + this.inputSchema = schema; + return this; + } + + @Override + public WriteBuilder set(String property, String value) { + internal.set(property, value); + return this; + } + + @Override + public WriteBuilder setAll(Map properties) { + internal.setAll(properties); + return this; + } + + @Override + public WriteBuilder meta(String property, String value) { + internal.meta(property, value); + return this; + } + + @Override + public WriteBuilder meta(Map properties) { + internal.meta(properties); + return this; + } + + @Override + public WriteBuilder content(FileContent newContent) { + this.content = newContent; + return this; + } + + @Override + public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + internal.metricsConfig(metricsConfig); + return this; + } + + @Override + public WriteBuilder overwrite() { + internal.overwrite(); + return this; + } + + @Override + public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + throw new UnsupportedOperationException("Avro does not support file encryption keys"); + } + + @Override + public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + throw new UnsupportedOperationException("Avro does not support AAD prefix"); + } + + @Override + public org.apache.iceberg.io.FileAppender build() throws java.io.IOException { + switch (content) { 
+ case DATA: + internal.createContextFunc(Avro.WriteBuilder.Context::dataContext); + internal.createWriterFunc(avroSchema -> writerFunction.apply(avroSchema, inputSchema)); + break; + case EQUALITY_DELETES: + internal.createContextFunc(Avro.WriteBuilder.Context::deleteContext); + internal.createWriterFunc(avroSchema -> writerFunction.apply(avroSchema, inputSchema)); + break; + case POSITION_DELETES: + internal.createContextFunc(Avro.WriteBuilder.Context::deleteContext); + internal.createWriterFunc(unused -> new Avro.PositionDatumWriter()); + internal.schema(DeleteSchemaUtil.pathPosSchema()); + break; + default: + throw new IllegalArgumentException("Unknown file content: " + content); + } + + return internal.build(); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java new file mode 100644 index 000000000000..b37e755926f1 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.nio.ByteBuffer; +import java.util.Map; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptionKeyMetadata; +import org.apache.iceberg.io.DataWriter; + +/** + * A generic builder interface for creating specialized file writers in the Iceberg ecosystem. + * + *

This builder provides a unified configuration API for generating various types of content + * writers: + * + *

+ * <ul>
+ *   <li>{@link DataWriter} for creating data files with table records
+ *   <li>{@link EqualityDeleteWriter} for creating files with equality-based delete records
+ *   <li>{@link PositionDeleteWriter} for creating files with position-based delete records
+ * </ul>
+ *

Each concrete implementation configures the underlying file format writer while adding + * content-specific metadata and behaviors. + * + * @param the concrete builder type for method chaining + */ +interface CommonWriteBuilder> { + + /** + * Set a writer configuration property which affects the writer behavior. + * + * @param property a writer config property name + * @param value config value + * @return this for method chaining + */ + B set(String property, String value); + + /** + * Adds the new properties to the writer configuration. + * + * @param properties a map of writer config properties + * @return this for method chaining + */ + default B setAll(Map properties) { + properties.forEach(this::set); + return self(); + } + + /** + * Set a file metadata property in the created file. + * + * @param property a file metadata property name + * @param value config value + * @return this for method chaining + */ + B meta(String property, String value); + + /** + * Add the new properties to file metadata for the created file. + * + * @param properties a map of file metadata properties + * @return this for method chaining + */ + default B meta(Map properties) { + properties.forEach(this::meta); + return self(); + } + + /** Sets the metrics configuration used for collecting column metrics for the created file. */ + B metricsConfig(MetricsConfig metricsConfig); + + /** Overwrite the file if it already exists. By default, overwrite is disabled. */ + B overwrite(); + + /** + * Sets the encryption key used for writing the file. If the writer does not support encryption, + * then an exception should be thrown. + */ + B withFileEncryptionKey(ByteBuffer encryptionKey); + + /** + * Sets the additional authentication data (AAD) prefix used for writing the file. If the writer + * does not support encryption, then an exception should be thrown. + */ + B withAADPrefix(ByteBuffer aadPrefix); + + /** Sets the partition specification for the Iceberg metadata. */ + B spec(PartitionSpec newSpec); + + /** Sets the partition value for the Iceberg metadata. */ + B partition(StructLike partition); + + /** Sets the encryption key metadata for Iceberg metadata. */ + B keyMetadata(EncryptionKeyMetadata keyMetadata); + + /** Sets the sort order for the Iceberg metadata. */ + B sortOrder(SortOrder sortOrder); + + B self(); +} diff --git a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java new file mode 100644 index 000000000000..9e5d9b1605cb --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java @@ -0,0 +1,337 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptionKeyMetadata; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +/** + * An internal implementation that handles all {@link CommonWriteBuilder} interface variants. + * + *

This unified implementation serves as a backend for multiple specialized content writers: + * + *

+ * <ul>
+ *   <li>{@link DataWriteBuilder} for creating data files
+ *   <li>{@link EqualityDeleteWriteBuilder} for creating equality delete files
+ *   <li>{@link PositionDeleteWriteBuilder} for creating position delete files
+ * </ul>
+ *

The implementation delegates to a format-specific {@link WriteBuilder} while enriching it with + * content-specific functionality. When building a writer, the implementation configures the + * underlying builder and calls its {@link WriteBuilder#build()} method to create the appropriate + * specialized writer for the requested content type. + * + * @param the concrete builder type for method chaining + * @param the type of data records the writer will accept + * @param the type of the schema for the input data + */ +abstract class CommonWriteBuilderImpl, D, S> + implements CommonWriteBuilder { + private final WriteBuilder writeBuilder; + private final String location; + private final FileFormat format; + private PartitionSpec spec = null; + private StructLike partition = null; + private EncryptionKeyMetadata keyMetadata = null; + private SortOrder sortOrder = null; + + static DataWriteBuilder forDataFile( + WriteBuilder writeBuilder, String location, FileFormat format) { + return new DataFileWriteBuilder<>(writeBuilder.content(FileContent.DATA), location, format); + } + + static EqualityDeleteWriteBuilder forEqualityDelete( + WriteBuilder writeBuilder, String location, FileFormat format) { + return new EqualityDeleteFileWriteBuilder<>( + writeBuilder.content(FileContent.EQUALITY_DELETES), location, format); + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + static PositionDeleteWriteBuilder forPositionDelete( + WriteBuilder writeBuilder, String location, FileFormat format) { + return new PositionDeleteFileWriteBuilder( + (WriteBuilder) writeBuilder.content(FileContent.POSITION_DELETES), + location, + format); + } + + private CommonWriteBuilderImpl( + WriteBuilder writeBuilder, String location, FileFormat format) { + this.writeBuilder = writeBuilder; + this.location = location; + this.format = format; + } + + @Override + public B set(String property, String value) { + writeBuilder.set(property, value); + return self(); + } + + @Override + public B meta(String property, String value) { + writeBuilder.meta(property, value); + return self(); + } + + @Override + public B metricsConfig(MetricsConfig metricsConfig) { + writeBuilder.metricsConfig(metricsConfig); + return self(); + } + + @Override + public B overwrite() { + writeBuilder.overwrite(); + return self(); + } + + @Override + public B withFileEncryptionKey(ByteBuffer encryptionKey) { + writeBuilder.withFileEncryptionKey(encryptionKey); + return self(); + } + + @Override + public B withAADPrefix(ByteBuffer aadPrefix) { + writeBuilder.withAADPrefix(aadPrefix); + return self(); + } + + @Override + public B spec(PartitionSpec newSpec) { + this.spec = newSpec; + return self(); + } + + @Override + public B partition(StructLike newPartition) { + this.partition = newPartition; + return self(); + } + + @Override + public B keyMetadata(EncryptionKeyMetadata newKeyMetadata) { + this.keyMetadata = newKeyMetadata; + return self(); + } + + @Override + public B sortOrder(SortOrder newSortOrder) { + this.sortOrder = newSortOrder; + return self(); + } + + private static class DataFileWriteBuilder + extends CommonWriteBuilderImpl, D, S> + implements DataWriteBuilder { + private DataFileWriteBuilder( + WriteBuilder writeBuilder, String location, FileFormat format) { + super(writeBuilder, location, format); + } + + @Override + public DataFileWriteBuilder schema(Schema schema) { + super.writeBuilder.schema(schema); + return this; + } + + @Override + public DataFileWriteBuilder inputSchema(S schema) { + super.writeBuilder.inputSchema(schema); + return 
this; + } + + @Override + public DataFileWriteBuilder self() { + return this; + } + + @Override + public DataWriter build() throws IOException { + Preconditions.checkArgument(super.spec != null, "Cannot create data writer without spec"); + Preconditions.checkArgument( + super.spec.isUnpartitioned() || super.partition != null, + "Partition must not be null when creating data writer for partitioned spec"); + + return new DataWriter<>( + super.writeBuilder.build(), + super.format, + super.location, + super.spec, + super.partition, + super.keyMetadata, + super.sortOrder); + } + } + + private static class EqualityDeleteFileWriteBuilder + extends CommonWriteBuilderImpl, D, S> + implements EqualityDeleteWriteBuilder { + private Schema rowSchema = null; + private int[] equalityFieldIds = null; + + private EqualityDeleteFileWriteBuilder( + WriteBuilder writeBuilder, String location, FileFormat format) { + super(writeBuilder, location, format); + } + + @Override + public EqualityDeleteFileWriteBuilder inputSchema(S schema) { + super.writeBuilder.inputSchema(schema); + return this; + } + + @Override + public EqualityDeleteFileWriteBuilder self() { + return this; + } + + @Override + public EqualityDeleteFileWriteBuilder rowSchema(Schema schema) { + this.rowSchema = schema; + return this; + } + + @Override + public EqualityDeleteFileWriteBuilder equalityFieldIds(int... fieldIds) { + this.equalityFieldIds = fieldIds; + return this; + } + + @Override + public EqualityDeleteWriter build() throws IOException { + Preconditions.checkState( + rowSchema != null, "Cannot create equality delete file without a schema"); + Preconditions.checkState( + equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); + Preconditions.checkArgument( + super.spec != null, "Spec must not be null when creating equality delete writer"); + Preconditions.checkArgument( + super.spec.isUnpartitioned() || super.partition != null, + "Partition must not be null for partitioned writes"); + + return new EqualityDeleteWriter<>( + super.writeBuilder + .schema(rowSchema) + .meta("delete-type", "equality") + .meta( + "delete-field-ids", + IntStream.of(equalityFieldIds) + .mapToObj(Objects::toString) + .collect(Collectors.joining(", "))) + .build(), + super.format, + super.location, + super.spec, + super.partition, + super.keyMetadata, + super.sortOrder, + equalityFieldIds); + } + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private static class PositionDeleteFileWriteBuilder + extends CommonWriteBuilderImpl + implements PositionDeleteWriteBuilder { + + private PositionDeleteFileWriteBuilder( + WriteBuilder writeBuilder, String location, FileFormat format) { + super(writeBuilder, location, format); + } + + @Override + public PositionDeleteFileWriteBuilder self() { + return this; + } + + @Override + @SuppressWarnings("unchecked") + public PositionDeleteWriter build() throws IOException { + Preconditions.checkArgument( + super.spec != null, "Spec must not be null when creating position delete writer"); + Preconditions.checkArgument( + super.spec.isUnpartitioned() || super.partition != null, + "Partition must not be null for partitioned writes"); + + return new PositionDeleteWriter<>( + new PositionDeleteFileAppender( + super.writeBuilder.meta("delete-type", "position").build()), + super.format, + super.location, + super.spec, + super.partition, + super.keyMetadata); + } + } + + @SuppressWarnings("rawtypes") + private static class PositionDeleteFileAppender implements FileAppender { + private final 
FileAppender appender; + + PositionDeleteFileAppender(FileAppender appender) { + this.appender = appender; + } + + @Override + public void add(StructLike positionDelete) { + appender.add((PositionDelete) positionDelete); + } + + @Override + public Metrics metrics() { + return appender.metrics(); + } + + @Override + public long length() { + return appender.length(); + } + + @Override + public void close() throws IOException { + appender.close(); + } + + @Override + public List splitOffsets() { + return appender.splitOffsets(); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java new file mode 100644 index 000000000000..d81734794874 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.DataWriter; + +/** + * A specialized builder for creating data content file writers. + * + *

This builder extends the generic {@link CommonWriteBuilder} interface with functionality + * specific to creating {@link DataWriter} instances. Data writers produce table content files + * containing actual data records stored in an Iceberg table, configured according to the table's + * schema and partition specification. + * + * @param the type of data records the writer will accept + * @param the type of the schema for the input data + */ +public interface DataWriteBuilder extends CommonWriteBuilder> { + + /** Set the file schema. */ + DataWriteBuilder schema(Schema schema); + + /** + * Sets the input schema accepted by the writer. If not provided, it is derived from the {@link + * #schema(Schema)}. + */ + DataWriteBuilder inputSchema(S schema); + + /** + * Creates a data file writer configured with the current builder settings. + * + *

The returned {@link DataWriter} produces files that conform to the Iceberg table format, + * generating proper {@link DataFile} metadata on completion. The writer accepts input records + * exactly matching the Iceberg schema specified via {@link #schema(Schema)} for writing. + * + * @return a fully configured {@link DataWriter} instance + * @throws IOException if the writer cannot be created due to I/O errors + */ + DataWriter build() throws IOException; +} diff --git a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java new file mode 100644 index 000000000000..25b38da22159 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import java.util.List; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.Schema; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.util.ArrayUtil; + +/** + * A specialized builder for creating equality-based delete file writers. + * + *
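A hedged sketch of obtaining the {@link DataWriter} described above through FormatModelRegistry (Record.class stands in for the generic object model assumed to be registered by GenericFormatModels; the generic parameters and helper arguments are assumptions, since this patch only defines the interfaces):

  // create a DataWriter for generic Records via the registry
  static DataWriter<Record> newDataWriter(
      EncryptedOutputFile outputFile, Schema tableSchema, PartitionSpec spec, StructLike partition)
      throws IOException {
    return FormatModelRegistry.dataWriteBuilder(FileFormat.PARQUET, Record.class, outputFile)
        .schema(tableSchema) // Iceberg schema of the data file
        .spec(spec) // required by build()
        .partition(partition) // required when the spec is partitioned
        .overwrite()
        .build();
  }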

This builder extends the generic {@link CommonWriteBuilder} interface with functionality + * specific to creating {@link EqualityDeleteWriter} instances. + * + *

The builder provides methods to configure which fields should be used for equality comparison + * through {@link #equalityFieldIds(List)} or {@link #equalityFieldIds(int...)}, along with schema + * configuration for the delete records. + * + * @param the type of data records the writer will accept + * @param the type of the schema for the input data + */ +public interface EqualityDeleteWriteBuilder + extends CommonWriteBuilder> { + + /** + * Sets the input schema accepted by the writer. If not provided, it is derived from the {@link + * #rowSchema(Schema)}. + */ + EqualityDeleteWriteBuilder inputSchema(S schema); + + /** Sets the row schema for the delete writers. */ + EqualityDeleteWriteBuilder rowSchema(Schema rowSchema); + + /** Sets the equality field ids for the equality delete writer. */ + default EqualityDeleteWriteBuilder equalityFieldIds(List fieldIds) { + return equalityFieldIds(ArrayUtil.toIntArray(fieldIds)); + } + + /** Sets the equality field ids for the equality delete writer. */ + EqualityDeleteWriteBuilder equalityFieldIds(int... fieldIds); + + /** + * Creates an equality-based delete file writer configured with the current builder settings. + * + *

The returned {@link EqualityDeleteWriter} produces files that identify records to be deleted + * based on field equality, generating proper {@link DeleteFile} metadata on completion. + * + *

The writer accepts input records exactly matching the input schema specified via {@link + * #rowSchema(Schema)} for deletion. + * + * @return a fully configured {@link EqualityDeleteWriter} instance + * @throws IOException if the writer cannot be created due to I/O errors + */ + EqualityDeleteWriter build() throws IOException; +} diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModel.java b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java new file mode 100644 index 000000000000..84ca467b627e --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.io.InputFile; + +/** + * Interface that provides a unified abstraction for converting between data file formats and + * input/output data representations. + * + *

{@link FormatModel} serves as a bridge between storage formats ({@link FileFormat}) and + * expected input/output data structures, optimizing performance through direct conversion without + * intermediate representations. File format implementations handle the low-level parsing details + * while the object model determines the in-memory representation used for the parsed data. + * Together, these provide a consistent API for consuming data files while optimizing for specific + * processing engines. + * + *
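Purely as a hedged sketch of what an engine-side implementation of this interface could look like (MyRow, MyRowSchema, and the two builder classes are hypothetical, and the generic parameters are reconstructed here because the rendered patch drops angle brackets):

  public class MyRowAvroFormatModel implements FormatModel<MyRow, MyRowSchema> {
    @Override
    public FileFormat format() {
      return FileFormat.AVRO;
    }

    @Override
    public Class<MyRow> type() {
      return MyRow.class;
    }

    @Override
    public Class<MyRowSchema> schemaType() {
      return MyRowSchema.class;
    }

    @Override
    public WriteBuilder<MyRow, MyRowSchema> writeBuilder(EncryptedOutputFile outputFile) {
      // delegate to an engine-specific builder that converts MyRow values into Avro records
      return new MyRowAvroWriteBuilder(outputFile);
    }

    @Override
    public ReadBuilder<MyRow, MyRowSchema> readBuilder(InputFile inputFile) {
      // delegate to an engine-specific builder that produces MyRow values from Avro records
      return new MyRowAvroReadBuilder(inputFile);
    }
  }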

Iceberg provides some built-in object models, and processing engines can implement custom + * object models to integrate with Iceberg's file reading and writing capabilities. + * + * @param output type used for reading data, and input type for writing data and deletes + * @param the type of the schema for the input/output data + */ +public interface FormatModel { + /** The file format which is read/written by the object model. */ + FileFormat format(); + + /** + * Return the row type class for the object model implementation processed by this factory. + * + *

The model types act as a contract specifying the expected data structures for both reading + * (converting file formats into output objects) and writing (converting input objects into file + * formats). This ensures proper integration between Iceberg's storage layer and processing + * engines. + * + *

Processing engines can define their own object models by implementing this interface and + * using their own model name. They can register these models with Iceberg by using the {@link + * FormatModelRegistry}. This allows custom data representations to be seamlessly integrated with + * Iceberg's file format handlers. + * + * @return the type of the data structures handled by this model implementation + */ + Class type(); + + /** + * Return the schema type class for the object model implementation processed by this factory. + * + * @return the type of the schema for the data structures handled by this model implementation + */ + Class schemaType(); + + /** + * Creates a writer builder for data files. + * + *

The returned {@link WriteBuilder} configures and creates a writer that converts input + * objects into the file format supported by this factory. + * + * @param outputFile destination for the written data + * @return configured writer builder + */ + WriteBuilder writeBuilder(EncryptedOutputFile outputFile); + + /** + * Creates a file reader builder for the specified input file. + * + *

The returned {@link ReadBuilder} configures and creates a reader that converts data from the + * file format into output objects supported by this factory. + * + * @param inputFile source file to read from + * @return configured reader builder for the specified input + */ + ReadBuilder readBuilder(InputFile inputFile); +} diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java new file mode 100644 index 000000000000..df78b4a2a7b7 --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.util.List; +import java.util.Map; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.common.DynMethods; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.util.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A registry that manages file-format-specific readers and writers through a unified object model + * factory interface. + * + *

This registry provides access to {@link ReadBuilder}s for data consumption and various writer + * builders: + * + *

+ * <ul>
+ *   <li>{@link WriteBuilder} for basic file writing,
+ *   <li>{@link DataWriteBuilder} for data files,
+ *   <li>{@link EqualityDeleteWriteBuilder} for equality deletes,
+ *   <li>{@link PositionDeleteWriteBuilder} for position deletes.
+ * </ul>
+ *
+ * The appropriate builder is selected based on {@link FileFormat} and object model name.
+ *
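As a hedged, caller-side sketch of the read path (mirroring the ArrowReader change earlier in this patch, but assuming the generic Record model registered by GenericFormatModels; the generic parameters and method arguments are assumptions):

  // open a Parquet file as generic Records through the registry
  static CloseableIterable<Record> openParquet(
      InputFile inputFile, Schema expectedSchema, long start, long length, Expression residual) {
    return FormatModelRegistry.readBuilder(FileFormat.PARQUET, Record.class, inputFile)
        .project(expectedSchema)
        .split(start, length) // read only [start, start + length)
        .caseSensitive(true)
        .filter(residual) // best-effort pushdown; callers must still re-apply the filter
        .build();
  }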

{@link FormatModel} objects are registered through {@link #register(FormatModel)} and used for + * creating readers and writers. Read builders are returned directly from the factory. Write + * builders may be wrapped in specialized content file writer implementations depending on the + * requested builder type. + */ +public final class FormatModelRegistry { + private static final Logger LOG = LoggerFactory.getLogger(FormatModelRegistry.class); + // The list of classes which are used for registering the reader and writer builders + private static final List CLASSES_TO_REGISTER = + ImmutableList.of( + "org.apache.iceberg.data.GenericFormatModels", + "org.apache.iceberg.arrow.vectorized.ArrowFormatModels", + "org.apache.iceberg.flink.data.FlinkFormatModels", + "org.apache.iceberg.spark.source.SparkFormatModels"); + + // Format models indexed by file format and object model class + private static final Map>, FormatModel> MODELS = + Maps.newConcurrentMap(); + + static { + registerSupportedFormats(); + } + + /** + * Registers an {@link FormatModel} in this registry. + * + *

The {@link FormatModel} creates readers and writers for a specific combination of file + * format (Parquet, ORC, Avro) and object model (for example: "generic", "spark", "flink", etc.). + * Registering custom factories allows integration of new data processing engines for the + * supported file formats with Iceberg's file access mechanisms. + * + *
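For example, a hypothetical engine module (only the register call itself comes from this patch) would typically expose a static entry point that the registry can invoke reflectively, in the same style as ArrowFormatModels above:

  public class MyEngineFormatModels {
    private MyEngineFormatModels() {}

    public static void register() {
      // one model per (file format, row type) pair; duplicate registrations are rejected
      FormatModelRegistry.register(new MyRowAvroFormatModel());
      FormatModelRegistry.register(new MyRowParquetFormatModel());
    }
  }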

Each factory must be uniquely identified by its combination of file format and object model + * name. This uniqueness constraint prevents ambiguity when selecting factories for read and write + * operations. + * + * @param formatModel the factory implementation to register + * @throws IllegalArgumentException if a factory is already registered for the combination of + * {@link FormatModel#format()} and {@link FormatModel#type()} + */ + public static synchronized void register(FormatModel formatModel) { + Pair> key = Pair.of(formatModel.format(), formatModel.type()); + + FormatModel existing = MODELS.get(key); + Preconditions.checkArgument( + existing == null, + "Cannot register %s: %s is registered for format=%s type=%s schemaType=%s", + formatModel.getClass(), + existing == null ? null : existing.getClass(), + key.first(), + key.second(), + existing == null ? null : existing.schemaType()); + + MODELS.put(key, formatModel); + } + + /** + * Returns a reader builder for the specified file format and object model. + * + *

The returned {@link ReadBuilder} provides a fluent interface for configuring how data is + * read from the input file and converted to the output objects. + * + * @param format the file format (Parquet, Avro, ORC) that determines the parsing implementation + * @param type the output type + * @param inputFile source file to read data from + * @param the type of data records the reader will produce + * @param the type of the output schema for the reader + * @return a configured reader builder for the specified format and object model + */ + public static ReadBuilder readBuilder( + FileFormat format, Class type, InputFile inputFile) { + FormatModel factory = factoryFor(format, type); + return factory.readBuilder(inputFile); + } + + /** + * Returns a writer builder for generating a {@link DataFile}. + * + *

The returned builder produces a writer that accepts records defined by the specified object + * model and persists them using the provided file format. Unlike basic writers, this writer + * collects file metadata during the writing process and generates a {@link DataFile} that can be + * used for table operations. + * + * @param format the file format used for writing + * @param type the input type + * @param outputFile destination for the written data + * @param the type of data records the writer will accept + * @param the type of the input schema for the writer + * @return a configured data write builder for creating a {@link DataWriter} + */ + public static DataWriteBuilder dataWriteBuilder( + FileFormat format, Class type, EncryptedOutputFile outputFile) { + FormatModel factory = factoryFor(format, type); + return CommonWriteBuilderImpl.forDataFile( + factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + } + + /** + * Creates a writer builder for generating a {@link DeleteFile} with equality deletes. + * + *

The returned builder produces a writer that accepts records defined by the specified object + * model and persists them using the given file format. The writer persists equality delete + * records that identify rows to be deleted based on the configured equality fields, producing a + * {@link DeleteFile} that can be used for table operations. + * + * @param format the file format used for writing + * @param type the input type + * @param outputFile destination for the written data + * @param the type of data records the writer will accept + * @param the type of the input schema for the writer + * @return a configured delete write builder for creating an {@link EqualityDeleteWriter} + */ + public static EqualityDeleteWriteBuilder equalityDeleteWriteBuilder( + FileFormat format, Class type, EncryptedOutputFile outputFile) { + FormatModel factory = factoryFor(format, type); + return CommonWriteBuilderImpl.forEqualityDelete( + factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + } + + /** + * Creates a writer builder for generating a {@link DeleteFile} with position-based deletes. + * + *

The returned builder produces a writer that accepts records defined by the specified object + * model and persists them using the given file format. The writer accepts {@link PositionDelete} + * records that identify rows to be deleted by file path and position, producing a {@link + * DeleteFile} that can be used for table operations. + * + * @param format the file format used for writing + * @param outputFile destination for the written data + * @return a configured delete write builder for creating a {@link PositionDeleteWriter} + */ + @SuppressWarnings("rawtypes") + public static PositionDeleteWriteBuilder positionDeleteWriteBuilder( + FileFormat format, EncryptedOutputFile outputFile) { + FormatModel factory = factoryFor(format, PositionDelete.class); + return CommonWriteBuilderImpl.forPositionDelete( + factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + } + + @VisibleForTesting + static Map>, FormatModel> models() { + return MODELS; + } + + @SuppressWarnings("unchecked") + private static FormatModel factoryFor(FileFormat format, Class type) { + FormatModel model = (FormatModel) MODELS.get(Pair.of(format, type)); + Preconditions.checkArgument( + model != null, "Format model is not registered for format %s and type %s", format, type); + return model; + } + + @SuppressWarnings("CatchBlockLogException") + private static void registerSupportedFormats() { + // Uses dynamic methods to call the `register` for the listed classes + for (String classToRegister : CLASSES_TO_REGISTER) { + try { + DynMethods.builder("register").impl(classToRegister).buildStaticChecked().invoke(); + } catch (NoSuchMethodException e) { + // failing to register a factory is normal and does not require a stack trace + LOG.info( + "Skip registration of {}. Likely the jar is not in the classpath", classToRegister); + } + } + } + + private FormatModelRegistry() {} +} diff --git a/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java new file mode 100644 index 000000000000..ee379bfc249d --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; + +/** + * A specialized builder for creating position-based delete file writers. + * + *
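A hedged sketch of creating such a writer through the FormatModelRegistry.positionDeleteWriteBuilder method above (variable names and generic parameters are assumptions):

  // create a PositionDeleteWriter; it accepts PositionDelete records as input
  static PositionDeleteWriter<Record> newPositionDeleteWriter(
      EncryptedOutputFile outputFile, PartitionSpec spec, StructLike partition) throws IOException {
    return FormatModelRegistry.positionDeleteWriteBuilder(FileFormat.PARQUET, outputFile)
        .spec(spec)
        .partition(partition)
        .build();
  }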

This builder extends the generic {@link CommonWriteBuilder} interface with functionality + * specific to creating {@link PositionDeleteWriter} instances. + */ +public interface PositionDeleteWriteBuilder extends CommonWriteBuilder { + + /** + * Creates a position-based delete file writer configured with the current builder settings. + * + *

The returned {@link PositionDeleteWriter} produces files that identify records to be deleted + * by their file path and position, generating proper {@link DeleteFile} metadata on completion. + * The writer expects {@link PositionDelete} records as input. + * + * @param Only kept for backwards compatibility, the writer expects {@link PositionDelete} + * records as input, and the actual row data is not used. + * @return a fully configured {@link PositionDeleteWriter} instance + * @throws IOException if the writer cannot be created due to I/O errors + */ + PositionDeleteWriter build() throws IOException; +} diff --git a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java new file mode 100644 index 000000000000..d023e2d028ab --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.util.Map; +import org.apache.iceberg.Schema; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.mapping.NameMapping; + +/** + * Builder interface for creating file readers across supported data file formats. The {@link + * FormatModel} implementations provides appropriate {@link ReadBuilder} instances + * + *

The {@link ReadBuilder} follows the builder pattern to configure and create {@link + * CloseableIterable} instances that read data from source files. Configuration options include + * schema projection, predicate filtering, record batching, and encryption settings. + * + *
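+ * <p>Illustrative usage sketch only (not itself part of the proposal); variable names are
+ * placeholders and a model registered for {@code Record} is assumed:
+ *
+ * <pre>{@code
+ * CloseableIterable<Record> records =
+ *     FormatModelRegistry.readBuilder(FileFormat.PARQUET, Record.class, inputFile)
+ *         .project(projectedSchema)
+ *         .split(start, length)
+ *         .caseSensitive(true)
+ *         .filter(residualExpression)
+ *         .reuseContainers()
+ *         .build();
+ * }</pre>
+ *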

This interface is directly exposed to users for parameterizing readers. + * + * @param the output data type produced by the reader + * @param the type of the schema for the output data type + */ +public interface ReadBuilder { + /** + * Restricts the read to the given range: [start, start + length). + * + * @param start the start position for this read + * @param length the length of the range this read should scan + */ + ReadBuilder split(long start, long length); + + /** Set the projection schema. */ + ReadBuilder project(Schema schema); + + /** + * Configures whether filtering should be case-sensitive. If the reader supports filtering, it + * must respect this setting. The default value is true. + * + * @param caseSensitive indicates if filtering is case-sensitive + */ + ReadBuilder caseSensitive(boolean caseSensitive); + + /** + * Pushes down the {@link Expression} filter for the reader to prevent reading unnecessary + * records. Some readers may not support filtering, or may only support filtering for certain + * expressions. In this case the reader might return unfiltered or partially filtered rows. It is + * the caller's responsibility to apply the filter again. + * + * @param filter the filter to set + */ + ReadBuilder filter(Expression filter); + + /** + * Set a reader configuration property which affects the reader behavior. Reader builders should + * ignore configuration keys not known for them. + * + * @param key a reader config property name + * @param value config value + * @return this for method chaining + */ + ReadBuilder set(String key, String value); + + /** + * Sets multiple reader configuration properties that affect the reader behavior. Reader builders + * should ignore configuration keys not known for them. + * + * @param properties reader config properties to set + * @return this for method chaining + */ + default ReadBuilder setAll(Map properties) { + properties.forEach(this::set); + return this; + } + + /** Enables reusing the containers returned by the reader. Decreases pressure on GC. */ + ReadBuilder reuseContainers(); + + /** Sets the batch size for vectorized readers. */ + ReadBuilder recordsPerBatch(int rowsPerBatch); + + /** + * Contains the values in the result objects which are coming from metadata and not coming from + * the data files themselves. The keys of the map are the column ids, the values are the constant + * values to be used in the result. + */ + ReadBuilder idToConstant(Map idToConstant); + + /** Sets a mapping from external schema names to Iceberg type IDs. */ + ReadBuilder withNameMapping(NameMapping nameMapping); + + /** Builds the reader. */ + CloseableIterable build(); +} diff --git a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java new file mode 100644 index 000000000000..f1fee495e3da --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Map; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.io.FileAppender; + +/** + * Builder interface for creating file writers across supported data file formats. The {@link + * FormatModel} implementations provide the appropriate {@link WriteBuilder} instances. + * + *

The {@link WriteBuilder} follows the builder pattern to configure and create {@link + * FileAppender} instances that write data to the target output files. + * + *
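+ * <p>Illustrative usage sketch only (not itself part of the proposal); it assumes {@code
+ * writeBuilder} was obtained from a {@link FormatModel} registered for the desired format:
+ *
+ * <pre>{@code
+ * FileAppender<Record> appender =
+ *     writeBuilder
+ *         .schema(icebergSchema)
+ *         .meta("created-by", "example")
+ *         .metricsConfig(MetricsConfig.getDefault())
+ *         .overwrite()
+ *         .build();
+ * }</pre>
+ *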

This interface is directly exposed to users for parameterizing when only an appender is + * required. + * + * @param the output data type produced by the reader + * @param the type of the schema for the output data type + */ +public interface WriteBuilder { + /** Set the file schema. */ + WriteBuilder schema(Schema schema); + + /** + * Sets the input schema accepted by the writer. If not provided derived from the {@link + * #schema(Schema)}. + */ + WriteBuilder inputSchema(S schema); + + /** + * Set a writer configuration property which affects the writer behavior. Writer builders should + * ignore configuration keys not known for them. + * + * @param property a writer config property name + * @param value config value + * @return this for method chaining + */ + WriteBuilder set(String property, String value); + + /** + * Sets multiple writer configuration properties that affect the writer behavior. Writer builders + * should ignore configuration keys not known for them. + * + * @param properties writer config properties to set + * @return this for method chaining + */ + default WriteBuilder setAll(Map properties) { + properties.forEach(this::set); + return this; + } + + /** + * Set a file metadata property in the created file. + * + * @param property a file metadata property name + * @param value config value + * @return this for method chaining + */ + WriteBuilder meta(String property, String value); + + /** + * Sets multiple file metadata properties in the created file. + * + * @param properties file metadata properties to set + * @return this for method chaining + */ + default WriteBuilder meta(Map properties) { + properties.forEach(this::meta); + return this; + } + + /** + * Based on the target file content the generated {@link FileAppender} needs different + * configuration. + */ + WriteBuilder content(FileContent content); + + /** Sets the metrics configuration used for collecting column metrics for the created file. */ + WriteBuilder metricsConfig(MetricsConfig metricsConfig); + + /** Overwrite the file if it already exists. By default, overwrite is disabled. */ + WriteBuilder overwrite(); + + /** + * Sets the encryption key used for writing the file. If the writer does not support encryption, + * then an exception should be thrown. + */ + WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey); + + /** + * Sets the additional authentication data (AAD) prefix used for writing the file. If the writer + * does not support encryption, then an exception should be thrown. + */ + WriteBuilder withAADPrefix(ByteBuffer aadPrefix); + + /** Finalizes the configuration and builds the {@link FileAppender}. */ + FileAppender build() throws IOException; +} diff --git a/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java b/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java new file mode 100644 index 000000000000..24e168d3131b --- /dev/null +++ b/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.assertj.core.api.AssertionsForInterfaceTypes.assertThat; + +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.util.Pair; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +class TestFormatModelRegistry { + + @BeforeEach + void clearRegistry() { + FormatModelRegistry.models().clear(); + } + + @Test + void testSuccessfulRegister() { + FormatModel model = new DummyParquetFormatModel(Object.class, Object.class); + FormatModelRegistry.register(model); + assertThat(FormatModelRegistry.models()) + .containsEntry(Pair.of(FileFormat.PARQUET, Object.class), model); + } + + /** Tests that registering the same class with the same configuration updates the registration. */ + @Test + void testRegistrationForDifferentType() { + FormatModel model1 = new DummyParquetFormatModel(Object.class, Object.class); + FormatModel model2 = new DummyParquetFormatModel(Long.class, Object.class); + FormatModelRegistry.register(model1); + assertThat(FormatModelRegistry.models().get(Pair.of(FileFormat.PARQUET, model1.type()))) + .isSameAs(model1); + + // Registering a new model with the different format will succeed + FormatModelRegistry.register(model2); + assertThat(FormatModelRegistry.models().get(Pair.of(FileFormat.PARQUET, model1.type()))) + .isSameAs(model1); + assertThat(FormatModelRegistry.models().get(Pair.of(FileFormat.PARQUET, model2.type()))) + .isSameAs(model2); + } + + /** + * Tests that registering different classes, or different schema type for the same file format and + * type is failing. 
+ */ + @Test + void testFailingReRegistrations() { + FormatModel model = new DummyParquetFormatModel(Object.class, Object.class); + FormatModelRegistry.register(model); + assertThat(FormatModelRegistry.models()) + .containsEntry(Pair.of(FileFormat.PARQUET, Object.class), model); + + // Registering a new model with different schema type should fail + assertThatThrownBy( + () -> + FormatModelRegistry.register( + new DummyParquetFormatModel(Object.class, String.class))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot register class"); + + // Registering a new model with null schema type should fail + assertThatThrownBy( + () -> FormatModelRegistry.register(new DummyParquetFormatModel(Object.class, null))) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("Cannot register class"); + } + + private static class DummyParquetFormatModel implements FormatModel { + private final Class type; + private final Class schemaType; + + private DummyParquetFormatModel(Class type, Class schemaType) { + this.type = type; + this.schemaType = schemaType; + } + + @Override + public FileFormat format() { + return FileFormat.PARQUET; + } + + @Override + @SuppressWarnings("unchecked") + public Class type() { + return (Class) type; + } + + @Override + @SuppressWarnings("unchecked") + public Class schemaType() { + return (Class) schemaType; + } + + @Override + public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + return null; + } + + @Override + public ReadBuilder readBuilder(InputFile inputFile) { + return null; + } + } +} diff --git a/data/src/main/java/org/apache/iceberg/data/BaseDeleteLoader.java b/data/src/main/java/org/apache/iceberg/data/BaseDeleteLoader.java index 99f5c742d37c..8dbb9dd44b8e 100644 --- a/data/src/main/java/org/apache/iceberg/data/BaseDeleteLoader.java +++ b/data/src/main/java/org/apache/iceberg/data/BaseDeleteLoader.java @@ -30,24 +30,18 @@ import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.avro.PlannedDataReader; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.deletes.Deletes; import org.apache.iceberg.deletes.PositionDeleteIndex; import org.apache.iceberg.deletes.PositionDeleteIndexUtil; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.io.RangeReadable; import org.apache.iceberg.io.SeekableInputStream; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.orc.OrcRowReader; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.parquet.ParquetValueReader; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; @@ -58,8 +52,6 @@ import org.apache.iceberg.util.StructLikeSet; import org.apache.iceberg.util.Tasks; import org.apache.iceberg.util.ThreadPools; -import org.apache.orc.TypeDescription; -import org.apache.parquet.schema.MessageType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ 
-229,44 +221,9 @@ private CloseableIterable openDeletes( LOG.trace("Opening delete file {}", deleteFile.location()); InputFile inputFile = loadInputFile.apply(deleteFile); - switch (format) { - case AVRO: - return Avro.read(inputFile) - .project(projection) - .reuseContainers() - .createResolvingReader(PlannedDataReader::create) - .build(); - - case PARQUET: - return Parquet.read(inputFile) - .project(projection) - .filter(filter) - .reuseContainers() - .createReaderFunc(newParquetReaderFunc(projection)) - .build(); - - case ORC: - // reusing containers is automatic for ORC, no need to call 'reuseContainers' - return ORC.read(inputFile) - .project(projection) - .filter(filter) - .createReaderFunc(newOrcReaderFunc(projection)) - .build(); - - default: - throw new UnsupportedOperationException( - String.format( - "Cannot read deletes, %s is not a supported file format: %s", - format.name(), inputFile.location())); - } - } - - private Function> newParquetReaderFunc(Schema projection) { - return fileSchema -> GenericParquetReaders.buildReader(projection, fileSchema); - } - - private Function> newOrcReaderFunc(Schema projection) { - return fileSchema -> GenericOrcReader.buildReader(projection, fileSchema); + ReadBuilder builder = + FormatModelRegistry.readBuilder(format, Record.class, inputFile); + return builder.project(projection).reuseContainers().filter(filter).build(); } private Iterable execute(Iterable objects, Function func) { diff --git a/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java index 55f3b5701e0b..486ea99f7aa6 100644 --- a/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/BaseFileWriterFactory.java @@ -40,7 +40,13 @@ import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -/** A base writer factory to be extended by query engine integrations. */ +/** + * A base writer factory to be extended by query engine integrations. + * + * @deprecated deprecated as of version 1.11.0 and will be removed in 1.12.0. Use {@link + * RegistryBasedFileWriterFactory} + */ +@Deprecated public abstract class BaseFileWriterFactory implements FileWriterFactory, Serializable { private final Table table; private final FileFormat dataFileFormat; @@ -75,13 +81,6 @@ protected BaseFileWriterFactory( this.positionDeleteRowSchema = null; } - /** - * @deprecated This constructor is deprecated as of version 1.11.0 and will be removed in 1.12.0. - * Position deletes that include row data are no longer supported. Use {@link - * #BaseFileWriterFactory(Table, FileFormat, Schema, SortOrder, FileFormat, int[], Schema, - * SortOrder, Map)} instead. 
- */ - @Deprecated protected BaseFileWriterFactory( Table table, FileFormat dataFileFormat, diff --git a/data/src/main/java/org/apache/iceberg/data/GenericFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/GenericFileWriterFactory.java index e6872cc6e136..8d9b43f89685 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericFileWriterFactory.java @@ -22,21 +22,36 @@ import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; +import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Map; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; import org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.data.avro.DataWriter; import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -public class GenericFileWriterFactory extends BaseFileWriterFactory { +public class GenericFileWriterFactory extends RegistryBasedFileWriterFactory { + private static final Logger LOG = LoggerFactory.getLogger(GenericFileWriterFactory.class); + + private Table table; + private FileFormat format; + private Schema positionDeleteRowSchema; GenericFileWriterFactory( Table table, @@ -50,13 +65,16 @@ public class GenericFileWriterFactory extends BaseFileWriterFactory { super( table, dataFileFormat, + Record.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - ImmutableMap.of()); + ImmutableMap.of(), + null, + null); } /** @@ -80,14 +98,19 @@ public class GenericFileWriterFactory extends BaseFileWriterFactory { super( table, dataFileFormat, + Record.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - positionDeleteRowSchema, - writerProperties); + writerProperties, + null, + null); + this.table = table; + this.format = dataFileFormat; + this.positionDeleteRowSchema = positionDeleteRowSchema; } /** @@ -107,62 +130,166 @@ public class GenericFileWriterFactory extends BaseFileWriterFactory { super( table, dataFileFormat, + Record.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - positionDeleteRowSchema); + ImmutableMap.of(), + dataSchema, + equalityDeleteRowSchema); + this.table = table; + this.format = dataFileFormat; + this.positionDeleteRowSchema = positionDeleteRowSchema; } static Builder builderFor(Table table) { return new Builder(table); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. 
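+   * <p>The writer function that used to be configured here is now supplied when the
+   * corresponding format model is registered, see {@code GenericFormatModels#register()}.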
+ */ + @Deprecated protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(DataWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(DataWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(DataWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(GenericParquetWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(GenericParquetWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc(GenericParquetWriter::create); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc(GenericOrcWriter::buildWriter); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(GenericOrcWriter::buildWriter); + throwUnsupportedOperationException(); } - @Override + /** + * @deprecated Since 1.10.0, will be removed in 1.11.0. It won't be called starting 1.10.0 as the + * configuration is done by the {@link FormatModelRegistry}. + */ + @Deprecated protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(GenericOrcWriter::buildWriter); + throwUnsupportedOperationException(); + } + + private void throwUnsupportedOperationException() { + throw new UnsupportedOperationException( + "Method is deprecated and should not be called. 
" + + "Configuration is already done by the registry."); + } + + @Override + public PositionDeleteWriter newPositionDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + if (positionDeleteRowSchema == null) { + return super.newPositionDeleteWriter(file, spec, partition); + } else { + LOG.info( + "Deprecated feature used. Position delete row schema is used to create the position delete writer."); + MetricsConfig metricsConfig = + table != null + ? MetricsConfig.forPositionDelete(table) + : MetricsConfig.fromProperties(ImmutableMap.of()); + + try { + switch (format) { + case AVRO: + return Avro.writeDeletes(file) + .createWriterFunc(DataWriter::create) + .withPartition(partition) + .overwrite() + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .buildPositionWriter(); + + case ORC: + return ORC.writeDeletes(file) + .createWriterFunc(GenericOrcWriter::buildWriter) + .withPartition(partition) + .overwrite() + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .buildPositionWriter(); + + case PARQUET: + return Parquet.writeDeletes(file) + .createWriterFunc(GenericParquetWriter::create) + .withPartition(partition) + .overwrite() + .metricsConfig(metricsConfig) + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .buildPositionWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to create new position delete writer", e); + } + } } public static class Builder { diff --git a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java new file mode 100644 index 000000000000..540b98c483e9 --- /dev/null +++ b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.data; + +import org.apache.iceberg.Schema; +import org.apache.iceberg.avro.AvroFormatModel; +import org.apache.iceberg.data.avro.DataWriter; +import org.apache.iceberg.data.avro.PlannedDataReader; +import org.apache.iceberg.data.orc.GenericOrcReader; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.data.parquet.GenericParquetReaders; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.orc.ORCFormatModel; +import org.apache.iceberg.parquet.ParquetFormatModel; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class GenericFormatModels { + private static final Logger LOG = LoggerFactory.getLogger(GenericFormatModels.class); + + public static void register() { + // ORC, Parquet are optional dependencies. If they are not present, we should just log and + // ignore NoClassDefFoundErrors + registerAvro(); + registerParquet(); + registerOrc(); + } + + private static void registerParquet() { + logAngIgnoreNoClassDefFoundError( + () -> + FormatModelRegistry.register( + new ParquetFormatModel<>( + Record.class, + Schema.class, + GenericParquetReaders::buildReader, + (schema, messageType, inputType) -> + GenericParquetWriter.create(schema, messageType)))); + logAngIgnoreNoClassDefFoundError( + () -> FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class))); + } + + private static void registerAvro() { + logAngIgnoreNoClassDefFoundError( + () -> + FormatModelRegistry.register( + new AvroFormatModel<>( + Record.class, + Schema.class, + PlannedDataReader::create, + (schema, inputSchema) -> DataWriter.create(schema)))); + logAngIgnoreNoClassDefFoundError( + () -> FormatModelRegistry.register(new AvroFormatModel<>(PositionDelete.class))); + } + + private static void registerOrc() { + logAngIgnoreNoClassDefFoundError( + () -> + FormatModelRegistry.register( + new ORCFormatModel<>( + Record.class, + Schema.class, + GenericOrcReader::buildReader, + (schema, typeDescription, unused) -> + GenericOrcWriter.buildWriter(schema, typeDescription)))); + logAngIgnoreNoClassDefFoundError( + () -> FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class))); + } + + private GenericFormatModels() {} + + @SuppressWarnings("CatchBlockLogException") + private static void logAngIgnoreNoClassDefFoundError(Runnable runnable) { + try { + runnable.run(); + } catch (NoClassDefFoundError e) { + // Log the exception and ignore it + LOG.info("Exception occurred when trying to register format models: {}", e.getMessage()); + } + } +} diff --git a/data/src/main/java/org/apache/iceberg/data/GenericReader.java b/data/src/main/java/org/apache/iceberg/data/GenericReader.java index 9a1455f80fb0..f18f5785105f 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericReader.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericReader.java @@ -22,26 +22,19 @@ import java.util.Map; import org.apache.iceberg.CombinedScanTask; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.TableScan; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.avro.PlannedDataReader; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.expressions.Evaluator; import 
org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableGroup; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Iterables; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.PartitionUtil; class GenericReader implements Serializable { @@ -96,58 +89,19 @@ private CloseableIterable openFile(FileScanTask task, Schema fileProject Map partition = PartitionUtil.constantsMap(task, IdentityPartitionConverters::convertConstant); - switch (task.file().format()) { - case AVRO: - Avro.ReadBuilder avro = - Avro.read(input) - .project(fileProjection) - .createResolvingReader(schema -> PlannedDataReader.create(schema, partition)) - .split(task.start(), task.length()); - - if (reuseContainers) { - avro.reuseContainers(); - } - - return avro.build(); - - case PARQUET: - Parquet.ReadBuilder parquet = - Parquet.read(input) - .project(fileProjection) - .createReaderFunc( - fileSchema -> - GenericParquetReaders.buildReader(fileProjection, fileSchema, partition)) - .split(task.start(), task.length()) - .caseSensitive(caseSensitive) - .filter(task.residual()); - - if (reuseContainers) { - parquet.reuseContainers(); - } - - return parquet.build(); - - case ORC: - Schema projectionWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - fileProjection, Sets.union(partition.keySet(), MetadataColumns.metadataFieldIds())); - ORC.ReadBuilder orc = - ORC.read(input) - .project(projectionWithoutConstantAndMetadataFields) - .createReaderFunc( - fileSchema -> - GenericOrcReader.buildReader(fileProjection, fileSchema, partition)) - .split(task.start(), task.length()) - .caseSensitive(caseSensitive) - .filter(task.residual()); - - return orc.build(); - - default: - throw new UnsupportedOperationException( - String.format( - "Cannot read %s file: %s", task.file().format().name(), task.file().location())); + ReadBuilder builder = + FormatModelRegistry.readBuilder(task.file().format(), Record.class, input); + if (reuseContainers) { + builder = builder.reuseContainers(); } + + return builder + .project(fileProjection) + .idToConstant(partition) + .split(task.start(), task.length()) + .caseSensitive(caseSensitive) + .filter(task.residual()) + .build(); } private class CombinedTaskIterable extends CloseableGroup implements CloseableIterable { diff --git a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java new file mode 100644 index 000000000000..57ceb2f4c565 --- /dev/null +++ b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java @@ -0,0 +1,184 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.data; + +import java.io.IOException; +import java.io.Serializable; +import java.io.UncheckedIOException; +import java.util.Map; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.Table; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.encryption.EncryptionKeyMetadata; +import org.apache.iceberg.formats.DataWriteBuilder; +import org.apache.iceberg.formats.EqualityDeleteWriteBuilder; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.PositionDeleteWriteBuilder; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; + +/** + * A base writer factory to be extended by query engine integrations. + * + * @param row type + */ +public abstract class RegistryBasedFileWriterFactory + implements FileWriterFactory, Serializable { + private final Table table; + private final FileFormat dataFileFormat; + private final Class inputType; + private final Schema dataSchema; + private final SortOrder dataSortOrder; + private final FileFormat deleteFileFormat; + private final int[] equalityFieldIds; + private final Schema equalityDeleteRowSchema; + private final SortOrder equalityDeleteSortOrder; + private final Map writerProperties; + private final S inputSchema; + private final S equalityDeleteInputSchema; + + protected RegistryBasedFileWriterFactory( + Table table, + FileFormat dataFileFormat, + Class inputType, + Schema dataSchema, + SortOrder dataSortOrder, + FileFormat deleteFileFormat, + int[] equalityFieldIds, + Schema equalityDeleteRowSchema, + SortOrder equalityDeleteSortOrder, + Map writerProperties, + S inputSchema, + S equalityDeleteInputSchema) { + this.table = table; + this.dataFileFormat = dataFileFormat; + this.inputType = inputType; + this.dataSchema = dataSchema; + this.dataSortOrder = dataSortOrder; + this.deleteFileFormat = deleteFileFormat; + this.equalityFieldIds = equalityFieldIds; + this.equalityDeleteRowSchema = equalityDeleteRowSchema; + this.equalityDeleteSortOrder = equalityDeleteSortOrder; + this.writerProperties = writerProperties != null ? 
writerProperties : ImmutableMap.of(); + this.inputSchema = inputSchema; + this.equalityDeleteInputSchema = equalityDeleteInputSchema; + } + + protected S inputSchema() { + return inputSchema; + } + + protected S equalityDeleteInputSchema() { + return equalityDeleteInputSchema; + } + + @Override + public DataWriter newDataWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + Preconditions.checkNotNull(dataSchema, "Data schema must not be null"); + EncryptionKeyMetadata keyMetadata = file.keyMetadata(); + Map properties = table != null ? table.properties() : ImmutableMap.of(); + MetricsConfig metricsConfig = + table != null ? MetricsConfig.forTable(table) : MetricsConfig.getDefault(); + + try { + DataWriteBuilder builder = + FormatModelRegistry.dataWriteBuilder(dataFileFormat, inputType, file); + return builder + .schema(dataSchema) + .inputSchema(inputSchema()) + .setAll(properties) + .setAll(writerProperties) + .metricsConfig(metricsConfig) + .spec(spec) + .partition(partition) + .keyMetadata(keyMetadata) + .sortOrder(dataSortOrder) + .overwrite() + .build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create new data writer", e); + } + } + + @Override + public EqualityDeleteWriter newEqualityDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + Preconditions.checkNotNull(equalityDeleteRowSchema, "Equality delete schema must not be null"); + + EncryptionKeyMetadata keyMetadata = file.keyMetadata(); + Map properties = table != null ? table.properties() : ImmutableMap.of(); + MetricsConfig metricsConfig = + table != null ? MetricsConfig.forTable(table) : MetricsConfig.getDefault(); + + try { + EqualityDeleteWriteBuilder builder = + FormatModelRegistry.equalityDeleteWriteBuilder(deleteFileFormat, inputType, file); + return builder + .setAll(properties) + .setAll(writerProperties) + .metricsConfig(metricsConfig) + .rowSchema(equalityDeleteRowSchema) + .inputSchema(equalityDeleteInputSchema()) + .equalityFieldIds(equalityFieldIds) + .spec(spec) + .partition(partition) + .keyMetadata(keyMetadata) + .sortOrder(equalityDeleteSortOrder) + .overwrite() + .build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create new equality delete writer", e); + } + } + + @Override + public PositionDeleteWriter newPositionDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + EncryptionKeyMetadata keyMetadata = file.keyMetadata(); + Map properties = table != null ? table.properties() : ImmutableMap.of(); + MetricsConfig metricsConfig = + table != null ? 
MetricsConfig.forPositionDelete(table) : MetricsConfig.forPositionDelete(); + + try { + PositionDeleteWriteBuilder builder = + FormatModelRegistry.positionDeleteWriteBuilder(deleteFileFormat, file); + return builder + .setAll(properties) + .setAll(writerProperties) + .metricsConfig(metricsConfig) + .spec(spec) + .partition(partition) + .keyMetadata(keyMetadata) + .overwrite() + .build(); + } catch (IOException e) { + throw new UncheckedIOException("Failed to create new position delete writer", e); + } + } +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java new file mode 100644 index 000000000000..ee307b7a7c71 --- /dev/null +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.flink.data; + +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.types.logical.RowType; +import org.apache.iceberg.avro.AvroFormatModel; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.orc.ORCFormatModel; +import org.apache.iceberg.parquet.ParquetFormatModel; + +public class FlinkFormatModels { + public static void register() { + FormatModelRegistry.register( + new ParquetFormatModel<>( + RowData.class, + RowType.class, + FlinkParquetReaders::buildReader, + (unused, messageType, rowType) -> + FlinkParquetWriters.buildWriter(rowType, messageType))); + + FormatModelRegistry.register( + new AvroFormatModel<>( + RowData.class, + RowType.class, + FlinkPlannedAvroReader::create, + (unused, rowType) -> new FlinkAvroWriter(rowType))); + + FormatModelRegistry.register( + new ORCFormatModel<>( + RowData.class, + RowType.class, + FlinkOrcReader::new, + (schema, unused, rowType) -> FlinkOrcWriter.buildWriter(rowType, schema))); + } + + private FlinkFormatModels() {} +} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java index 1440fde3248c..a76bac515b3d 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkSchemaVisitor.java @@ -19,6 +19,7 @@ package org.apache.iceberg.flink.data; import java.util.List; +import org.apache.flink.annotation.Internal; import org.apache.flink.table.types.logical.ArrayType; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.MapType; @@ -29,9 +30,10 @@ import org.apache.iceberg.types.Type; import org.apache.iceberg.types.Types; 
+@Internal abstract class FlinkSchemaVisitor { - static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { + public static T visit(RowType flinkType, Schema schema, FlinkSchemaVisitor visitor) { return visit(flinkType, schema.asStruct(), visitor); } @@ -94,24 +96,29 @@ private static T visitRecord( List fieldTypes = Lists.newArrayListWithExpectedSize(fieldSize); List nestedFields = struct.fields(); - for (int i = 0; i < fieldSize; i++) { - Types.NestedField iField = nestedFields.get(i); - int fieldIndex = rowType.getFieldIndex(iField.name()); - Preconditions.checkArgument( - fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); + visitor.beforeStruct(struct.asStructType()); - LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); + try { + for (int i = 0; i < fieldSize; i++) { + Types.NestedField iField = nestedFields.get(i); + int fieldIndex = rowType.getFieldIndex(iField.name()); + Preconditions.checkArgument( + fieldIndex >= 0, "NestedField: %s is not found in flink RowType: %s", iField, rowType); - fieldTypes.add(fieldFlinkType); + LogicalType fieldFlinkType = rowType.getTypeAt(fieldIndex); + fieldTypes.add(fieldFlinkType); - visitor.beforeField(iField); - try { - if (iField.type() != Types.UnknownType.get()) { - results.add(visit(fieldFlinkType, iField.type(), visitor)); + visitor.beforeField(iField); + try { + if (iField.type() != Types.UnknownType.get()) { + results.add(visit(fieldFlinkType, iField.type(), visitor)); + } + } finally { + visitor.afterField(iField); } - } finally { - visitor.afterField(iField); } + } finally { + visitor.afterStruct(struct.asStructType()); } return visitor.record(struct, results, fieldTypes); @@ -137,6 +144,10 @@ public void beforeField(Types.NestedField field) {} public void afterField(Types.NestedField field) {} + public void beforeStruct(Types.StructType type) {} + + public void afterStruct(Types.StructType type) {} + public void beforeListElement(Types.NestedField elementField) { beforeField(elementField); } diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java index b3ada41737bc..d5247941d863 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/sink/FlinkFileWriterFactory.java @@ -25,28 +25,19 @@ import java.io.Serializable; import java.util.Map; import org.apache.flink.table.data.RowData; -import org.apache.flink.table.data.StringData; import org.apache.flink.table.types.logical.RowType; import org.apache.iceberg.FileFormat; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.data.RegistryBasedFileWriterFactory; import org.apache.iceberg.flink.FlinkSchemaUtil; -import org.apache.iceberg.flink.data.FlinkAvroWriter; -import org.apache.iceberg.flink.data.FlinkOrcWriter; -import org.apache.iceberg.flink.data.FlinkParquetWriters; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -public class FlinkFileWriterFactory extends BaseFileWriterFactory implements Serializable { - private RowType dataFlinkType; 
- private RowType equalityDeleteFlinkType; - - private FlinkFileWriterFactory( +public class FlinkFileWriterFactory extends RegistryBasedFileWriterFactory + implements Serializable { + FlinkFileWriterFactory( Table table, FileFormat dataFileFormat, Schema dataSchema, @@ -62,85 +53,30 @@ private FlinkFileWriterFactory( super( table, dataFileFormat, + RowData.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - writeProperties); - - this.dataFlinkType = dataFlinkType; - this.equalityDeleteFlinkType = equalityDeleteFlinkType; - } - - static Builder builderFor(Table table) { - return new Builder(table); - } - - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignore -> new FlinkAvroWriter(dataFlinkType())); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new FlinkAvroWriter(equalityDeleteFlinkType())); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) {} - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> FlinkParquetWriters.buildWriter(dataFlinkType(), msgType)); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> FlinkParquetWriters.buildWriter(equalityDeleteFlinkType(), msgType)); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(dataFlinkType(), iSchema)); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc( - (iSchema, typDesc) -> FlinkOrcWriter.buildWriter(equalityDeleteFlinkType(), iSchema)); + writeProperties, + dataFlinkType == null ? 
FlinkSchemaUtil.convert(dataSchema) : dataFlinkType, + equalityDeleteInputSchema(equalityDeleteFlinkType, equalityDeleteRowSchema)); } - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.transformPaths(path -> StringData.fromString(path.toString())); - } - - private RowType dataFlinkType() { - if (dataFlinkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataFlinkType = FlinkSchemaUtil.convert(dataSchema()); + private static RowType equalityDeleteInputSchema(RowType rowType, Schema rowSchema) { + if (rowType != null) { + return rowType; + } else if (rowSchema != null) { + return FlinkSchemaUtil.convert(rowSchema); + } else { + return null; } - - return dataFlinkType; } - private RowType equalityDeleteFlinkType() { - if (equalityDeleteFlinkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteFlinkType = FlinkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteFlinkType; + static Builder builderFor(Table table) { + return new Builder(table); } public static class Builder { diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java index b8fb1ba32edf..586cdc415993 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/source/RowDataFileScanTaskReader.java @@ -24,10 +24,8 @@ import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.RowType; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.Schema; import org.apache.iceberg.StructLike; -import org.apache.iceberg.avro.Avro; import org.apache.iceberg.data.DeleteFilter; import org.apache.iceberg.encryption.InputFilesDecryptor; import org.apache.iceberg.expressions.Expression; @@ -35,19 +33,14 @@ import org.apache.iceberg.flink.FlinkSchemaUtil; import org.apache.iceberg.flink.FlinkSourceFilter; import org.apache.iceberg.flink.RowDataWrapper; -import org.apache.iceberg.flink.data.FlinkOrcReader; -import org.apache.iceberg.flink.data.FlinkParquetReaders; -import org.apache.iceberg.flink.data.FlinkPlannedAvroReader; import org.apache.iceberg.flink.data.RowDataProjection; import org.apache.iceberg.flink.data.RowDataUtil; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.NameMappingParser; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.PartitionUtil; @Internal @@ -73,8 +66,7 @@ public RowDataFileScanTaskReader( if (filters != null && !filters.isEmpty()) { Expression combinedExpression = filters.stream().reduce(Expressions.alwaysTrue(), Expressions::and); - this.rowFilter = - new FlinkSourceFilter(this.projectedSchema, combinedExpression, this.caseSensitive); + this.rowFilter = new FlinkSourceFilter(projectedSchema, combinedExpression, caseSensitive); } else { this.rowFilter = null; } @@ -112,23 +104,23 @@ private 
CloseableIterable newIterable( if (task.isDataTask()) { throw new UnsupportedOperationException("Cannot read data task."); } else { - switch (task.file().format()) { - case PARQUET: - iter = newParquetIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case AVRO: - iter = newAvroIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - case ORC: - iter = newOrcIterable(task, schema, idToConstant, inputFilesDecryptor); - break; - - default: - throw new UnsupportedOperationException( - "Cannot read unknown format: " + task.file().format()); + ReadBuilder builder = + FormatModelRegistry.readBuilder( + task.file().format(), RowData.class, inputFilesDecryptor.getInputFile(task)); + + if (nameMapping != null) { + builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); } + + iter = + builder + .project(schema) + .idToConstant(idToConstant) + .split(task.start(), task.length()) + .caseSensitive(caseSensitive) + .filter(task.residual()) + .reuseContainers() + .build(); } if (rowFilter != null) { @@ -137,72 +129,6 @@ private CloseableIterable newIterable( return iter; } - private CloseableIterable newAvroIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Avro.ReadBuilder builder = - Avro.read(inputFilesDecryptor.getInputFile(task)) - .reuseContainers() - .project(schema) - .split(task.start(), task.length()) - .createReaderFunc(readSchema -> FlinkPlannedAvroReader.create(schema, idToConstant)); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newParquetIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Parquet.ReadBuilder builder = - Parquet.read(inputFilesDecryptor.getInputFile(task)) - .split(task.start(), task.length()) - .project(schema) - .createReaderFunc( - fileSchema -> FlinkParquetReaders.buildReader(schema, fileSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .reuseContainers(); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - - private CloseableIterable newOrcIterable( - FileScanTask task, - Schema schema, - Map idToConstant, - InputFilesDecryptor inputFilesDecryptor) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - schema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - ORC.ReadBuilder builder = - ORC.read(inputFilesDecryptor.getInputFile(task)) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(task.start(), task.length()) - .createReaderFunc( - readOrcSchema -> new FlinkOrcReader(schema, readOrcSchema, idToConstant)) - .filter(task.residual()) - .caseSensitive(caseSensitive); - - if (nameMapping != null) { - builder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - return builder.build(); - } - private static class FlinkDeleteFilter extends DeleteFilter { private final RowType requiredRowType; private final RowDataWrapper asStructLike; diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java index da5b5f6c28f0..3d032fac64e9 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java +++ 
b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestCompressionSettings.java @@ -35,7 +35,7 @@ import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.data.RegistryBasedFileWriterFactory; import org.apache.iceberg.flink.FlinkWriteConf; import org.apache.iceberg.flink.FlinkWriteOptions; import org.apache.iceberg.flink.SimpleDataUtil; @@ -252,7 +252,7 @@ private static Map appenderProperties( .build(writerField.get()); DynFields.BoundField> propsField = DynFields.builder() - .hiddenImpl(BaseFileWriterFactory.class, "writerProperties") + .hiddenImpl(RegistryBasedFileWriterFactory.class, "writerProperties") .build(writerFactoryField.get()); return propsField.get(); } diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java index 7f4f7758e519..9f508bbe717d 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/TestIcebergStreamWriter.java @@ -34,6 +34,8 @@ import org.apache.flink.table.api.DataTypes; import org.apache.flink.table.catalog.Column; import org.apache.flink.table.catalog.ResolvedSchema; +import org.apache.flink.table.data.GenericArrayData; +import org.apache.flink.table.data.GenericMapData; import org.apache.flink.table.data.GenericRowData; import org.apache.flink.table.data.RowData; import org.apache.flink.table.types.logical.RowType; @@ -319,11 +321,32 @@ public void testTableWithTargetFileSize() throws Exception { public void testPromotedFlinkDataType() throws Exception { Schema iSchema = new Schema( + Types.NestedField.required( + 4, "array", Types.ListType.ofOptional(5, Types.IntegerType.get())), + Types.NestedField.required( + 6, + "map", + Types.MapType.ofOptional(7, 8, Types.IntegerType.get(), Types.IntegerType.get())), + Types.NestedField.required( + 9, + "struct", + Types.StructType.of( + Types.NestedField.optional(10, "struct_1", Types.IntegerType.get()), + Types.NestedField.optional(11, "struct_2", Types.IntegerType.get()))), Types.NestedField.required(1, "tinyint", Types.IntegerType.get()), Types.NestedField.required(2, "smallint", Types.IntegerType.get()), Types.NestedField.optional(3, "int", Types.IntegerType.get())); ResolvedSchema flinkSchema = ResolvedSchema.of( + Column.physical("array", DataTypes.ARRAY(DataTypes.TINYINT()).notNull()), + Column.physical( + "map", DataTypes.MAP(DataTypes.TINYINT(), DataTypes.TINYINT()).notNull()), + Column.physical( + "struct", + DataTypes.ROW( + DataTypes.FIELD("struct_1", DataTypes.TINYINT()), + DataTypes.FIELD("struct_2", DataTypes.TINYINT())) + .notNull()), Column.physical("tinyint", DataTypes.TINYINT().notNull()), Column.physical("smallint", DataTypes.SMALLINT().notNull()), Column.physical("int", DataTypes.INT().nullable())); @@ -347,16 +370,74 @@ public void testPromotedFlinkDataType() throws Exception { List rows = Lists.newArrayList( - GenericRowData.of((byte) 0x01, (short) -32768, 101), - GenericRowData.of((byte) 0x02, (short) 0, 102), - GenericRowData.of((byte) 0x03, (short) 32767, 103)); + GenericRowData.of( + new GenericArrayData(new byte[] {(byte) 0x04, (byte) 0x05}), + new GenericMapData(ImmutableMap.of((byte) 0x06, (byte) 0x07)), + GenericRowData.of((byte) 0x08, (byte) 0x09), + (byte) 0x01, + (short) -32768, + 
101), + GenericRowData.of( + new GenericArrayData(new byte[] {(byte) 0x0a, (byte) 0x0b}), + new GenericMapData(ImmutableMap.of((byte) 0x0c, (byte) 0x0d)), + GenericRowData.of((byte) 0x0e, (byte) 0x0f), + (byte) 0x02, + (short) 0, + 102), + GenericRowData.of( + new GenericArrayData(new byte[] {(byte) 0x10, (byte) 0x11}), + new GenericMapData(ImmutableMap.of((byte) 0x12, (byte) 0x13)), + GenericRowData.of((byte) 0x14, (byte) 0x15), + (byte) 0x03, + (short) 32767, + 103)); Record record = GenericRecord.create(iSchema); + Record struct = GenericRecord.create(iSchema.findField("struct").type().asStructType()); List expected = Lists.newArrayList( - record.copy(ImmutableMap.of("tinyint", 1, "smallint", -32768, "int", 101)), - record.copy(ImmutableMap.of("tinyint", 2, "smallint", 0, "int", 102)), - record.copy(ImmutableMap.of("tinyint", 3, "smallint", 32767, "int", 103))); + record.copy( + ImmutableMap.of( + "array", + Lists.newArrayList(4, 5), + "map", + ImmutableMap.of(6, 7), + "struct", + struct.copy(ImmutableMap.of("struct_1", 8, "struct_2", 9)), + "tinyint", + 1, + "smallint", + -32768, + "int", + 101)), + record.copy( + ImmutableMap.of( + "array", + Lists.newArrayList(10, 11), + "map", + ImmutableMap.of(12, 13), + "struct", + struct.copy(ImmutableMap.of("struct_1", 14, "struct_2", 15)), + "tinyint", + 2, + "smallint", + 0, + "int", + 102)), + record.copy( + ImmutableMap.of( + "array", + Lists.newArrayList(16, 17), + "map", + ImmutableMap.of(18, 19), + "struct", + struct.copy(ImmutableMap.of("struct_1", 20, "struct_2", 21)), + "tinyint", + 3, + "smallint", + 32767, + "int", + 103))); try (OneInputStreamOperatorTestHarness testHarness = createIcebergStreamWriter(icebergTable, flinkSchema)) { diff --git a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java index d17848225f69..4dbf9f3129a3 100644 --- a/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java +++ b/flink/v2.1/flink/src/test/java/org/apache/iceberg/flink/sink/dynamic/TestDynamicWriter.java @@ -33,7 +33,7 @@ import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.common.DynFields; -import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.data.RegistryBasedFileWriterFactory; import org.apache.iceberg.flink.SimpleDataUtil; import org.apache.iceberg.flink.sink.TestFlinkIcebergSinkBase; import org.apache.iceberg.io.BaseTaskWriter; @@ -280,7 +280,7 @@ private Map properties(DynamicWriter dynamicWriter) { .build(writerField.get().values().iterator().next()); DynFields.BoundField> propsField = DynFields.builder() - .hiddenImpl(BaseFileWriterFactory.class, "writerProperties") + .hiddenImpl(RegistryBasedFileWriterFactory.class, "writerProperties") .build(writerFactoryField.get()); return propsField.get(); } diff --git a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java index 58966c666d5d..175109f38252 100644 --- a/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java +++ b/mr/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java @@ -21,13 +21,9 @@ import java.io.IOException; import java.io.Serializable; import java.io.UncheckedIOException; -import java.util.Collections; import java.util.Iterator; import java.util.List; -import java.util.Map; -import 
java.util.Set; import java.util.concurrent.ExecutorService; -import java.util.function.BiFunction; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.InputSplit; @@ -39,8 +35,6 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.DataTableScan; import org.apache.iceberg.FileScanTask; -import org.apache.iceberg.MetadataColumns; -import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; import org.apache.iceberg.SerializableTable; @@ -49,19 +43,17 @@ import org.apache.iceberg.Table; import org.apache.iceberg.TableProperties; import org.apache.iceberg.TableScan; -import org.apache.iceberg.avro.Avro; import org.apache.iceberg.data.DeleteFilter; import org.apache.iceberg.data.GenericDeleteFilter; -import org.apache.iceberg.data.IdentityPartitionConverters; import org.apache.iceberg.data.InternalRecordWrapper; -import org.apache.iceberg.data.avro.PlannedDataReader; -import org.apache.iceberg.data.orc.GenericOrcReader; -import org.apache.iceberg.data.parquet.GenericParquetReaders; +import org.apache.iceberg.data.Record; import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.encryption.EncryptionManager; import org.apache.iceberg.expressions.Evaluator; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.expressions.Expressions; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.hadoop.HadoopConfigurable; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.CloseableIterator; @@ -70,13 +62,7 @@ import org.apache.iceberg.mapping.NameMappingParser; import org.apache.iceberg.mr.Catalogs; import org.apache.iceberg.mr.InputFormatConfig; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.types.Type; -import org.apache.iceberg.types.TypeUtil; -import org.apache.iceberg.util.PartitionUtil; import org.apache.iceberg.util.SerializationUtil; import org.apache.iceberg.util.ThreadPools; @@ -326,23 +312,27 @@ private CloseableIterable openTask(FileScanTask currentTask, Schema readSchem encryptionManager.decrypt( EncryptedFiles.encryptedInput(io.newInputFile(file.location()), file.keyMetadata())); - CloseableIterable iterable; - switch (file.format()) { - case AVRO: - iterable = newAvroIterable(inputFile, currentTask, readSchema); - break; - case ORC: - iterable = newOrcIterable(inputFile, currentTask, readSchema); - break; - case PARQUET: - iterable = newParquetIterable(inputFile, currentTask, readSchema); - break; - default: - throw new UnsupportedOperationException( - String.format("Cannot read %s file: %s", file.format().name(), file.location())); + ReadBuilder readBuilder = + FormatModelRegistry.readBuilder(file.format(), Record.class, inputFile); + + if (reuseContainers) { + readBuilder = readBuilder.reuseContainers(); } - return iterable; + if (nameMapping != null) { + readBuilder = readBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); + } + + return applyResidualFiltering( + (CloseableIterable) + readBuilder + .project(readSchema) + .split(currentTask.start(), currentTask.length()) + .caseSensitive(caseSensitive) + .filter(currentTask.residual()) + .build(), + currentTask.residual(), + readSchema); } 
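Note: the registry lookup above only resolves because a Record-based format model is registered elsewhere in this patch. As a rough, illustrative sketch of what that registration looks like for Parquet — the helper class name is hypothetical and Schema.class as the engine-schema type is an assumption — following the same register() pattern the Spark module uses later in this patch:

import org.apache.iceberg.Schema;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.data.parquet.GenericParquetReaders;
import org.apache.iceberg.data.parquet.GenericParquetWriter;
import org.apache.iceberg.formats.FormatModelRegistry;
import org.apache.iceberg.parquet.ParquetFormatModel;

// Hypothetical helper, not part of this patch; shown only to illustrate the registration side.
public class GenericParquetModelExample {
  public static void register() {
    FormatModelRegistry.register(
        new ParquetFormatModel<>(
            Record.class,
            Schema.class, // assumed engine-schema type; generic records carry no separate native schema
            GenericParquetReaders::buildReader, // (icebergSchema, fileSchema, idToConstant) -> row reader
            (icebergSchema, messageType, unused) ->
                GenericParquetWriter.create(icebergSchema, messageType)));
  }

  private GenericParquetModelExample() {}
}

With a model like this registered, FormatModelRegistry.readBuilder(file.format(), Record.class, inputFile) above returns a ReadBuilder already wired to the generic readers, which is what lets this change drop the per-format switch.
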
@SuppressWarnings("unchecked") @@ -369,86 +359,6 @@ private CloseableIterable applyResidualFiltering( } } - private CloseableIterable newAvroIterable( - InputFile inputFile, FileScanTask task, Schema readSchema) { - Avro.ReadBuilder avroReadBuilder = - Avro.read(inputFile).project(readSchema).split(task.start(), task.length()); - if (reuseContainers) { - avroReadBuilder.reuseContainers(); - } - if (nameMapping != null) { - avroReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - - avroReadBuilder.createResolvingReader( - schema -> - PlannedDataReader.create( - schema, constantsMap(task, IdentityPartitionConverters::convertConstant))); - return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema); - } - - private CloseableIterable newParquetIterable( - InputFile inputFile, FileScanTask task, Schema readSchema) { - Parquet.ReadBuilder parquetReadBuilder = - Parquet.read(inputFile) - .project(readSchema) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); - if (reuseContainers) { - parquetReadBuilder.reuseContainers(); - } - if (nameMapping != null) { - parquetReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - parquetReadBuilder.createReaderFunc( - fileSchema -> - GenericParquetReaders.buildReader( - readSchema, - fileSchema, - constantsMap(task, IdentityPartitionConverters::convertConstant))); - CloseableIterable parquetIterator = parquetReadBuilder.build(); - return applyResidualFiltering(parquetIterator, task.residual(), readSchema); - } - - private CloseableIterable newOrcIterable( - InputFile inputFile, FileScanTask task, Schema readSchema) { - Map idToConstant = - constantsMap(task, IdentityPartitionConverters::convertConstant); - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - // ORC does not support reuse containers yet - ORC.ReadBuilder orcReadBuilder = - ORC.read(inputFile) - .project(readSchemaWithoutConstantAndMetadataFields) - .filter(task.residual()) - .caseSensitive(caseSensitive) - .split(task.start(), task.length()); - orcReadBuilder.createReaderFunc( - fileSchema -> GenericOrcReader.buildReader(readSchema, fileSchema, idToConstant)); - - if (nameMapping != null) { - orcReadBuilder.withNameMapping(NameMappingParser.fromJson(nameMapping)); - } - CloseableIterable orcIterator = orcReadBuilder.build(); - return applyResidualFiltering(orcIterator, task.residual(), readSchema); - } - - private Map constantsMap( - FileScanTask task, BiFunction converter) { - PartitionSpec spec = task.spec(); - Set idColumns = spec.identitySourceIds(); - Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns); - boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty(); - if (projectsIdentityPartitionColumns) { - return PartitionUtil.constantsMap(task, converter); - } else { - return Collections.emptyMap(); - } - } - private static Schema readSchema( Configuration conf, Schema tableSchema, boolean caseSensitive) { Schema readSchema = InputFormatConfig.readSchema(conf); diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index 451c670fcd54..0d149b5c4349 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -45,6 +45,7 @@ import java.util.Locale; import java.util.Map; import java.util.Objects; +import 
java.util.Set; import java.util.function.BiFunction; import java.util.function.Function; import java.util.stream.Collectors; @@ -52,6 +53,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; @@ -79,7 +81,10 @@ import org.apache.iceberg.io.OutputFile; import org.apache.iceberg.mapping.NameMapping; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.iceberg.relocated.com.google.common.collect.Sets; +import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.ArrayUtil; import org.apache.iceberg.util.PropertyUtil; import org.apache.orc.CompressionKind; @@ -180,14 +185,13 @@ public WriteBuilder metricsConfig(MetricsConfig newMetricsConfig) { } // supposed to always be a private method used strictly by data and delete write builders - private WriteBuilder createContextFunc( - Function, Context> newCreateContextFunc) { + WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; return this; } public FileAppender build() { - Preconditions.checkNotNull(schema, "Schema is required"); + // Preconditions.checkNotNull(schema, "Schema is required"); for (Map.Entry entry : config.entrySet()) { this.conf.set(entry.getKey(), entry.getValue()); @@ -219,7 +223,7 @@ public FileAppender build() { metricsConfig); } - private static class Context { + static class Context { private final long stripeSize; private final long blockSize; private final int vectorizedRowBatchSize; @@ -699,6 +703,7 @@ public static class ReadBuilder { private Function> readerFunc; private Function> batchedReaderFunc; private int recordsPerBatch = VectorizedRowBatch.DEFAULT_SIZE; + private Set constantFieldIds = ImmutableSet.of(); private ReadBuilder(InputFile file) { Preconditions.checkNotNull(file, "Input file cannot be null"); @@ -775,12 +780,20 @@ public ReadBuilder withNameMapping(NameMapping newNameMapping) { return this; } + ReadBuilder constantValues(Set newConstantFieldIds) { + this.constantFieldIds = newConstantFieldIds; + return this; + } + public CloseableIterable build() { Preconditions.checkNotNull(schema, "Schema is required"); return new OrcIterable<>( file, conf, - schema, + // This is a behavioral change. Previously there were an error if metadata columns were + // present in the schema, now they are removed and the correct reader is created + TypeUtil.selectNot( + schema, Sets.union(constantFieldIds, MetadataColumns.metadataFieldIds())), nameMapping, start, length, diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java new file mode 100644 index 000000000000..b39d9b8790fb --- /dev/null +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -0,0 +1,317 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.orc; + +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.function.Function; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.orc.GenericOrcWriter; +import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.ReadBuilder; +import org.apache.iceberg.formats.WriteBuilder; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.orc.TypeDescription; + +public class ORCFormatModel implements FormatModel { + private final Class type; + private final Class schemaType; + private final ReaderFunction readerFunction; + private final BatchReaderFunction batchReaderFunction; + private final WriterFunction writerFunction; + + private ORCFormatModel( + Class type, + Class schemaType, + ReaderFunction readerFunction, + BatchReaderFunction batchReaderFunction, + WriterFunction writerFunction) { + this.type = type; + this.schemaType = schemaType; + this.readerFunction = readerFunction; + this.batchReaderFunction = batchReaderFunction; + this.writerFunction = writerFunction; + } + + public ORCFormatModel( + Class type, + Class schemaType, + ReaderFunction readerFunction, + WriterFunction writerFunction) { + this(type, schemaType, readerFunction, null, writerFunction); + } + + public ORCFormatModel( + Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { + this(type, schemaType, null, batchReaderFunction, null); + } + + public ORCFormatModel(Class type) { + this(type, null, null, null, null); + } + + @Override + public FileFormat format() { + return FileFormat.ORC; + } + + @Override + public Class type() { + return type; + } + + @Override + public Class schemaType() { + return schemaType; + } + + @Override + public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + return new WriteBuilderWrapper<>(outputFile, writerFunction); + } + + @Override + public ReadBuilder readBuilder(InputFile inputFile) { + return new ReadBuilderWrapper<>(inputFile, readerFunction, batchReaderFunction); + } + + @FunctionalInterface + public interface ReaderFunction { + OrcRowReader read( + Schema schema, TypeDescription messageType, Map constantValues); + } + + @FunctionalInterface + public interface BatchReaderFunction { + OrcBatchReader read( + Schema schema, TypeDescription messageType, Map constantValues); + } + + @FunctionalInterface + public interface WriterFunction { + OrcRowWriter write(Schema schema, TypeDescription messageType, E nativeSchema); + } + + private static class ReadBuilderWrapper implements ReadBuilder { + private final ORC.ReadBuilder internal; 
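+    // Exactly one of these reader factories is expected to be non-null; build() uses the row
+    // reader when readerFunction is set, falls back to the batch reader otherwise, and fails
+    // if neither is provided.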
+ private final ReaderFunction readerFunction; + private final BatchReaderFunction batchReaderFunction; + private boolean reuseContainers = false; + private Schema icebergSchema; + private Map idToConstant = ImmutableMap.of(); + + private ReadBuilderWrapper( + InputFile inputFile, + ReaderFunction readerFunction, + BatchReaderFunction batchReaderFunction) { + this.internal = ORC.read(inputFile); + this.readerFunction = readerFunction; + this.batchReaderFunction = batchReaderFunction; + } + + @Override + public ReadBuilder split(long newStart, long newLength) { + internal.split(newStart, newLength); + return this; + } + + @Override + public ReadBuilder project(Schema schema) { + this.icebergSchema = schema; + internal.project(schema); + return this; + } + + @Override + public ReadBuilder caseSensitive(boolean caseSensitive) { + internal.caseSensitive(caseSensitive); + return this; + } + + @Override + public ReadBuilder filter(Expression filter) { + internal.filter(filter); + return this; + } + + @Override + public ReadBuilder set(String key, String value) { + internal.config(key, value); + return this; + } + + @Override + public ReadBuilder reuseContainers() { + this.reuseContainers = true; + return this; + } + + @Override + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + internal.recordsPerBatch(numRowsPerBatch); + return this; + } + + @Override + public ReadBuilder idToConstant(Map newIdToConstant) { + internal.constantValues(newIdToConstant.keySet()); + this.idToConstant = newIdToConstant; + return this; + } + + @Override + public ReadBuilder withNameMapping(NameMapping nameMapping) { + internal.withNameMapping(nameMapping); + return this; + } + + @Override + public org.apache.iceberg.io.CloseableIterable build() { + Preconditions.checkNotNull(reuseContainers, "Reuse containers is required for ORC read"); + if (readerFunction != null) { + return internal + .createReaderFunc( + typeDescription -> + readerFunction.read(icebergSchema, typeDescription, idToConstant)) + .build(); + } else if (batchReaderFunction != null) { + return internal + .createBatchedReaderFunc( + typeDescription -> + batchReaderFunction.read(icebergSchema, typeDescription, idToConstant)) + .build(); + } else { + throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set"); + } + } + } + + private static class WriteBuilderWrapper implements WriteBuilder { + private final ORC.WriteBuilder internal; + private final WriterFunction writerFunction; + private S inputSchema; + private FileContent content; + + private WriteBuilderWrapper(EncryptedOutputFile outputFile, WriterFunction writerFunction) { + this.internal = ORC.write(outputFile); + this.writerFunction = writerFunction; + } + + @Override + public WriteBuilder schema(Schema schema) { + internal.schema(schema); + return this; + } + + @Override + public WriteBuilder inputSchema(S schema) { + this.inputSchema = schema; + return this; + } + + @Override + public WriteBuilder set(String property, String value) { + internal.set(property, value); + return this; + } + + @Override + public WriteBuilder setAll(Map properties) { + internal.setAll(properties); + return this; + } + + @Override + public WriteBuilder meta(String property, String value) { + internal.metadata(property, value); + return this; + } + + @Override + public WriteBuilder content(FileContent newContent) { + this.content = newContent; + return this; + } + + @Override + public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + 
internal.metricsConfig(metricsConfig); + return this; + } + + @Override + public WriteBuilder overwrite() { + internal.overwrite(); + return this; + } + + @Override + public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + // ORC doesn't support file encryption + throw new UnsupportedOperationException("ORC does not support file encryption keys"); + } + + @Override + public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + // ORC doesn't support file encryption + throw new UnsupportedOperationException("ORC does not support AAD prefix"); + } + + @Override + public org.apache.iceberg.io.FileAppender build() { + switch (content) { + case DATA: + internal.createContextFunc(ORC.WriteBuilder.Context::dataContext); + internal.createWriterFunc( + (icebergSchema, typeDescription) -> + writerFunction.write(icebergSchema, typeDescription, inputSchema)); + break; + case EQUALITY_DELETES: + internal.createContextFunc(ORC.WriteBuilder.Context::deleteContext); + internal.createWriterFunc( + (icebergSchema, typeDescription) -> + writerFunction.write(icebergSchema, typeDescription, inputSchema)); + break; + case POSITION_DELETES: + internal.createContextFunc(ORC.WriteBuilder.Context::deleteContext); + internal.createWriterFunc( + (icebergSchema, typeDescription) -> + GenericOrcWriters.positionDelete( + GenericOrcWriter.buildWriter(icebergSchema, typeDescription), + Function.identity())); + internal.schema(DeleteSchemaUtil.pathPosSchema()); + break; + default: + throw new IllegalArgumentException("Unknown file content: " + content); + } + + return internal.build(); + } + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 2b2e460ee994..1957ad209f6a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -302,8 +302,7 @@ WriteBuilder withWriterVersion(WriterVersion version) { } // supposed to always be a private method used strictly by data and delete write builders - private WriteBuilder createContextFunc( - Function, Context> newCreateContextFunc) { + WriteBuilder createContextFunc(Function, Context> newCreateContextFunc) { this.createContextFunc = newCreateContextFunc; return this; } @@ -498,7 +497,7 @@ public FileAppender build() throws IOException { } } - private static class Context { + static class Context { private final int rowGroupSize; private final int pageSize; private final int pageRowLimit; diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java new file mode 100644 index 000000000000..aa5afecf89cd --- /dev/null +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -0,0 +1,334 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.Map; +import java.util.function.Function; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.Schema; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.ReadBuilder; +import org.apache.iceberg.formats.WriteBuilder; +import org.apache.iceberg.io.CloseableIterable; +import org.apache.iceberg.io.DeleteSchemaUtil; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.mapping.NameMapping; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.parquet.column.ParquetProperties; +import org.apache.parquet.schema.MessageType; + +public class ParquetFormatModel implements FormatModel { + public static final String WRITER_VERSION_KEY = "parquet.writer.version"; + + private final Class type; + private final Class schemaType; + private final ReaderFunction readerFunction; + private final BatchReaderFunction batchReaderFunction; + private final WriterFunction writerFunction; + + private ParquetFormatModel( + Class type, + Class schemaType, + ReaderFunction readerFunction, + BatchReaderFunction batchReaderFunction, + WriterFunction writerFunction) { + this.type = type; + this.schemaType = schemaType; + this.readerFunction = readerFunction; + this.batchReaderFunction = batchReaderFunction; + this.writerFunction = writerFunction; + } + + public ParquetFormatModel(Class type) { + this(type, null, null, null); + } + + public ParquetFormatModel( + Class type, + Class schemaType, + ReaderFunction readerFunction, + WriterFunction writerFunction) { + this(type, schemaType, readerFunction, null, writerFunction); + } + + public ParquetFormatModel( + Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { + this(type, schemaType, null, batchReaderFunction, null); + } + + @Override + public FileFormat format() { + return FileFormat.PARQUET; + } + + @Override + public Class type() { + return type; + } + + @Override + public Class schemaType() { + return schemaType; + } + + @Override + public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + return new WriteBuilderWrapper<>(outputFile, writerFunction); + } + + @Override + public ReadBuilder readBuilder(InputFile inputFile) { + return new ReadBuilderWrapper<>(inputFile, readerFunction, batchReaderFunction); + } + + @FunctionalInterface + public interface ReaderFunction { + ParquetValueReader read( + Schema schema, MessageType messageType, Map constantValues); + } + + @FunctionalInterface + public interface BatchReaderFunction { + VectorizedReader read( + Schema schema, + MessageType messageType, + Map constantValues, + Map config); + } + + 
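+  /**
+   * Creates a {@link ParquetValueWriter} for rows of the engine type, given the Iceberg write
+   * schema, the Parquet file schema, and the engine-specific input schema.
+   */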
@FunctionalInterface + public interface WriterFunction { + ParquetValueWriter write(Schema icebergSchema, MessageType messageType, S engineSchema); + } + + private static class WriteBuilderWrapper implements WriteBuilder { + private final Parquet.WriteBuilder internal; + private final WriterFunction writerFunction; + private S inputSchema; + private FileContent content; + + private WriteBuilderWrapper(EncryptedOutputFile outputFile, WriterFunction writerFunction) { + this.internal = Parquet.write(outputFile); + this.writerFunction = writerFunction; + } + + @Override + public WriteBuilder schema(Schema schema) { + internal.schema(schema); + return this; + } + + @Override + public WriteBuilder inputSchema(S schema) { + this.inputSchema = schema; + return this; + } + + @Override + public WriteBuilder set(String property, String value) { + if (WRITER_VERSION_KEY.equals(property)) { + internal.writerVersion(ParquetProperties.WriterVersion.valueOf(value)); + } + + internal.set(property, value); + return this; + } + + @Override + public WriteBuilder setAll(Map properties) { + internal.setAll(properties); + return this; + } + + @Override + public WriteBuilder meta(String property, String value) { + internal.meta(property, value); + return this; + } + + @Override + public WriteBuilder meta(Map properties) { + internal.meta(properties); + return this; + } + + @Override + public WriteBuilder content(FileContent newContent) { + this.content = newContent; + return this; + } + + @Override + public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + internal.metricsConfig(metricsConfig); + return this; + } + + @Override + public WriteBuilder overwrite() { + internal.overwrite(); + return this; + } + + @Override + public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + internal.withFileEncryptionKey(encryptionKey); + return this; + } + + @Override + public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + internal.withAADPrefix(aadPrefix); + return this; + } + + @Override + public FileAppender build() throws IOException { + switch (content) { + case DATA: + internal.createContextFunc(Parquet.WriteBuilder.Context::dataContext); + internal.createWriterFunc( + (icebergSchema, messageType) -> + writerFunction.write(icebergSchema, messageType, inputSchema)); + break; + case EQUALITY_DELETES: + internal.createContextFunc(Parquet.WriteBuilder.Context::deleteContext); + internal.createWriterFunc( + (icebergSchema, messageType) -> + writerFunction.write(icebergSchema, messageType, inputSchema)); + break; + case POSITION_DELETES: + internal.createContextFunc(Parquet.WriteBuilder.Context::deleteContext); + internal.createWriterFunc( + (icebergSchema, messageType) -> + new ParquetValueWriters.PositionDeleteStructWriter( + (ParquetValueWriters.StructWriter) + GenericParquetWriter.create(icebergSchema, messageType), + Function.identity())); + internal.schema(DeleteSchemaUtil.pathPosSchema()); + break; + default: + throw new IllegalArgumentException("Unknown file content: " + content); + } + + return internal.build(); + } + } + + private static class ReadBuilderWrapper implements ReadBuilder { + private final Parquet.ReadBuilder internal; + private final ReaderFunction readerFunction; + private final BatchReaderFunction batchReaderFunction; + private final Map config = Maps.newHashMap(); + private Schema icebergSchema; + private Map idToConstant = ImmutableMap.of(); + + private ReadBuilderWrapper( + InputFile inputFile, + ReaderFunction readerFunction, + BatchReaderFunction 
batchReaderFunction) { + this.internal = Parquet.read(inputFile); + this.readerFunction = readerFunction; + this.batchReaderFunction = batchReaderFunction; + } + + @Override + public ReadBuilder split(long newStart, long newLength) { + internal.split(newStart, newLength); + return this; + } + + @Override + public ReadBuilder project(Schema schema) { + this.icebergSchema = schema; + internal.project(schema); + return this; + } + + @Override + public ReadBuilder caseSensitive(boolean caseSensitive) { + internal.caseSensitive(caseSensitive); + return this; + } + + @Override + public ReadBuilder filter(Expression filter) { + internal.filter(filter); + return this; + } + + @Override + public ReadBuilder set(String key, String value) { + this.config.put(key, value); + internal.set(key, value); + return this; + } + + @Override + public ReadBuilder reuseContainers() { + internal.reuseContainers(); + return this; + } + + @Override + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + internal.recordsPerBatch(numRowsPerBatch); + return this; + } + + @Override + public ReadBuilder idToConstant(Map newIdToConstant) { + this.idToConstant = newIdToConstant; + return this; + } + + @Override + public ReadBuilder withNameMapping(NameMapping nameMapping) { + internal.withNameMapping(nameMapping); + return this; + } + + @Override + public CloseableIterable build() { + if (readerFunction != null) { + return internal + .createReaderFunc( + messageType -> readerFunction.read(icebergSchema, messageType, idToConstant)) + .build(); + } else if (batchReaderFunction != null) { + return internal + .createBatchedReaderFunc( + messageType -> + batchReaderFunction.read(icebergSchema, messageType, idToConstant, config)) + .build(); + } else { + throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set"); + } + } + } +} diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java index fc10a57ec0e0..6d4e64603771 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java @@ -46,7 +46,7 @@ public class VectorizedParquetReader extends CloseableGroup implements Closea private final Function> batchReaderFunc; private final Expression filter; private final boolean reuseContainers; - private final boolean caseSensitive; + private final boolean filterCaseSensitive; private final int batchSize; private final NameMapping nameMapping; @@ -58,7 +58,7 @@ public VectorizedParquetReader( NameMapping nameMapping, Expression filter, boolean reuseContainers, - boolean caseSensitive, + boolean filterCaseSensitive, int maxRecordsPerBatch) { this.input = input; this.expectedSchema = expectedSchema; @@ -67,7 +67,7 @@ public VectorizedParquetReader( // replace alwaysTrue with null to avoid extra work evaluating a trivial filter this.filter = filter == Expressions.alwaysTrue() ? 
null : filter; this.reuseContainers = reuseContainers; - this.caseSensitive = caseSensitive; + this.filterCaseSensitive = filterCaseSensitive; this.batchSize = maxRecordsPerBatch; this.nameMapping = nameMapping; } @@ -86,7 +86,7 @@ private ReadConf init() { batchReaderFunc, nameMapping, reuseContainers, - caseSensitive, + filterCaseSensitive, batchSize); this.conf = readConf.copy(); return readConf; diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 58850ec7c9f4..3132929fb330 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -218,7 +218,7 @@ public void testTwoLevelList() throws IOException { writer.close(); GenericData.Record recordRead = - Iterables.getOnlyElement( + Iterables.getOnlyElement( Parquet.read(Files.localInput(testFile)).project(schema).callInit().build()); assertThat(recordRead.get("arraybytes")).isEqualTo(expectedByteList); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java index d6a13bcd515d..220244f9197f 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java @@ -52,13 +52,14 @@ import org.apache.iceberg.avro.Avro; import org.apache.iceberg.data.Record; import org.apache.iceberg.data.avro.DataWriter; -import org.apache.iceberg.data.avro.PlannedDataReader; -import org.apache.iceberg.data.orc.GenericOrcReader; import org.apache.iceberg.data.orc.GenericOrcWriter; -import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.PositionDeleteWriteBuilder; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.FileIO; @@ -681,6 +682,13 @@ public CloseableIterable reader( return positionDeletesReader(inputFile, format, spec); } + @Override + public PositionDeleteWriter writer( + OutputFile outputFile, FileFormat format, PartitionSpec spec, StructLike partition) + throws IOException { + return positionDeletesWriter(outputFile, format, spec, partition, null); + } + @Override public PositionDeleteWriter writer( OutputFile outputFile, @@ -720,31 +728,8 @@ private ForeachFunction rewritePositionDelete( private static CloseableIterable positionDeletesReader( InputFile inputFile, FileFormat format, PartitionSpec spec) { Schema deleteSchema = DeleteSchemaUtil.posDeleteReadSchema(spec.schema()); - switch (format) { - case AVRO: - return Avro.read(inputFile) - .project(deleteSchema) - .reuseContainers() - .createReaderFunc(fileSchema -> PlannedDataReader.create(deleteSchema)) - .build(); - - case PARQUET: - return Parquet.read(inputFile) - .project(deleteSchema) - .reuseContainers() - .createReaderFunc( - fileSchema -> GenericParquetReaders.buildReader(deleteSchema, fileSchema)) - .build(); - - case ORC: - return ORC.read(inputFile) - 
.project(deleteSchema) - .createReaderFunc(fileSchema -> GenericOrcReader.buildReader(deleteSchema, fileSchema)) - .build(); - - default: - throw new UnsupportedOperationException("Unsupported file format: " + format); - } + ReadBuilder builder = FormatModelRegistry.readBuilder(format, Record.class, inputFile); + return builder.project(deleteSchema).reuseContainers().build(); } private static PositionDeleteWriter positionDeletesWriter( @@ -754,30 +739,37 @@ private static PositionDeleteWriter positionDeletesWriter( StructLike partition, Schema rowSchema) throws IOException { - switch (format) { - case AVRO: - return Avro.writeDeletes(outputFile) - .createWriterFunc(DataWriter::create) - .withPartition(partition) - .rowSchema(rowSchema) - .withSpec(spec) - .buildPositionWriter(); - case PARQUET: - return Parquet.writeDeletes(outputFile) - .createWriterFunc(GenericParquetWriter::create) - .withPartition(partition) - .rowSchema(rowSchema) - .withSpec(spec) - .buildPositionWriter(); - case ORC: - return ORC.writeDeletes(outputFile) - .createWriterFunc(GenericOrcWriter::buildWriter) - .withPartition(partition) - .rowSchema(rowSchema) - .withSpec(spec) - .buildPositionWriter(); - default: - throw new UnsupportedOperationException("Unsupported file format: " + format); + if (rowSchema == null) { + PositionDeleteWriteBuilder builder = + FormatModelRegistry.positionDeleteWriteBuilder( + format, EncryptedFiles.plainAsEncryptedOutput(outputFile)); + return builder.partition(partition).spec(spec).build(); + } else { + switch (format) { + case AVRO: + return Avro.writeDeletes(outputFile) + .createWriterFunc(DataWriter::create) + .withPartition(partition) + .rowSchema(rowSchema) + .withSpec(spec) + .buildPositionWriter(); + case PARQUET: + return Parquet.writeDeletes(outputFile) + .createWriterFunc(GenericParquetWriter::create) + .withPartition(partition) + .rowSchema(rowSchema) + .withSpec(spec) + .buildPositionWriter(); + case ORC: + return ORC.writeDeletes(outputFile) + .createWriterFunc(GenericOrcWriter::buildWriter) + .withPartition(partition) + .rowSchema(rowSchema) + .withSpec(spec) + .buildPositionWriter(); + default: + throw new UnsupportedOperationException("Unsupported file format: " + format); + } } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index 8e25e81a05b2..21764cddb7cc 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -28,8 +28,10 @@ import org.apache.iceberg.arrow.vectorized.VectorizedReaderBuilder; import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.parquet.VectorizedReader; +import org.apache.iceberg.spark.ParquetReaderType; import org.apache.iceberg.spark.SparkUtil; import org.apache.parquet.schema.MessageType; +import org.apache.spark.sql.vectorized.ColumnarBatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,6 +43,8 @@ public class VectorizedSparkParquetReaders { private static final String ENABLE_NULL_CHECK_FOR_GET = "arrow.enable_null_check_for_get"; private static final String ENABLE_NULL_CHECK_FOR_GET_ENV = "ARROW_ENABLE_NULL_CHECK_FOR_GET"; + public static final String PARQUET_READER_TYPE = "parquet.reader.type"; + static { try { enableUnsafeMemoryAccess(); @@ 
-52,6 +56,18 @@ public class VectorizedSparkParquetReaders { private VectorizedSparkParquetReaders() {} + public static VectorizedReader buildReader( + Schema expectedSchema, + MessageType fileSchema, + Map idToConstant, + Map config) { + if (ParquetReaderType.COMET.name().equals(config.get(PARQUET_READER_TYPE))) { + return buildCometReader(expectedSchema, fileSchema, idToConstant); + } else { + return buildReader(expectedSchema, fileSchema, idToConstant); + } + } + public static ColumnarBatchReader buildReader( Schema expectedSchema, MessageType fileSchema, diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index ff30f29aeae6..1147977825e6 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -19,7 +19,6 @@ package org.apache.iceberg.spark.source; import java.util.Map; -import java.util.Set; import javax.annotation.Nonnull; import org.apache.iceberg.FileFormat; import org.apache.iceberg.MetadataColumns; @@ -29,21 +28,17 @@ import org.apache.iceberg.Table; import org.apache.iceberg.data.DeleteFilter; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.InputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; -import org.apache.iceberg.spark.ParquetReaderType; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; -import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; -import org.apache.iceberg.types.TypeUtil; import org.apache.iceberg.util.Pair; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.vectorized.ColumnVector; @@ -68,6 +63,7 @@ abstract class BaseBatchReader extends BaseReader newBatchIterable( InputFile inputFile, FileFormat format, @@ -76,79 +72,35 @@ protected CloseableIterable newBatchIterable( Expression residual, Map idToConstant, @Nonnull SparkDeleteFilter deleteFilter) { - CloseableIterable iterable; - switch (format) { - case PARQUET: - iterable = - newParquetIterable( - inputFile, start, length, residual, idToConstant, deleteFilter.requiredSchema()); - break; - case ORC: - iterable = newOrcIterable(inputFile, start, length, residual, idToConstant); - break; - default: - throw new UnsupportedOperationException( - "Format: " + format + " not supported for batched reads"); + ReadBuilder readBuilder = + FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile); + if (parquetConf != null) { + readBuilder = + readBuilder + .recordsPerBatch(parquetConf.batchSize()) + .set( + VectorizedSparkParquetReaders.PARQUET_READER_TYPE, + parquetConf.readerType().name()); + } else if (orcConf != null) { + readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize()); 
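+        // ORC vectorized reads only need the batch size; the reader-type property above is
+        // Parquet-specific.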
} - return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); - } - - private CloseableIterable newParquetIterable( - InputFile inputFile, - long start, - long length, - Expression residual, - Map idToConstant, - Schema requiredSchema) { - return Parquet.read(inputFile) - .project(requiredSchema) - .split(start, length) - .createBatchedReaderFunc( - fileSchema -> { - if (parquetConf.readerType() == ParquetReaderType.COMET) { - return VectorizedSparkParquetReaders.buildCometReader( - requiredSchema, fileSchema, idToConstant); - } else { - return VectorizedSparkParquetReaders.buildReader( - requiredSchema, fileSchema, idToConstant); - } - }) - .recordsPerBatch(parquetConf.batchSize()) - .filter(residual) - .caseSensitive(caseSensitive()) - // Spark eagerly consumes the batches. So the underlying memory allocated could be reused - // without worrying about subsequent reads clobbering over each other. This improves - // read performance as every batch read doesn't have to pay the cost of allocating memory. - .reuseContainers() - .withNameMapping(nameMapping()) - .build(); - } + CloseableIterable iterable = + readBuilder + .project(deleteFilter.requiredSchema()) + .idToConstant(idToConstant) + .split(start, length) + .caseSensitive(caseSensitive()) + .filter(residual) + // Spark eagerly consumes the batches. So the underlying memory allocated could be + // reused without worrying about subsequent reads clobbering over each other. This + // improves read performance as every batch read doesn't have to pay the cost of + // allocating memory. + .reuseContainers() + .withNameMapping(nameMapping()) + .build(); - private CloseableIterable newOrcIterable( - InputFile inputFile, - long start, - long length, - Expression residual, - Map idToConstant) { - Set constantFieldIds = idToConstant.keySet(); - Set metadataFieldIds = MetadataColumns.metadataFieldIds(); - Sets.SetView constantAndMetadataFieldIds = - Sets.union(constantFieldIds, metadataFieldIds); - Schema schemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot(expectedSchema(), constantAndMetadataFieldIds); - - return ORC.read(inputFile) - .project(schemaWithoutConstantAndMetadataFields) - .split(start, length) - .createBatchedReaderFunc( - fileSchema -> - VectorizedSparkOrcReaders.buildReader(expectedSchema(), fileSchema, idToConstant)) - .recordsPerBatch(orcConf.batchSize()) - .filter(residual) - .caseSensitive(caseSensitive()) - .withNameMapping(nameMapping()) - .build(); + return CloseableIterable.transform(iterable, new BatchDeleteFilter(deleteFilter)::filterBatch); } @VisibleForTesting diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java index c12931e786b1..53d44e760afe 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseRowReader.java @@ -20,22 +20,15 @@ import java.util.Map; import org.apache.iceberg.FileFormat; -import org.apache.iceberg.MetadataColumns; import org.apache.iceberg.ScanTask; import org.apache.iceberg.ScanTaskGroup; import org.apache.iceberg.Schema; import org.apache.iceberg.Table; -import org.apache.iceberg.avro.Avro; import org.apache.iceberg.expressions.Expression; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import 
org.apache.iceberg.io.InputFile; -import org.apache.iceberg.orc.ORC; -import org.apache.iceberg.parquet.Parquet; -import org.apache.iceberg.relocated.com.google.common.collect.Sets; -import org.apache.iceberg.spark.data.SparkOrcReader; -import org.apache.iceberg.spark.data.SparkParquetReaders; -import org.apache.iceberg.spark.data.SparkPlannedAvroReader; -import org.apache.iceberg.types.TypeUtil; import org.apache.spark.sql.catalyst.InternalRow; abstract class BaseRowReader extends BaseReader { @@ -58,69 +51,15 @@ protected CloseableIterable newIterable( Expression residual, Schema projection, Map idToConstant) { - switch (format) { - case PARQUET: - return newParquetIterable(file, start, length, residual, projection, idToConstant); - - case AVRO: - return newAvroIterable(file, start, length, projection, idToConstant); - - case ORC: - return newOrcIterable(file, start, length, residual, projection, idToConstant); - - default: - throw new UnsupportedOperationException("Cannot read unknown format: " + format); - } - } - - private CloseableIterable newAvroIterable( - InputFile file, long start, long length, Schema projection, Map idToConstant) { - return Avro.read(file) - .reuseContainers() + ReadBuilder reader = + FormatModelRegistry.readBuilder(format, InternalRow.class, file); + return reader .project(projection) - .split(start, length) - .createResolvingReader(schema -> SparkPlannedAvroReader.create(schema, idToConstant)) - .withNameMapping(nameMapping()) - .build(); - } - - private CloseableIterable newParquetIterable( - InputFile file, - long start, - long length, - Expression residual, - Schema readSchema, - Map idToConstant) { - return Parquet.read(file) + .idToConstant(idToConstant) .reuseContainers() .split(start, length) - .project(readSchema) - .createReaderFunc( - fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant)) - .filter(residual) .caseSensitive(caseSensitive()) - .withNameMapping(nameMapping()) - .build(); - } - - private CloseableIterable newOrcIterable( - InputFile file, - long start, - long length, - Expression residual, - Schema readSchema, - Map idToConstant) { - Schema readSchemaWithoutConstantAndMetadataFields = - TypeUtil.selectNot( - readSchema, Sets.union(idToConstant.keySet(), MetadataColumns.metadataFieldIds())); - - return ORC.read(file) - .project(readSchemaWithoutConstantAndMetadataFields) - .split(start, length) - .createReaderFunc( - readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant)) .filter(residual) - .caseSensitive(caseSensitive()) .withNameMapping(nameMapping()) .build(); } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java index a93db17e4a0f..1f8cee27a818 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFileWriterFactory.java @@ -23,13 +23,20 @@ import static org.apache.iceberg.TableProperties.DEFAULT_FILE_FORMAT_DEFAULT; import static org.apache.iceberg.TableProperties.DELETE_DEFAULT_FILE_FORMAT; +import java.io.IOException; +import java.io.UncheckedIOException; import java.util.Map; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; import 
org.apache.iceberg.Table; import org.apache.iceberg.avro.Avro; -import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.data.RegistryBasedFileWriterFactory; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.orc.ORC; import org.apache.iceberg.parquet.Parquet; @@ -40,14 +47,18 @@ import org.apache.iceberg.spark.data.SparkOrcWriter; import org.apache.iceberg.spark.data.SparkParquetWriters; import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; -class SparkFileWriterFactory extends BaseFileWriterFactory { - private StructType dataSparkType; - private StructType equalityDeleteSparkType; +class SparkFileWriterFactory extends RegistryBasedFileWriterFactory { + private static final Logger LOG = LoggerFactory.getLogger(SparkFileWriterFactory.class); private StructType positionDeleteSparkType; + private boolean useDeprecatedPositionDeleteWriter = false; + private final Schema positionDeleteRowSchema; + private final Table table; + private final FileFormat format; private final Map writeProperties; /** @@ -75,18 +86,26 @@ class SparkFileWriterFactory extends BaseFileWriterFactory { super( table, dataFileFormat, + InternalRow.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - positionDeleteRowSchema); + writeProperties, + calculateSparkType(dataSparkType, dataSchema), + calculateSparkType(equalityDeleteSparkType, equalityDeleteRowSchema)); - this.dataSparkType = dataSparkType; - this.equalityDeleteSparkType = equalityDeleteSparkType; - this.positionDeleteSparkType = positionDeleteSparkType; + this.table = table; + this.format = dataFileFormat; this.writeProperties = writeProperties != null ? writeProperties : ImmutableMap.of(); + this.positionDeleteRowSchema = positionDeleteRowSchema; + this.positionDeleteSparkType = positionDeleteSparkType; + this.useDeprecatedPositionDeleteWriter = + positionDeleteRowSchema != null + || (positionDeleteSparkType != null + && positionDeleteSparkType.getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined()); } SparkFileWriterFactory( @@ -105,119 +124,106 @@ class SparkFileWriterFactory extends BaseFileWriterFactory { super( table, dataFileFormat, + InternalRow.class, dataSchema, dataSortOrder, deleteFileFormat, equalityFieldIds, equalityDeleteRowSchema, equalityDeleteSortOrder, - ImmutableMap.of()); + writeProperties, + calculateSparkType(dataSparkType, dataSchema), + calculateSparkType(equalityDeleteSparkType, equalityDeleteRowSchema)); - this.dataSparkType = dataSparkType; - this.equalityDeleteSparkType = equalityDeleteSparkType; - this.positionDeleteSparkType = null; + this.table = table; + this.format = dataFileFormat; this.writeProperties = writeProperties != null ? 
writeProperties : ImmutableMap.of(); + this.positionDeleteRowSchema = null; + this.useDeprecatedPositionDeleteWriter = false; } static Builder builderFor(Table table) { return new Builder(table); } - @Override - protected void configureDataWrite(Avro.DataWriteBuilder builder) { - builder.createWriterFunc(ignored -> new SparkAvroWriter(dataSparkType())); - builder.setAll(writeProperties); - } - - @Override - protected void configureEqualityDelete(Avro.DeleteWriteBuilder builder) { - builder.createWriterFunc(ignored -> new SparkAvroWriter(equalityDeleteSparkType())); - builder.setAll(writeProperties); - } - - @Override - protected void configurePositionDelete(Avro.DeleteWriteBuilder builder) { - boolean withRow = - positionDeleteSparkType().getFieldIndex(DELETE_FILE_ROW_FIELD_NAME).isDefined(); - if (withRow) { - // SparkAvroWriter accepts just the Spark type of the row ignoring the path and pos - StructField rowField = positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME); - StructType positionDeleteRowSparkType = (StructType) rowField.dataType(); - builder.createWriterFunc(ignored -> new SparkAvroWriter(positionDeleteRowSparkType)); - } - - builder.setAll(writeProperties); - } - - @Override - protected void configureDataWrite(Parquet.DataWriteBuilder builder) { - builder.createWriterFunc(msgType -> SparkParquetWriters.buildWriter(dataSparkType(), msgType)); - builder.setAll(writeProperties); - } - - @Override - protected void configureEqualityDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(equalityDeleteSparkType(), msgType)); - builder.setAll(writeProperties); - } - - @Override - protected void configurePositionDelete(Parquet.DeleteWriteBuilder builder) { - builder.createWriterFunc( - msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)); - builder.transformPaths(path -> UTF8String.fromString(path.toString())); - builder.setAll(writeProperties); - } - - @Override - protected void configureDataWrite(ORC.DataWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - builder.setAll(writeProperties); - } - - @Override - protected void configureEqualityDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - builder.setAll(writeProperties); - } - - @Override - protected void configurePositionDelete(ORC.DeleteWriteBuilder builder) { - builder.createWriterFunc(SparkOrcWriter::new); - builder.transformPaths(path -> UTF8String.fromString(path.toString())); - builder.setAll(writeProperties); - } - - private StructType dataSparkType() { - if (dataSparkType == null) { - Preconditions.checkNotNull(dataSchema(), "Data schema must not be null"); - this.dataSparkType = SparkSchemaUtil.convert(dataSchema()); - } - - return dataSparkType; - } - - private StructType equalityDeleteSparkType() { - if (equalityDeleteSparkType == null) { - Preconditions.checkNotNull( - equalityDeleteRowSchema(), "Equality delete schema must not be null"); - this.equalityDeleteSparkType = SparkSchemaUtil.convert(equalityDeleteRowSchema()); - } - - return equalityDeleteSparkType; - } - private StructType positionDeleteSparkType() { if (positionDeleteSparkType == null) { // wrap the optional row schema into the position delete schema containing path and position - Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema()); + Schema positionDeleteSchema = DeleteSchemaUtil.posDeleteSchema(positionDeleteRowSchema); this.positionDeleteSparkType = 
SparkSchemaUtil.convert(positionDeleteSchema); } return positionDeleteSparkType; } + @Override + public PositionDeleteWriter newPositionDeleteWriter( + EncryptedOutputFile file, PartitionSpec spec, StructLike partition) { + if (!useDeprecatedPositionDeleteWriter) { + return super.newPositionDeleteWriter(file, spec, partition); + } else { + LOG.info( + "Deprecated feature used. Position delete row schema is used to create the position delete writer."); + MetricsConfig metricsConfig = + table != null + ? MetricsConfig.forPositionDelete(table) + : MetricsConfig.fromProperties(ImmutableMap.of()); + + try { + switch (format) { + case AVRO: + StructType positionDeleteRowSparkType = + (StructType) positionDeleteSparkType().apply(DELETE_FILE_ROW_FIELD_NAME).dataType(); + + return Avro.writeDeletes(file) + .createWriterFunc(ignored -> new SparkAvroWriter(positionDeleteRowSparkType)) + .withPartition(partition) + .overwrite() + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .setAll(writeProperties) + .metricsConfig(metricsConfig) + .buildPositionWriter(); + + case ORC: + return ORC.writeDeletes(file) + .createWriterFunc(SparkOrcWriter::new) + .transformPaths(path -> UTF8String.fromString(path.toString())) + .withPartition(partition) + .overwrite() + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .setAll(writeProperties) + .metricsConfig(metricsConfig) + .buildPositionWriter(); + + case PARQUET: + return Parquet.writeDeletes(file) + .createWriterFunc( + msgType -> SparkParquetWriters.buildWriter(positionDeleteSparkType(), msgType)) + .transformPaths(path -> UTF8String.fromString(path.toString())) + .withPartition(partition) + .overwrite() + .metricsConfig(metricsConfig) + .rowSchema(positionDeleteRowSchema) + .withSpec(spec) + .withKeyMetadata(file.keyMetadata()) + .setAll(writeProperties) + .metricsConfig(metricsConfig) + .buildPositionWriter(); + + default: + throw new UnsupportedOperationException( + "Cannot write pos-deletes for unsupported file format: " + format); + } + } catch (IOException e) { + throw new UncheckedIOException("Failed to create new position delete writer", e); + } + } + } + static class Builder { private final Table table; private FileFormat dataFileFormat; @@ -340,4 +346,14 @@ SparkFileWriterFactory build() { writeProperties); } } + + private static StructType calculateSparkType(StructType sparkType, Schema schema) { + if (sparkType != null) { + return sparkType; + } else if (schema != null) { + return SparkSchemaUtil.convert(schema); + } else { + return null; + } + } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java new file mode 100644 index 000000000000..fb1b37b08cd2 --- /dev/null +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.spark.source; + +import org.apache.iceberg.avro.AvroFormatModel; +import org.apache.iceberg.formats.FormatModelRegistry; +import org.apache.iceberg.orc.ORCFormatModel; +import org.apache.iceberg.parquet.ParquetFormatModel; +import org.apache.iceberg.spark.data.SparkAvroWriter; +import org.apache.iceberg.spark.data.SparkOrcReader; +import org.apache.iceberg.spark.data.SparkOrcWriter; +import org.apache.iceberg.spark.data.SparkParquetReaders; +import org.apache.iceberg.spark.data.SparkParquetWriters; +import org.apache.iceberg.spark.data.SparkPlannedAvroReader; +import org.apache.iceberg.spark.data.vectorized.VectorizedSparkOrcReaders; +import org.apache.iceberg.spark.data.vectorized.VectorizedSparkParquetReaders; +import org.apache.spark.sql.catalyst.InternalRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.vectorized.ColumnarBatch; + +public class SparkFormatModels { + public static void register() { + FormatModelRegistry.register( + new AvroFormatModel<>( + InternalRow.class, + StructType.class, + SparkPlannedAvroReader::create, + (avroSchema, inputSchema) -> new SparkAvroWriter(inputSchema))); + + FormatModelRegistry.register( + new ParquetFormatModel<>( + InternalRow.class, + StructType.class, + SparkParquetReaders::buildReader, + (icebergSchema, messageType, inputType) -> + SparkParquetWriters.buildWriter(inputType, messageType))); + + FormatModelRegistry.register( + new ParquetFormatModel<>( + ColumnarBatch.class, StructType.class, VectorizedSparkParquetReaders::buildReader)); + + FormatModelRegistry.register( + new ORCFormatModel<>( + InternalRow.class, + StructType.class, + SparkOrcReader::new, + (schema, typeDescription, unused) -> new SparkOrcWriter(schema, typeDescription))); + + FormatModelRegistry.register( + new ORCFormatModel<>( + ColumnarBatch.class, StructType.class, VectorizedSparkOrcReaders::buildReader)); + } + + private SparkFormatModels() {} +} From 7117cc9de00650d0a22ae5ac6d805b40dd5788eb Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Sat, 22 Nov 2025 08:02:45 +0100 Subject: [PATCH 02/15] Using Parquet.icebergSchema instead of keeping track of it in the ParquetFormatModel.ReadBuilderWrapper --- .../org/apache/iceberg/parquet/Parquet.java | 67 +++++++++++++++++-- .../iceberg/parquet/ParquetFormatModel.java | 7 +- 2 files changed, 66 insertions(+), 8 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 1957ad209f6a..b67b5db69c9c 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -1174,7 +1174,7 @@ public static class ReadBuilder implements InternalData.ReadBuilder { private Schema schema = null; private Expression filter = null; private ReadSupport readSupport = null; - private Function> batchedReaderFunc = null; + private BatchReaderFunction batchedReaderFunc = null; private ReaderFunction readerFunction = null; private boolean filterRecords = true; private boolean caseSensitive = true; @@ 
-1240,6 +1240,50 @@ public ReaderFunction withSchema(Schema expectedSchema) { } } + public interface BatchReaderFunction { + Function> apply(); + + default BatchReaderFunction withSchema(Schema schema) { + return this; + } + } + + private static class UnaryBatchReaderFunction implements BatchReaderFunction { + private final Function> readerFunc; + + UnaryBatchReaderFunction(Function> readerFunc) { + this.readerFunc = readerFunc; + } + + @Override + public Function> apply() { + return readerFunc; + } + } + + private static class BinaryBatchReaderFunction implements BatchReaderFunction { + private final BiFunction> readerFuncWithSchema; + private Schema schema; + + BinaryBatchReaderFunction( + BiFunction> readerFuncWithSchema) { + this.readerFuncWithSchema = readerFuncWithSchema; + } + + @Override + public Function> apply() { + Preconditions.checkArgument( + schema != null, "Schema must be set for 2-argument reader function"); + return messageType -> readerFuncWithSchema.apply(schema, messageType); + } + + @Override + public BinaryBatchReaderFunction withSchema(Schema expectedSchema) { + this.schema = expectedSchema; + return this; + } + } + private ReadBuilder(InputFile file) { this.file = file; } @@ -1314,14 +1358,27 @@ public ReadBuilder createReaderFunc( return this; } - public ReadBuilder createBatchedReaderFunc(Function> func) { + public ReadBuilder createBatchedReaderFunc( + Function> newReaderFunction) { + Preconditions.checkArgument( + this.batchedReaderFunc == null, + "Cannot set batched reader function: batched reader function already set"); + Preconditions.checkArgument( + this.readerFunction == null, + "Cannot set batched reader function: ReaderFunction already set"); + this.batchedReaderFunc = new UnaryBatchReaderFunction(newReaderFunction); + return this; + } + + public ReadBuilder createBatchedReaderFunc( + BiFunction> newReaderFunction) { Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set batched reader function: batched reader function already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set batched reader function: ReaderFunction already set"); - this.batchedReaderFunc = func; + this.batchedReaderFunc = new BinaryBatchReaderFunction(newReaderFunction); return this; } @@ -1441,11 +1498,13 @@ public CloseableIterable build() { } if (batchedReaderFunc != null) { + Function> readBuilder = + batchedReaderFunc.withSchema(schema).apply(); return new VectorizedParquetReader<>( file, schema, options, - batchedReaderFunc, + readBuilder, mapping, filter, reuseContainers, diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index aa5afecf89cd..9a854ec614c7 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -245,7 +245,6 @@ private static class ReadBuilderWrapper implements ReadBuilder { private final ReaderFunction readerFunction; private final BatchReaderFunction batchReaderFunction; private final Map config = Maps.newHashMap(); - private Schema icebergSchema; private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( @@ -265,7 +264,6 @@ public ReadBuilder split(long newStart, long newLength) { @Override public ReadBuilder project(Schema schema) { - this.icebergSchema = schema; internal.project(schema); return this; } @@ -318,12 +316,13 @@ public CloseableIterable build() { if (readerFunction != 
null) { return internal .createReaderFunc( - messageType -> readerFunction.read(icebergSchema, messageType, idToConstant)) + (icebergSchema, messageType) -> + readerFunction.read(icebergSchema, messageType, idToConstant)) .build(); } else if (batchReaderFunction != null) { return internal .createBatchedReaderFunc( - messageType -> + (icebergSchema, messageType) -> batchReaderFunction.read(icebergSchema, messageType, idToConstant, config)) .build(); } else { From b41cb69aa99e8c5d34b4536c901e4defe7681f4b Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Mon, 8 Dec 2025 20:19:01 +0100 Subject: [PATCH 03/15] Create a marker class for the Comet reader and a few extra nits --- .../arrow/vectorized/ArrowFormatModels.java | 2 +- .../apache/iceberg/formats/FormatModel.java | 2 +- .../iceberg/formats/FormatModelRegistry.java | 6 ++--- .../main/java/org/apache/iceberg/orc/ORC.java | 2 +- .../apache/iceberg/orc/ORCFormatModel.java | 7 +++-- .../iceberg/parquet/ParquetFormatModel.java | 18 +++++-------- .../VectorizedSparkParquetReaders.java | 27 +++++++------------ .../iceberg/spark/source/BaseBatchReader.java | 22 ++++++++------- .../spark/source/SparkFormatModels.java | 6 +++++ 9 files changed, 45 insertions(+), 47 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java index 2036ce6f4d9b..4996c9533c11 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java @@ -28,7 +28,7 @@ public static void register() { new ParquetFormatModel<>( ColumnarBatch.class, Object.class, - (schema, messageType, constantValues, properties) -> + (schema, messageType, idToConstant) -> ArrowReader.VectorizedCombinedScanIterator.buildReader( schema, messageType, /* setArrowValidityVector */ diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModel.java b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java index 84ca467b627e..c8164aba1d8f 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModel.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java @@ -58,7 +58,7 @@ public interface FormatModel { * * @return the type of the data structures handled by this model implementation */ - Class type(); + Class type(); /** * Return the schema type class for the object model implementation processed by this factory. 
diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java index df78b4a2a7b7..d1b2ddaac46b 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -123,7 +123,7 @@ public static synchronized void register(FormatModel formatModel) { * @return a configured reader builder for the specified format and object model */ public static ReadBuilder readBuilder( - FileFormat format, Class type, InputFile inputFile) { + FileFormat format, Class type, InputFile inputFile) { FormatModel factory = factoryFor(format, type); return factory.readBuilder(inputFile); } @@ -144,7 +144,7 @@ public static ReadBuilder readBuilder( * @return a configured data write builder for creating a {@link DataWriter} */ public static DataWriteBuilder dataWriteBuilder( - FileFormat format, Class type, EncryptedOutputFile outputFile) { + FileFormat format, Class type, EncryptedOutputFile outputFile) { FormatModel factory = factoryFor(format, type); return CommonWriteBuilderImpl.forDataFile( factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); @@ -198,7 +198,7 @@ public static PositionDeleteWriteBuilder positionDeleteWriteBuilder( } @SuppressWarnings("unchecked") - private static FormatModel factoryFor(FileFormat format, Class type) { + private static FormatModel factoryFor(FileFormat format, Class type) { FormatModel model = (FormatModel) MODELS.get(Pair.of(format, type)); Preconditions.checkArgument( model != null, "Format model is not registered for format %s and type %s", format, type); diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index 0d149b5c4349..6eb53db28de6 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -780,7 +780,7 @@ public ReadBuilder withNameMapping(NameMapping newNameMapping) { return this; } - ReadBuilder constantValues(Set newConstantFieldIds) { + ReadBuilder constantFieldIds(Set newConstantFieldIds) { this.constantFieldIds = newConstantFieldIds; return this; } diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index b39d9b8790fb..0bc719c10923 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -103,14 +103,13 @@ public ReadBuilder readBuilder(InputFile inputFile) { @FunctionalInterface public interface ReaderFunction { - OrcRowReader read( - Schema schema, TypeDescription messageType, Map constantValues); + OrcRowReader read(Schema schema, TypeDescription messageType, Map idToConstant); } @FunctionalInterface public interface BatchReaderFunction { OrcBatchReader read( - Schema schema, TypeDescription messageType, Map constantValues); + Schema schema, TypeDescription messageType, Map idToConstant); } @FunctionalInterface @@ -180,7 +179,7 @@ public ReadBuilder recordsPerBatch(int numRowsPerBatch) { @Override public ReadBuilder idToConstant(Map newIdToConstant) { - internal.constantValues(newIdToConstant.keySet()); + internal.constantFieldIds(newIdToConstant.keySet()); this.idToConstant = newIdToConstant; return this; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java 
b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 9a854ec614c7..5e90ee3d088d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -45,14 +45,14 @@ public class ParquetFormatModel implements FormatModel { public static final String WRITER_VERSION_KEY = "parquet.writer.version"; - private final Class type; + private final Class type; private final Class schemaType; private final ReaderFunction readerFunction; private final BatchReaderFunction batchReaderFunction; private final WriterFunction writerFunction; private ParquetFormatModel( - Class type, + Class type, Class schemaType, ReaderFunction readerFunction, BatchReaderFunction batchReaderFunction, @@ -77,7 +77,7 @@ public ParquetFormatModel( } public ParquetFormatModel( - Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { + Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { this(type, schemaType, null, batchReaderFunction, null); } @@ -87,7 +87,7 @@ public FileFormat format() { } @Override - public Class type() { + public Class type() { return type; } @@ -109,16 +109,12 @@ public ReadBuilder readBuilder(InputFile inputFile) { @FunctionalInterface public interface ReaderFunction { ParquetValueReader read( - Schema schema, MessageType messageType, Map constantValues); + Schema schema, MessageType messageType, Map idToConstant); } @FunctionalInterface public interface BatchReaderFunction { - VectorizedReader read( - Schema schema, - MessageType messageType, - Map constantValues, - Map config); + VectorizedReader read(Schema schema, MessageType messageType, Map idToConstant); } @FunctionalInterface @@ -323,7 +319,7 @@ public CloseableIterable build() { return internal .createBatchedReaderFunc( (icebergSchema, messageType) -> - batchReaderFunction.read(icebergSchema, messageType, idToConstant, config)) + batchReaderFunction.read(icebergSchema, messageType, idToConstant)) .build(); } else { throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set"); diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index 21764cddb7cc..aa1b496cf302 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -28,9 +28,9 @@ import org.apache.iceberg.arrow.vectorized.VectorizedReaderBuilder; import org.apache.iceberg.parquet.TypeWithSchemaVisitor; import org.apache.iceberg.parquet.VectorizedReader; -import org.apache.iceberg.spark.ParquetReaderType; import org.apache.iceberg.spark.SparkUtil; import org.apache.parquet.schema.MessageType; +import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,8 +43,6 @@ public class VectorizedSparkParquetReaders { private static final String ENABLE_NULL_CHECK_FOR_GET = "arrow.enable_null_check_for_get"; private static final String ENABLE_NULL_CHECK_FOR_GET_ENV = "ARROW_ENABLE_NULL_CHECK_FOR_GET"; - public static final String PARQUET_READER_TYPE = "parquet.reader.type"; - static { try { enableUnsafeMemoryAccess(); @@ -56,18 +54,6 @@ public class 
VectorizedSparkParquetReaders { private VectorizedSparkParquetReaders() {} - public static VectorizedReader buildReader( - Schema expectedSchema, - MessageType fileSchema, - Map idToConstant, - Map config) { - if (ParquetReaderType.COMET.name().equals(config.get(PARQUET_READER_TYPE))) { - return buildCometReader(expectedSchema, fileSchema, idToConstant); - } else { - return buildReader(expectedSchema, fileSchema, idToConstant); - } - } - public static ColumnarBatchReader buildReader( Schema expectedSchema, MessageType fileSchema, @@ -91,9 +77,9 @@ public static ColumnarBatchReader buildReader( return buildReader(expectedSchema, fileSchema, idToConstant, ArrowAllocation.rootAllocator()); } - public static CometColumnarBatchReader buildCometReader( + public static VectorizedReader buildCometReader( Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (CometColumnarBatchReader) + return (VectorizedReader) TypeWithSchemaVisitor.visit( expectedSchema.asStruct(), fileSchema, @@ -104,6 +90,13 @@ public static CometColumnarBatchReader buildCometReader( readers -> new CometColumnarBatchReader(readers, expectedSchema))); } + /** A subclass of ColumnarBatch to identify Comet readers. */ + public static class CometColumnarBatch extends ColumnarBatch { + public CometColumnarBatch(ColumnVector[] columns) { + super(columns); + } + } + // enables unsafe memory access to avoid costly checks to see if index is within bounds // as long as it is not configured explicitly (see BoundsChecking in Arrow) private static void enableUnsafeMemoryAccess() { diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java index 1147977825e6..110ce355a223 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/BaseBatchReader.java @@ -35,6 +35,7 @@ import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.spark.OrcBatchReadConf; import org.apache.iceberg.spark.ParquetBatchReadConf; +import org.apache.iceberg.spark.ParquetReaderType; import org.apache.iceberg.spark.data.vectorized.ColumnVectorWithFilter; import org.apache.iceberg.spark.data.vectorized.ColumnarBatchUtil; import org.apache.iceberg.spark.data.vectorized.UpdatableDeletedColumnVector; @@ -72,17 +73,20 @@ protected CloseableIterable newBatchIterable( Expression residual, Map idToConstant, @Nonnull SparkDeleteFilter deleteFilter) { - ReadBuilder readBuilder = - FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile); + ReadBuilder readBuilder; if (parquetConf != null) { readBuilder = - readBuilder - .recordsPerBatch(parquetConf.batchSize()) - .set( - VectorizedSparkParquetReaders.PARQUET_READER_TYPE, - parquetConf.readerType().name()); - } else if (orcConf != null) { - readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize()); + parquetConf.readerType() == ParquetReaderType.COMET + ? 
FormatModelRegistry.readBuilder( + format, VectorizedSparkParquetReaders.CometColumnarBatch.class, inputFile) + : FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile); + + readBuilder = readBuilder.recordsPerBatch(parquetConf.batchSize()); + } else { + readBuilder = FormatModelRegistry.readBuilder(format, ColumnarBatch.class, inputFile); + if (orcConf != null) { + readBuilder = readBuilder.recordsPerBatch(orcConf.batchSize()); + } } CloseableIterable iterable = diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index fb1b37b08cd2..63229a89d0ed 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -55,6 +55,12 @@ public static void register() { new ParquetFormatModel<>( ColumnarBatch.class, StructType.class, VectorizedSparkParquetReaders::buildReader)); + FormatModelRegistry.register( + new ParquetFormatModel<>( + VectorizedSparkParquetReaders.CometColumnarBatch.class, + StructType.class, + VectorizedSparkParquetReaders::buildCometReader)); + FormatModelRegistry.register( new ORCFormatModel<>( InternalRow.class, From 43282867f8b08cfdba16f3885597c726682d5ace Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 15 Jan 2026 11:57:15 +0100 Subject: [PATCH 04/15] Fix for Ryan's simple comments --- .../java/org/apache/iceberg/avro/AvroFormatModel.java | 4 +--- .../apache/iceberg/parquet/ParquetFormatModel.java | 11 +++++------ .../iceberg/parquet/VectorizedParquetReader.java | 8 ++++---- .../java/org/apache/iceberg/parquet/TestParquet.java | 2 +- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 3b6355a5104b..5d06e6837e18 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -86,7 +86,6 @@ public ReadBuilder readBuilder(InputFile inputFile) { private static class ReadBuilderWrapper implements ReadBuilder { private final Avro.ReadBuilder internal; private final BiFunction, DatumReader> readerFunction; - private Schema icebergSchema; private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( @@ -103,7 +102,6 @@ public ReadBuilder split(long newStart, long newLength) { @Override public ReadBuilder project(Schema schema) { - this.icebergSchema = schema; internal.project(schema); return this; } @@ -152,7 +150,7 @@ public ReadBuilder withNameMapping(org.apache.iceberg.mapping.NameMapping @Override public CloseableIterable build() { return internal - .createResolvingReader(unused -> readerFunction.apply(icebergSchema, idToConstant)) + .createResolvingReader(icebergSchema -> readerFunction.apply(icebergSchema, idToConstant)) .build(); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 5e90ee3d088d..27eb24c18199 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -37,8 +37,8 @@ import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.NameMapping; +import 
org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.MessageType; @@ -240,13 +240,15 @@ private static class ReadBuilderWrapper implements ReadBuilder { private final Parquet.ReadBuilder internal; private final ReaderFunction readerFunction; private final BatchReaderFunction batchReaderFunction; - private final Map config = Maps.newHashMap(); private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( InputFile inputFile, ReaderFunction readerFunction, BatchReaderFunction batchReaderFunction) { + Preconditions.checkArgument( + readerFunction == null || batchReaderFunction == null, + "Only one of readerFunction or batchReaderFunction can be non-null"); this.internal = Parquet.read(inputFile); this.readerFunction = readerFunction; this.batchReaderFunction = batchReaderFunction; @@ -278,7 +280,6 @@ public ReadBuilder filter(Expression filter) { @Override public ReadBuilder set(String key, String value) { - this.config.put(key, value); internal.set(key, value); return this; } @@ -315,14 +316,12 @@ public CloseableIterable build() { (icebergSchema, messageType) -> readerFunction.read(icebergSchema, messageType, idToConstant)) .build(); - } else if (batchReaderFunction != null) { + } else { return internal .createBatchedReaderFunc( (icebergSchema, messageType) -> batchReaderFunction.read(icebergSchema, messageType, idToConstant)) .build(); - } else { - throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set"); } } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java index 6d4e64603771..fc10a57ec0e0 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/VectorizedParquetReader.java @@ -46,7 +46,7 @@ public class VectorizedParquetReader extends CloseableGroup implements Closea private final Function> batchReaderFunc; private final Expression filter; private final boolean reuseContainers; - private final boolean filterCaseSensitive; + private final boolean caseSensitive; private final int batchSize; private final NameMapping nameMapping; @@ -58,7 +58,7 @@ public VectorizedParquetReader( NameMapping nameMapping, Expression filter, boolean reuseContainers, - boolean filterCaseSensitive, + boolean caseSensitive, int maxRecordsPerBatch) { this.input = input; this.expectedSchema = expectedSchema; @@ -67,7 +67,7 @@ public VectorizedParquetReader( // replace alwaysTrue with null to avoid extra work evaluating a trivial filter this.filter = filter == Expressions.alwaysTrue() ? 
null : filter; this.reuseContainers = reuseContainers; - this.filterCaseSensitive = filterCaseSensitive; + this.caseSensitive = caseSensitive; this.batchSize = maxRecordsPerBatch; this.nameMapping = nameMapping; } @@ -86,7 +86,7 @@ private ReadConf init() { batchReaderFunc, nameMapping, reuseContainers, - filterCaseSensitive, + caseSensitive, batchSize); this.conf = readConf.copy(); return readConf; diff --git a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java index 3132929fb330..58850ec7c9f4 100644 --- a/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java +++ b/parquet/src/test/java/org/apache/iceberg/parquet/TestParquet.java @@ -218,7 +218,7 @@ public void testTwoLevelList() throws IOException { writer.close(); GenericData.Record recordRead = - Iterables.getOnlyElement( + Iterables.getOnlyElement( Parquet.read(Files.localInput(testFile)).project(schema).callInit().build()); assertThat(recordRead.get("arraybytes")).isEqualTo(expectedByteList); From 9627db83cd6d54dc2c36f1d8c5029bedbd9a0545 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 15 Jan 2026 12:28:51 +0100 Subject: [PATCH 05/15] Parquet vectorized read to use BiFunction --- .../org/apache/iceberg/parquet/Parquet.java | 82 +++++++------------ 1 file changed, 29 insertions(+), 53 deletions(-) diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index b67b5db69c9c..44f0459da531 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -1174,7 +1174,8 @@ public static class ReadBuilder implements InternalData.ReadBuilder { private Schema schema = null; private Expression filter = null; private ReadSupport readSupport = null; - private BatchReaderFunction batchedReaderFunc = null; + private Function> batchedReaderFunc = null; + private BiFunction> batchedReaderFuncWithSchema = null; private ReaderFunction readerFunction = null; private boolean filterRecords = true; private boolean caseSensitive = true; @@ -1240,50 +1241,6 @@ public ReaderFunction withSchema(Schema expectedSchema) { } } - public interface BatchReaderFunction { - Function> apply(); - - default BatchReaderFunction withSchema(Schema schema) { - return this; - } - } - - private static class UnaryBatchReaderFunction implements BatchReaderFunction { - private final Function> readerFunc; - - UnaryBatchReaderFunction(Function> readerFunc) { - this.readerFunc = readerFunc; - } - - @Override - public Function> apply() { - return readerFunc; - } - } - - private static class BinaryBatchReaderFunction implements BatchReaderFunction { - private final BiFunction> readerFuncWithSchema; - private Schema schema; - - BinaryBatchReaderFunction( - BiFunction> readerFuncWithSchema) { - this.readerFuncWithSchema = readerFuncWithSchema; - } - - @Override - public Function> apply() { - Preconditions.checkArgument( - schema != null, "Schema must be set for 2-argument reader function"); - return messageType -> readerFuncWithSchema.apply(schema, messageType); - } - - @Override - public BinaryBatchReaderFunction withSchema(Schema expectedSchema) { - this.schema = expectedSchema; - return this; - } - } - private ReadBuilder(InputFile file) { this.file = file; } @@ -1341,6 +1298,9 @@ public ReadBuilder createReaderFunc( Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set reader function: batched reader 
function already set"); + Preconditions.checkArgument( + this.batchedReaderFuncWithSchema == null, + "Cannot set reader function: batched reader function with schema already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set reader function: reader function already set"); this.readerFunction = new UnaryReaderFunction(newReaderFunction); @@ -1352,6 +1312,9 @@ public ReadBuilder createReaderFunc( Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set reader function: batched reader function already set"); + Preconditions.checkArgument( + this.batchedReaderFuncWithSchema == null, + "Cannot set reader function: batched reader function with schema already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set reader function: reader function already set"); this.readerFunction = new BinaryReaderFunction(newReaderFunction); @@ -1363,10 +1326,13 @@ public ReadBuilder createBatchedReaderFunc( Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set batched reader function: batched reader function already set"); + Preconditions.checkArgument( + this.batchedReaderFuncWithSchema == null, + "Cannot set reader function: batched reader function with schema already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set batched reader function: ReaderFunction already set"); - this.batchedReaderFunc = new UnaryBatchReaderFunction(newReaderFunction); + this.batchedReaderFunc = newReaderFunction; return this; } @@ -1375,10 +1341,13 @@ public ReadBuilder createBatchedReaderFunc( Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set batched reader function: batched reader function already set"); + Preconditions.checkArgument( + this.batchedReaderFuncWithSchema == null, + "Cannot set reader function: batched reader function with schema already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set batched reader function: ReaderFunction already set"); - this.batchedReaderFunc = new BinaryBatchReaderFunction(newReaderFunction); + this.batchedReaderFuncWithSchema = newReaderFunction; return this; } @@ -1386,6 +1355,9 @@ public ReadBuilder createReaderFunc(ReaderFunction reader) { Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set reader function: batched reader function already set"); + Preconditions.checkArgument( + this.batchedReaderFuncWithSchema == null, + "Cannot set reader function: batched reader function with schema already set"); Preconditions.checkArgument( this.readerFunction == null, "Cannot set reader function: reader function already set"); this.readerFunction = reader; @@ -1445,7 +1417,7 @@ public ReadBuilder withAADPrefix(ByteBuffer aadPrefix) { } @Override - @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity"}) + @SuppressWarnings({"unchecked", "checkstyle:CyclomaticComplexity", "checkstyle:MethodLength"}) public CloseableIterable build() { FileDecryptionProperties fileDecryptionProperties = null; if (fileEncryptionKey != null) { @@ -1460,7 +1432,9 @@ public CloseableIterable build() { Preconditions.checkState(fileAADPrefix == null, "AAD prefix set with null encryption key"); } - if (batchedReaderFunc != null || readerFunction != null) { + if (batchedReaderFunc != null + || batchedReaderFuncWithSchema != null + || readerFunction != null) { ParquetReadOptions.Builder optionsBuilder; if (file instanceof HadoopInputFile) { // remove read properties already set that may conflict with this read @@ -1497,14 +1471,16 @@ public 
CloseableIterable build() { mapping = NameMapping.empty(); } - if (batchedReaderFunc != null) { - Function> readBuilder = - batchedReaderFunc.withSchema(schema).apply(); + Function> batchedReaderBuilder = + batchedReaderFuncWithSchema != null + ? messageType -> batchedReaderFuncWithSchema.apply(schema, messageType) + : batchedReaderFunc; + if (batchedReaderBuilder != null) { return new VectorizedParquetReader<>( file, schema, options, - readBuilder, + batchedReaderBuilder, mapping, filter, reuseContainers, From 0f7010e8b606378837dd9326c1087911a1227b89 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 15 Jan 2026 13:16:34 +0100 Subject: [PATCH 06/15] Checks are not needed anymore in GenericFormatModels --- .../iceberg/data/GenericFormatModels.java | 81 ++++++------------- 1 file changed, 24 insertions(+), 57 deletions(-) diff --git a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java index 540b98c483e9..8d88bea03b72 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java @@ -30,70 +30,37 @@ import org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.orc.ORCFormatModel; import org.apache.iceberg.parquet.ParquetFormatModel; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class GenericFormatModels { - private static final Logger LOG = LoggerFactory.getLogger(GenericFormatModels.class); - public static void register() { - // ORC, Parquet are optional dependencies. If they are not present, we should just log and - // ignore NoClassDefFoundErrors - registerAvro(); - registerParquet(); - registerOrc(); - } + FormatModelRegistry.register( + new ParquetFormatModel<>( + Record.class, + Schema.class, + GenericParquetReaders::buildReader, + (schema, messageType, inputType) -> GenericParquetWriter.create(schema, messageType))); - private static void registerParquet() { - logAngIgnoreNoClassDefFoundError( - () -> - FormatModelRegistry.register( - new ParquetFormatModel<>( - Record.class, - Schema.class, - GenericParquetReaders::buildReader, - (schema, messageType, inputType) -> - GenericParquetWriter.create(schema, messageType)))); - logAngIgnoreNoClassDefFoundError( - () -> FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class))); - } + FormatModelRegistry.register( + new AvroFormatModel<>( + Record.class, + Schema.class, + PlannedDataReader::create, + (schema, inputSchema) -> DataWriter.create(schema))); - private static void registerAvro() { - logAngIgnoreNoClassDefFoundError( - () -> - FormatModelRegistry.register( - new AvroFormatModel<>( - Record.class, - Schema.class, - PlannedDataReader::create, - (schema, inputSchema) -> DataWriter.create(schema)))); - logAngIgnoreNoClassDefFoundError( - () -> FormatModelRegistry.register(new AvroFormatModel<>(PositionDelete.class))); - } + FormatModelRegistry.register( + new ORCFormatModel<>( + Record.class, + Schema.class, + GenericOrcReader::buildReader, + (schema, typeDescription, unused) -> + GenericOrcWriter.buildWriter(schema, typeDescription))); - private static void registerOrc() { - logAngIgnoreNoClassDefFoundError( - () -> - FormatModelRegistry.register( - new ORCFormatModel<>( - Record.class, - Schema.class, - GenericOrcReader::buildReader, - (schema, typeDescription, unused) -> - GenericOrcWriter.buildWriter(schema, typeDescription)))); - logAngIgnoreNoClassDefFoundError( - () -> 
FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class))); - } + FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class)); - private GenericFormatModels() {} + FormatModelRegistry.register(new AvroFormatModel<>(PositionDelete.class)); - @SuppressWarnings("CatchBlockLogException") - private static void logAngIgnoreNoClassDefFoundError(Runnable runnable) { - try { - runnable.run(); - } catch (NoClassDefFoundError e) { - // Log the exception and ignore it - LOG.info("Exception occurred when trying to register format models: {}", e.getMessage()); - } + FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class)); } + + private GenericFormatModels() {} } From a11216f696b1e26d600667d5915374a54525ba15 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 15 Jan 2026 15:09:59 +0100 Subject: [PATCH 07/15] Refactor FormatModels to inherit from the same Base class --- .../arrow/vectorized/ArrowFormatModels.java | 6 +- .../apache/iceberg/avro/AvroFormatModel.java | 195 ++++++------- .../iceberg/formats/BaseFormatModel.java | 75 +++++ .../iceberg/data/GenericFormatModels.java | 28 +- .../iceberg/flink/data/FlinkFormatModels.java | 18 +- .../main/java/org/apache/iceberg/orc/ORC.java | 2 +- .../apache/iceberg/orc/ORCFormatModel.java | 258 ++++++++---------- .../iceberg/parquet/ParquetFormatModel.java | 116 +++----- .../VectorizedSparkParquetReaders.java | 2 +- .../spark/source/SparkFormatModels.java | 33 ++- 10 files changed, 365 insertions(+), 368 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java index 4996c9533c11..8e814a5bddf5 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java @@ -28,11 +28,11 @@ public static void register() { new ParquetFormatModel<>( ColumnarBatch.class, Object.class, - (schema, messageType, idToConstant) -> + (schema, fileSchema, engineSchema, idToConstant) -> ArrowReader.VectorizedCombinedScanIterator.buildReader( schema, - messageType, /* setArrowValidityVector */ - NullCheckingForGet.NULL_CHECKING_ENABLED))); + fileSchema, + NullCheckingForGet.NULL_CHECKING_ENABLED /* setArrowValidityVector */))); } private ArrowFormatModels() {} diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 5d06e6837e18..386b916f063c 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -20,16 +20,15 @@ import java.nio.ByteBuffer; import java.util.Map; -import java.util.function.BiFunction; +import org.apache.avro.Schema; import org.apache.avro.io.DatumReader; import org.apache.avro.io.DatumWriter; import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.Schema; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.CloseableIterable; @@ -37,25 +36,19 @@ import 
org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -public class AvroFormatModel implements FormatModel { - private final Class type; - private final Class schemaType; - private final BiFunction, DatumReader> readerFunction; - private final BiFunction> writerFunction; +public class AvroFormatModel + extends BaseFormatModel, DatumReader, Schema> { public AvroFormatModel(Class type) { - this(type, null, null, null); + super(type, null, null, null, false); } public AvroFormatModel( Class type, Class schemaType, - BiFunction, DatumReader> readerFunction, - BiFunction> writerFunction) { - this.type = type; - this.schemaType = schemaType; - this.readerFunction = readerFunction; - this.writerFunction = writerFunction; + WriterFunction, S, Schema> writerFunction, + ReaderFunction, S, Schema> readerFunction) { + super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); } @Override @@ -63,113 +56,30 @@ public FileFormat format() { return FileFormat.AVRO; } - @Override - public Class type() { - return type; - } - - @Override - public Class schemaType() { - return schemaType; - } - @Override public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { - return new WriteBuilderWrapper<>(outputFile, writerFunction); + return new WriteBuilderWrapper<>(outputFile, writerFunction()); } @Override public ReadBuilder readBuilder(InputFile inputFile) { - return new ReadBuilderWrapper<>(inputFile, readerFunction); - } - - private static class ReadBuilderWrapper implements ReadBuilder { - private final Avro.ReadBuilder internal; - private final BiFunction, DatumReader> readerFunction; - private Map idToConstant = ImmutableMap.of(); - - private ReadBuilderWrapper( - InputFile inputFile, BiFunction, DatumReader> readerFunction) { - this.internal = Avro.read(inputFile); - this.readerFunction = readerFunction; - } - - @Override - public ReadBuilder split(long newStart, long newLength) { - internal.split(newStart, newLength); - return this; - } - - @Override - public ReadBuilder project(Schema schema) { - internal.project(schema); - return this; - } - - @Override - public ReadBuilder caseSensitive(boolean caseSensitive) { - // Filtering is not supported in Avro reader, so case sensitivity does not matter - return this; - } - - @Override - public ReadBuilder filter(Expression filter) { - // Filtering is not supported in Avro reader - return this; - } - - @Override - public ReadBuilder set(String key, String value) { - // Configuration is not used for Avro reader creation - return this; - } - - @Override - public ReadBuilder reuseContainers() { - internal.reuseContainers(); - return this; - } - - @Override - public ReadBuilder recordsPerBatch(int numRowsPerBatch) { - throw new UnsupportedOperationException("Batch reading is not supported in Avro reader"); - } - - @Override - public ReadBuilder idToConstant(Map newIdToConstant) { - this.idToConstant = newIdToConstant; - return this; - } - - @Override - public ReadBuilder withNameMapping(org.apache.iceberg.mapping.NameMapping nameMapping) { - internal.withNameMapping(nameMapping); - return this; - } - - @Override - public CloseableIterable build() { - return internal - .createResolvingReader(icebergSchema -> readerFunction.apply(icebergSchema, idToConstant)) - .build(); - } + return new ReadBuilderWrapper<>(inputFile, readerFunction()); } private static class WriteBuilderWrapper implements WriteBuilder { private final Avro.WriteBuilder internal; - private final BiFunction> 
writerFunction; + private final WriterFunction, S, Schema> writerFunction; private S inputSchema; private FileContent content; private WriteBuilderWrapper( - EncryptedOutputFile outputFile, - BiFunction> writerFunction) { + EncryptedOutputFile outputFile, WriterFunction, S, Schema> writerFunction) { this.internal = Avro.write(outputFile.encryptingOutputFile()); this.writerFunction = writerFunction; } @Override - public WriteBuilder schema(Schema schema) { + public WriteBuilder schema(org.apache.iceberg.Schema schema) { internal.schema(schema); return this; } @@ -237,11 +147,13 @@ public org.apache.iceberg.io.FileAppender build() throws java.io.IOException switch (content) { case DATA: internal.createContextFunc(Avro.WriteBuilder.Context::dataContext); - internal.createWriterFunc(avroSchema -> writerFunction.apply(avroSchema, inputSchema)); + internal.createWriterFunc( + avroSchema -> writerFunction.write(null, avroSchema, inputSchema)); break; case EQUALITY_DELETES: internal.createContextFunc(Avro.WriteBuilder.Context::deleteContext); - internal.createWriterFunc(avroSchema -> writerFunction.apply(avroSchema, inputSchema)); + internal.createWriterFunc( + avroSchema -> writerFunction.write(null, avroSchema, inputSchema)); break; case POSITION_DELETES: internal.createContextFunc(Avro.WriteBuilder.Context::deleteContext); @@ -255,4 +167,77 @@ public org.apache.iceberg.io.FileAppender build() throws java.io.IOException return internal.build(); } } + + private static class ReadBuilderWrapper implements ReadBuilder { + private final Avro.ReadBuilder internal; + private final ReaderFunction, S, Schema> readerFunction; + private Map idToConstant = ImmutableMap.of(); + + private ReadBuilderWrapper( + InputFile inputFile, ReaderFunction, S, Schema> readerFunction) { + this.internal = Avro.read(inputFile); + this.readerFunction = readerFunction; + } + + @Override + public ReadBuilder split(long newStart, long newLength) { + internal.split(newStart, newLength); + return this; + } + + @Override + public ReadBuilder project(org.apache.iceberg.Schema schema) { + internal.project(schema); + return this; + } + + @Override + public ReadBuilder caseSensitive(boolean caseSensitive) { + // Filtering is not supported in Avro reader, so case sensitivity does not matter + return this; + } + + @Override + public ReadBuilder filter(Expression filter) { + // Filtering is not supported in Avro reader + return this; + } + + @Override + public ReadBuilder set(String key, String value) { + // Configuration is not used for Avro reader creation + return this; + } + + @Override + public ReadBuilder reuseContainers() { + internal.reuseContainers(); + return this; + } + + @Override + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + throw new UnsupportedOperationException("Batch reading is not supported in Avro reader"); + } + + @Override + public ReadBuilder idToConstant(Map newIdToConstant) { + this.idToConstant = newIdToConstant; + return this; + } + + @Override + public ReadBuilder withNameMapping(org.apache.iceberg.mapping.NameMapping nameMapping) { + internal.withNameMapping(nameMapping); + return this; + } + + @Override + public CloseableIterable build() { + return internal + .createResolvingReader( + icebergSchema -> readerFunction.read(icebergSchema, null, null, idToConstant)) + .build(); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java new file mode 100644 index 000000000000..db18cf840c2d --- 
/dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.util.Map; +import org.apache.iceberg.Schema; + +public abstract class BaseFormatModel implements FormatModel { + private final Class type; + private final Class schemaType; + private final WriterFunction writerFunction; + private final ReaderFunction readerFunction; + private final boolean batchReader; + + public BaseFormatModel( + Class type, + Class schemaType, + WriterFunction writerFunction, + ReaderFunction readerFunction, + boolean batchReader) { + this.type = type; + this.schemaType = schemaType; + this.writerFunction = writerFunction; + this.readerFunction = readerFunction; + this.batchReader = batchReader; + } + + @Override + public Class type() { + return type; + } + + @Override + public Class schemaType() { + return schemaType; + } + + protected WriterFunction writerFunction() { + return writerFunction; + } + + protected ReaderFunction readerFunction() { + return readerFunction; + } + + protected boolean batchReader() { + return batchReader; + } + + @FunctionalInterface + public interface WriterFunction { + W write(Schema icebergSchema, F fileSchema, S engineSchema); + } + + @FunctionalInterface + public interface ReaderFunction { + R read(Schema icebergSchema, F fileSchema, S engineSchema, Map idToConstant); + } +} diff --git a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java index 8d88bea03b72..120a5d50fac4 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java @@ -37,29 +37,33 @@ public static void register() { new ParquetFormatModel<>( Record.class, Schema.class, - GenericParquetReaders::buildReader, - (schema, messageType, inputType) -> GenericParquetWriter.create(schema, messageType))); + (icebergSchema, fileSchema, engineSchema) -> + GenericParquetWriter.create(icebergSchema, fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); + + FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class)); FormatModelRegistry.register( new AvroFormatModel<>( Record.class, Schema.class, - PlannedDataReader::create, - (schema, inputSchema) -> DataWriter.create(schema))); + (icebergSchema, fileSchema, engineSchema) -> DataWriter.create(fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + PlannedDataReader.create(icebergSchema, idToConstant))); + + FormatModelRegistry.register(new 
AvroFormatModel<>(PositionDelete.class)); FormatModelRegistry.register( new ORCFormatModel<>( Record.class, Schema.class, - GenericOrcReader::buildReader, - (schema, typeDescription, unused) -> - GenericOrcWriter.buildWriter(schema, typeDescription))); - - FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class)); - - FormatModelRegistry.register(new AvroFormatModel<>(PositionDelete.class)); + (icebergSchema, fileSchema, engineSchema) -> + GenericOrcWriter.buildWriter(icebergSchema, fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + GenericOrcReader.buildReader(icebergSchema, fileSchema, idToConstant))); - FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class)); + FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class)); } private GenericFormatModels() {} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java index ee307b7a7c71..a35cb98c7250 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java @@ -31,23 +31,27 @@ public static void register() { new ParquetFormatModel<>( RowData.class, RowType.class, - FlinkParquetReaders::buildReader, - (unused, messageType, rowType) -> - FlinkParquetWriters.buildWriter(rowType, messageType))); + (icebergSchema, fileSchema, engineSchema) -> + FlinkParquetWriters.buildWriter(engineSchema, fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + FlinkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( new AvroFormatModel<>( RowData.class, RowType.class, - FlinkPlannedAvroReader::create, - (unused, rowType) -> new FlinkAvroWriter(rowType))); + (icebergSchema, fileSchema, engineSchema) -> new FlinkAvroWriter(engineSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + FlinkPlannedAvroReader.create(icebergSchema, idToConstant))); FormatModelRegistry.register( new ORCFormatModel<>( RowData.class, RowType.class, - FlinkOrcReader::new, - (schema, unused, rowType) -> FlinkOrcWriter.buildWriter(rowType, schema))); + (icebergSchema, fileSchema, engineSchema) -> + FlinkOrcWriter.buildWriter(engineSchema, icebergSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + new FlinkOrcReader(icebergSchema, fileSchema, idToConstant))); } private FlinkFormatModels() {} diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORC.java b/orc/src/main/java/org/apache/iceberg/orc/ORC.java index 6eb53db28de6..c92aaa020546 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORC.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORC.java @@ -191,7 +191,7 @@ WriteBuilder createContextFunc(Function, Context> newCreateC } public FileAppender build() { - // Preconditions.checkNotNull(schema, "Schema is required"); + Preconditions.checkNotNull(schema, "Schema is required"); for (Map.Entry entry : config.entrySet()) { this.conf.set(entry.getKey(), entry.getValue()); diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index 0bc719c10923..73036b04a37c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -29,7 +29,7 @@ import org.apache.iceberg.data.orc.GenericOrcWriters; import 
org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.DeleteSchemaUtil; @@ -39,41 +39,26 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.orc.TypeDescription; -public class ORCFormatModel implements FormatModel { - private final Class type; - private final Class schemaType; - private final ReaderFunction readerFunction; - private final BatchReaderFunction batchReaderFunction; - private final WriterFunction writerFunction; +public class ORCFormatModel + extends BaseFormatModel, R, TypeDescription> { - private ORCFormatModel( + public ORCFormatModel( Class type, Class schemaType, - ReaderFunction readerFunction, - BatchReaderFunction batchReaderFunction, - WriterFunction writerFunction) { - this.type = type; - this.schemaType = schemaType; - this.readerFunction = readerFunction; - this.batchReaderFunction = batchReaderFunction; - this.writerFunction = writerFunction; + WriterFunction, S, TypeDescription> writerFunction, + ReaderFunction readerFunction) { + super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); } public ORCFormatModel( Class type, Class schemaType, - ReaderFunction readerFunction, - WriterFunction writerFunction) { - this(type, schemaType, readerFunction, null, writerFunction); - } - - public ORCFormatModel( - Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { - this(type, schemaType, null, batchReaderFunction, null); + ReaderFunction batchReaderFunction) { + super(type, schemaType, null, batchReaderFunction, true /* batchReader */); } public ORCFormatModel(Class type) { - this(type, null, null, null, null); + super(type, null, null, null, false /* batchReader */); } @Override @@ -81,143 +66,26 @@ public FileFormat format() { return FileFormat.ORC; } - @Override - public Class type() { - return type; - } - - @Override - public Class schemaType() { - return schemaType; - } - @Override public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { - return new WriteBuilderWrapper<>(outputFile, writerFunction); + return new WriteBuilderWrapper<>(outputFile, writerFunction()); } @Override public ReadBuilder readBuilder(InputFile inputFile) { - return new ReadBuilderWrapper<>(inputFile, readerFunction, batchReaderFunction); - } - - @FunctionalInterface - public interface ReaderFunction { - OrcRowReader read(Schema schema, TypeDescription messageType, Map idToConstant); - } - - @FunctionalInterface - public interface BatchReaderFunction { - OrcBatchReader read( - Schema schema, TypeDescription messageType, Map idToConstant); - } - - @FunctionalInterface - public interface WriterFunction { - OrcRowWriter write(Schema schema, TypeDescription messageType, E nativeSchema); - } - - private static class ReadBuilderWrapper implements ReadBuilder { - private final ORC.ReadBuilder internal; - private final ReaderFunction readerFunction; - private final BatchReaderFunction batchReaderFunction; - private boolean reuseContainers = false; - private Schema icebergSchema; - private Map idToConstant = ImmutableMap.of(); - - private ReadBuilderWrapper( - InputFile inputFile, - ReaderFunction readerFunction, - BatchReaderFunction batchReaderFunction) { - this.internal = ORC.read(inputFile); - this.readerFunction = readerFunction; - 
this.batchReaderFunction = batchReaderFunction; - } - - @Override - public ReadBuilder split(long newStart, long newLength) { - internal.split(newStart, newLength); - return this; - } - - @Override - public ReadBuilder project(Schema schema) { - this.icebergSchema = schema; - internal.project(schema); - return this; - } - - @Override - public ReadBuilder caseSensitive(boolean caseSensitive) { - internal.caseSensitive(caseSensitive); - return this; - } - - @Override - public ReadBuilder filter(Expression filter) { - internal.filter(filter); - return this; - } - - @Override - public ReadBuilder set(String key, String value) { - internal.config(key, value); - return this; - } - - @Override - public ReadBuilder reuseContainers() { - this.reuseContainers = true; - return this; - } - - @Override - public ReadBuilder recordsPerBatch(int numRowsPerBatch) { - internal.recordsPerBatch(numRowsPerBatch); - return this; - } - - @Override - public ReadBuilder idToConstant(Map newIdToConstant) { - internal.constantFieldIds(newIdToConstant.keySet()); - this.idToConstant = newIdToConstant; - return this; - } - - @Override - public ReadBuilder withNameMapping(NameMapping nameMapping) { - internal.withNameMapping(nameMapping); - return this; - } - - @Override - public org.apache.iceberg.io.CloseableIterable build() { - Preconditions.checkNotNull(reuseContainers, "Reuse containers is required for ORC read"); - if (readerFunction != null) { - return internal - .createReaderFunc( - typeDescription -> - readerFunction.read(icebergSchema, typeDescription, idToConstant)) - .build(); - } else if (batchReaderFunction != null) { - return internal - .createBatchedReaderFunc( - typeDescription -> - batchReaderFunction.read(icebergSchema, typeDescription, idToConstant)) - .build(); - } else { - throw new IllegalStateException("Either readerFunction or batchReaderFunction must be set"); - } - } + return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader()); } private static class WriteBuilderWrapper implements WriteBuilder { private final ORC.WriteBuilder internal; - private final WriterFunction writerFunction; + private final WriterFunction, S, TypeDescription> writerFunction; private S inputSchema; + private FileContent content; - private WriteBuilderWrapper(EncryptedOutputFile outputFile, WriterFunction writerFunction) { + private WriteBuilderWrapper( + EncryptedOutputFile outputFile, + WriterFunction, S, TypeDescription> writerFunction) { this.internal = ORC.write(outputFile); this.writerFunction = writerFunction; } @@ -313,4 +181,96 @@ public org.apache.iceberg.io.FileAppender build() { return internal.build(); } } + + private static class ReadBuilderWrapper implements ReadBuilder { + private final ORC.ReadBuilder internal; + private final ReaderFunction readerFunction; + private final boolean batchReader; + private boolean reuseContainers = false; + private Schema icebergSchema; + private Map idToConstant = ImmutableMap.of(); + + private ReadBuilderWrapper( + InputFile inputFile, + ReaderFunction readerFunction, + boolean batchReader) { + this.internal = ORC.read(inputFile); + this.readerFunction = readerFunction; + this.batchReader = batchReader; + } + + @Override + public ReadBuilder split(long newStart, long newLength) { + internal.split(newStart, newLength); + return this; + } + + @Override + public ReadBuilder project(Schema schema) { + this.icebergSchema = schema; + internal.project(schema); + return this; + } + + @Override + public ReadBuilder caseSensitive(boolean caseSensitive) { + 
internal.caseSensitive(caseSensitive); + return this; + } + + @Override + public ReadBuilder filter(Expression filter) { + internal.filter(filter); + return this; + } + + @Override + public ReadBuilder set(String key, String value) { + internal.config(key, value); + return this; + } + + @Override + public ReadBuilder reuseContainers() { + this.reuseContainers = true; + return this; + } + + @Override + public ReadBuilder recordsPerBatch(int numRowsPerBatch) { + internal.recordsPerBatch(numRowsPerBatch); + return this; + } + + @Override + public ReadBuilder idToConstant(Map newIdToConstant) { + internal.constantFieldIds(newIdToConstant.keySet()); + this.idToConstant = newIdToConstant; + return this; + } + + @Override + public ReadBuilder withNameMapping(NameMapping nameMapping) { + internal.withNameMapping(nameMapping); + return this; + } + + @Override + public org.apache.iceberg.io.CloseableIterable build() { + Preconditions.checkNotNull(reuseContainers, "Reuse containers is required for ORC read"); + return batchReader + ? internal + .createBatchedReaderFunc( + typeDescription -> + (OrcBatchReader) + readerFunction.read(icebergSchema, typeDescription, null, idToConstant)) + .build() + : internal + .createReaderFunc( + typeDescription -> + (OrcRowReader) + readerFunction.read(icebergSchema, typeDescription, null, idToConstant)) + .build(); + } + } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 27eb24c18199..62efabe11669 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -29,7 +29,7 @@ import org.apache.iceberg.data.parquet.GenericParquetWriter; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; -import org.apache.iceberg.formats.FormatModel; +import org.apache.iceberg.formats.BaseFormatModel; import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.CloseableIterable; @@ -37,48 +37,31 @@ import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.NameMapping; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.schema.MessageType; -public class ParquetFormatModel implements FormatModel { +public class ParquetFormatModel + extends BaseFormatModel, R, MessageType> { public static final String WRITER_VERSION_KEY = "parquet.writer.version"; - private final Class type; - private final Class schemaType; - private final ReaderFunction readerFunction; - private final BatchReaderFunction batchReaderFunction; - private final WriterFunction writerFunction; - - private ParquetFormatModel( - Class type, - Class schemaType, - ReaderFunction readerFunction, - BatchReaderFunction batchReaderFunction, - WriterFunction writerFunction) { - this.type = type; - this.schemaType = schemaType; - this.readerFunction = readerFunction; - this.batchReaderFunction = batchReaderFunction; - this.writerFunction = writerFunction; - } - public ParquetFormatModel(Class type) { - this(type, null, null, null); + super(type, null, null, null, false /* batchReader */); } public ParquetFormatModel( Class type, Class schemaType, - ReaderFunction 
readerFunction, - WriterFunction writerFunction) { - this(type, schemaType, readerFunction, null, writerFunction); + WriterFunction, S, MessageType> writerFunction, + ReaderFunction readerFunction) { + super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); } public ParquetFormatModel( - Class type, Class schemaType, BatchReaderFunction batchReaderFunction) { - this(type, schemaType, null, batchReaderFunction, null); + Class type, + Class schemaType, + ReaderFunction batchReaderFunction) { + super(type, schemaType, null, batchReaderFunction, true /* batchReader */); } @Override @@ -86,49 +69,25 @@ public FileFormat format() { return FileFormat.PARQUET; } - @Override - public Class type() { - return type; - } - - @Override - public Class schemaType() { - return schemaType; - } - @Override public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { - return new WriteBuilderWrapper<>(outputFile, writerFunction); + return new WriteBuilderWrapper<>(outputFile, writerFunction()); } @Override public ReadBuilder readBuilder(InputFile inputFile) { - return new ReadBuilderWrapper<>(inputFile, readerFunction, batchReaderFunction); - } - - @FunctionalInterface - public interface ReaderFunction { - ParquetValueReader read( - Schema schema, MessageType messageType, Map idToConstant); - } - - @FunctionalInterface - public interface BatchReaderFunction { - VectorizedReader read(Schema schema, MessageType messageType, Map idToConstant); - } - - @FunctionalInterface - public interface WriterFunction { - ParquetValueWriter write(Schema icebergSchema, MessageType messageType, S engineSchema); + return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader()); } private static class WriteBuilderWrapper implements WriteBuilder { private final Parquet.WriteBuilder internal; - private final WriterFunction writerFunction; + private final WriterFunction, S, MessageType> writerFunction; private S inputSchema; private FileContent content; - private WriteBuilderWrapper(EncryptedOutputFile outputFile, WriterFunction writerFunction) { + private WriteBuilderWrapper( + EncryptedOutputFile outputFile, + WriterFunction, S, MessageType> writerFunction) { this.internal = Parquet.write(outputFile); this.writerFunction = writerFunction; } @@ -236,22 +195,19 @@ public FileAppender build() throws IOException { } } - private static class ReadBuilderWrapper implements ReadBuilder { + private static class ReadBuilderWrapper implements ReadBuilder { private final Parquet.ReadBuilder internal; - private final ReaderFunction readerFunction; - private final BatchReaderFunction batchReaderFunction; + private final ReaderFunction readerFunction; + private final boolean batchReader; private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( InputFile inputFile, - ReaderFunction readerFunction, - BatchReaderFunction batchReaderFunction) { - Preconditions.checkArgument( - readerFunction == null || batchReaderFunction == null, - "Only one of readerFunction or batchReaderFunction can be non-null"); + ReaderFunction readerFunction, + boolean batchReader) { this.internal = Parquet.read(inputFile); this.readerFunction = readerFunction; - this.batchReaderFunction = batchReaderFunction; + this.batchReader = batchReader; } @Override @@ -310,19 +266,19 @@ public ReadBuilder withNameMapping(NameMapping nameMapping) { @Override public CloseableIterable build() { - if (readerFunction != null) { - return internal - .createReaderFunc( - (icebergSchema, messageType) -> - 
readerFunction.read(icebergSchema, messageType, idToConstant)) - .build(); - } else { - return internal - .createBatchedReaderFunc( - (icebergSchema, messageType) -> - batchReaderFunction.read(icebergSchema, messageType, idToConstant)) - .build(); - } + return batchReader + ? internal + .createBatchedReaderFunc( + (icebergSchema, messageType) -> + (VectorizedReader) + readerFunction.read(icebergSchema, messageType, null, idToConstant)) + .build() + : internal + .createReaderFunc( + (icebergSchema, messageType) -> + (ParquetValueReader) + readerFunction.read(icebergSchema, messageType, null, idToConstant)) + .build(); } } } diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java index aa1b496cf302..55f9fc1768a3 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkParquetReaders.java @@ -79,7 +79,7 @@ public static ColumnarBatchReader buildReader( public static VectorizedReader buildCometReader( Schema expectedSchema, MessageType fileSchema, Map idToConstant) { - return (VectorizedReader) + return (CometColumnarBatchReader) TypeWithSchemaVisitor.visit( expectedSchema.asStruct(), fileSchema, diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 63229a89d0ed..17ac7e0d81f5 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -40,37 +40,50 @@ public static void register() { new AvroFormatModel<>( InternalRow.class, StructType.class, - SparkPlannedAvroReader::create, - (avroSchema, inputSchema) -> new SparkAvroWriter(inputSchema))); + (icebergSchema, fileSchema, engineSchema) -> new SparkAvroWriter(engineSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + SparkPlannedAvroReader.create(icebergSchema, idToConstant))); FormatModelRegistry.register( new ParquetFormatModel<>( InternalRow.class, StructType.class, - SparkParquetReaders::buildReader, - (icebergSchema, messageType, inputType) -> - SparkParquetWriters.buildWriter(inputType, messageType))); + (icebergSchema, fileSchema, engineSchema) -> + SparkParquetWriters.buildWriter(engineSchema, fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( new ParquetFormatModel<>( - ColumnarBatch.class, StructType.class, VectorizedSparkParquetReaders::buildReader)); + ColumnarBatch.class, + StructType.class, + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + VectorizedSparkParquetReaders.buildReader( + icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( new ParquetFormatModel<>( VectorizedSparkParquetReaders.CometColumnarBatch.class, StructType.class, - VectorizedSparkParquetReaders::buildCometReader)); + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + VectorizedSparkParquetReaders.buildCometReader( + icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( new ORCFormatModel<>( InternalRow.class, StructType.class, - SparkOrcReader::new, - (schema, 
typeDescription, unused) -> new SparkOrcWriter(schema, typeDescription))); + (icebergSchema, fileSchema, engineSchema) -> + new SparkOrcWriter(icebergSchema, fileSchema), + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + new SparkOrcReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( new ORCFormatModel<>( - ColumnarBatch.class, StructType.class, VectorizedSparkOrcReaders::buildReader)); + ColumnarBatch.class, + StructType.class, + (icebergSchema, fileSchema, engineSchema, idToConstant) -> + VectorizedSparkOrcReaders.buildReader(icebergSchema, fileSchema, idToConstant))); } private SparkFormatModels() {} From 173c260195fa8504588c3b84b6f6da2bc040243d Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Fri, 16 Jan 2026 13:17:43 +0100 Subject: [PATCH 08/15] Added Javadoc --- .../iceberg/formats/BaseFormatModel.java | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java index db18cf840c2d..e529e693e01f 100644 --- a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java @@ -21,6 +21,18 @@ import java.util.Map; import org.apache.iceberg.Schema; +/** + * Base implementation of {@link FormatModel} that provides common functionality for format models. + * + *
<p>
This abstract class serves as a foundation for creating format-specific models that handle + * reading and writing data in various file formats. + * + * @param output type used for reading data, and input type for writing data and deletes + * @param the type of the schema for the input/output data + * @param the writer type produced by the writer function + * @param the reader type produced by the reader function + * @param the file schema type used by the underlying file format + */ public abstract class BaseFormatModel implements FormatModel { private final Class type; private final Class schemaType; @@ -28,6 +40,16 @@ public abstract class BaseFormatModel implements FormatModel readerFunction; private final boolean batchReader; + /** + * Constructs a new BaseFormatModel with the specified configuration. + * + * @param type the row type class for the object model implementation processed by this factory. + * @param schemaType the schema type class for the object model implementation processed by this + * factory. + * @param writerFunction the function used to create writers for this format + * @param readerFunction the function used to create readers for this format + * @param batchReader whether this model does batch reading + */ public BaseFormatModel( Class type, Class schemaType, @@ -51,25 +73,73 @@ public Class schemaType() { return schemaType; } + /** + * Returns the writer function used to create writers for this format. + * + * @return the writer function + */ protected WriterFunction writerFunction() { return writerFunction; } + /** + * Returns the reader function used to create readers for this format. + * + * @return the reader function + */ protected ReaderFunction readerFunction() { return readerFunction; } + /** + * Returns whether the generated reader is a batch reader. + * + * @return {@code true} if the reading happens in batches, {@code false} if the reading is + * row-wise + */ protected boolean batchReader() { return batchReader; } + /** + * A functional interface for creating writers that can write data in a specific format. + * + * @param the writer type to be created + * @param the type of the schema for the input data + * @param the file schema type used by the underlying file format + */ @FunctionalInterface public interface WriterFunction { + /** + * Creates a writer for the given schemas. + * + * @param icebergSchema the Iceberg schema defining the table structure + * @param fileSchema the file format specific target schema for the output files + * @param engineSchema the engine specific schema for the input data + * @return a writer configured for the given schemas + */ W write(Schema icebergSchema, F fileSchema, S engineSchema); } + /** + * A functional interface for creating readers that can read data from a specific format. + * + * @param the reader type to be created + * @param the type of the schema for the output data + * @param the file schema type used by the underlying file format + */ @FunctionalInterface public interface ReaderFunction { + /** + * Creates a reader for the given schemas. 
+ * + * @param icebergSchema the Iceberg schema defining the table structure + * @param fileSchema the file format specific source schema for the input files + * @param engineSchema the engine specific schema for the output data + * @param idToConstant a map of field IDs to constant values for partition columns and other + * fields not stored in data files + * @return a reader configured for the given schemas + */ R read(Schema icebergSchema, F fileSchema, S engineSchema, Map idToConstant); } } From a28d237f8576dc5ffefb53696b5b541cd876e0be Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Fri, 16 Jan 2026 13:18:48 +0100 Subject: [PATCH 09/15] Added back the ReadBuilder.outputSchema method --- .../org/apache/iceberg/avro/AvroFormatModel.java | 9 ++++++++- .../org/apache/iceberg/formats/ReadBuilder.java | 3 +++ .../java/org/apache/iceberg/orc/ORCFormatModel.java | 13 +++++++++++-- .../apache/iceberg/parquet/ParquetFormatModel.java | 13 +++++++++++-- 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 386b916f063c..6a3849eb5312 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -171,6 +171,7 @@ public org.apache.iceberg.io.FileAppender build() throws java.io.IOException private static class ReadBuilderWrapper implements ReadBuilder { private final Avro.ReadBuilder internal; private final ReaderFunction, S, Schema> readerFunction; + private S engineSchema; private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( @@ -191,6 +192,12 @@ public ReadBuilder project(org.apache.iceberg.Schema schema) { return this; } + @Override + public ReadBuilder outputSchema(S schema) { + this.engineSchema = schema; + return this; + } + @Override public ReadBuilder caseSensitive(boolean caseSensitive) { // Filtering is not supported in Avro reader, so case sensitivity does not matter @@ -236,7 +243,7 @@ public ReadBuilder withNameMapping(org.apache.iceberg.mapping.NameMapping public CloseableIterable build() { return internal .createResolvingReader( - icebergSchema -> readerFunction.read(icebergSchema, null, null, idToConstant)) + icebergSchema -> readerFunction.read(icebergSchema, null, engineSchema, idToConstant)) .build(); } } diff --git a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java index d023e2d028ab..20116d059c4b 100644 --- a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java @@ -49,6 +49,9 @@ public interface ReadBuilder { /** Set the projection schema. */ ReadBuilder project(Schema schema); + /** Sets the expected output schema. If not provided derived from the {@link #project(Schema)}. */ + ReadBuilder outputSchema(S schema); + /** * Configures whether filtering should be case-sensitive. If the reader supports filtering, it * must respect this setting. The default value is true. 
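To make the intended read flow concrete, here is a usage sketch (not part of the patch) showing how an engine could assemble a reader once format models are registered. It is an illustration only: the FormatModelRegistry.readBuilder(...) lookup and its signature are assumptions, analogous to the dataWriteBuilder(...) call used by RegistryBasedFileWriterFactory later in this series; the chained configuration calls are the ones the ReadBuilder interface in this patch defines.

    // Usage sketch, not part of the patch: reading generic Records through the proposed API.
    // FormatModelRegistry.readBuilder(...) is an assumed lookup method; the builder methods
    // (project, reuseContainers, build, and the outputSchema added in this patch) come from
    // the ReadBuilder interface above.
    import org.apache.iceberg.FileFormat;
    import org.apache.iceberg.Schema;
    import org.apache.iceberg.data.Record;
    import org.apache.iceberg.formats.FormatModelRegistry;
    import org.apache.iceberg.io.CloseableIterable;
    import org.apache.iceberg.io.InputFile;

    public class ReadPathSketch {
      static CloseableIterable<Record> readParquet(InputFile file, Schema projection) {
        return FormatModelRegistry.readBuilder(FileFormat.PARQUET, Record.class, file)
            .project(projection)   // required: the Iceberg projection schema
            // .outputSchema(...)  // optional engine-specific projection, added in this patch
            .reuseContainers()     // optional: reuse row containers while iterating
            .build();
      }

      private ReadPathSketch() {}
    }

The same builder would be obtained for Avro or ORC by passing a different FileFormat; which reader function is invoked is decided by the registered format model for the requested row type.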
diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index 73036b04a37c..68db2baa177b 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -186,6 +186,7 @@ private static class ReadBuilderWrapper implements ReadBuilder { private final ORC.ReadBuilder internal; private final ReaderFunction readerFunction; private final boolean batchReader; + private S engineSchema; private boolean reuseContainers = false; private Schema icebergSchema; private Map idToConstant = ImmutableMap.of(); @@ -212,6 +213,12 @@ public ReadBuilder project(Schema schema) { return this; } + @Override + public ReadBuilder outputSchema(S schema) { + this.engineSchema = schema; + return this; + } + @Override public ReadBuilder caseSensitive(boolean caseSensitive) { internal.caseSensitive(caseSensitive); @@ -263,13 +270,15 @@ public org.apache.iceberg.io.CloseableIterable build() { .createBatchedReaderFunc( typeDescription -> (OrcBatchReader) - readerFunction.read(icebergSchema, typeDescription, null, idToConstant)) + readerFunction.read( + icebergSchema, typeDescription, engineSchema, idToConstant)) .build() : internal .createReaderFunc( typeDescription -> (OrcRowReader) - readerFunction.read(icebergSchema, typeDescription, null, idToConstant)) + readerFunction.read( + icebergSchema, typeDescription, engineSchema, idToConstant)) .build(); } } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 62efabe11669..188d35e58b8a 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -199,6 +199,7 @@ private static class ReadBuilderWrapper implements ReadBuilder { private final Parquet.ReadBuilder internal; private final ReaderFunction readerFunction; private final boolean batchReader; + private S engineSchema; private Map idToConstant = ImmutableMap.of(); private ReadBuilderWrapper( @@ -222,6 +223,12 @@ public ReadBuilder project(Schema schema) { return this; } + @Override + public ReadBuilder outputSchema(S schema) { + this.engineSchema = schema; + return this; + } + @Override public ReadBuilder caseSensitive(boolean caseSensitive) { internal.caseSensitive(caseSensitive); @@ -271,13 +278,15 @@ public CloseableIterable build() { .createBatchedReaderFunc( (icebergSchema, messageType) -> (VectorizedReader) - readerFunction.read(icebergSchema, messageType, null, idToConstant)) + readerFunction.read( + icebergSchema, messageType, engineSchema, idToConstant)) .build() : internal .createReaderFunc( (icebergSchema, messageType) -> (ParquetValueReader) - readerFunction.read(icebergSchema, messageType, null, idToConstant)) + readerFunction.read( + icebergSchema, messageType, engineSchema, idToConstant)) .build(); } } From ae512b89539665622fd09a30ce9daaa665e47766 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Sun, 18 Jan 2026 08:49:59 +0100 Subject: [PATCH 10/15] Move the batchReader attribute to the specific format models --- .../apache/iceberg/avro/AvroFormatModel.java | 4 ++-- .../apache/iceberg/formats/BaseFormatModel.java | 16 +--------------- .../org/apache/iceberg/orc/ORCFormatModel.java | 17 ++++++++++------- .../iceberg/parquet/ParquetFormatModel.java | 11 +++++++---- 4 files changed, 20 insertions(+), 28 deletions(-) diff --git 
a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 6a3849eb5312..cbaabd8b4b74 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -40,7 +40,7 @@ public class AvroFormatModel extends BaseFormatModel, DatumReader, Schema> { public AvroFormatModel(Class type) { - super(type, null, null, null, false); + super(type, null, null, null); } public AvroFormatModel( @@ -48,7 +48,7 @@ public AvroFormatModel( Class schemaType, WriterFunction, S, Schema> writerFunction, ReaderFunction, S, Schema> readerFunction) { - super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); + super(type, schemaType, writerFunction, readerFunction); } @Override diff --git a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java index e529e693e01f..083f447ffcd0 100644 --- a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java @@ -38,7 +38,6 @@ public abstract class BaseFormatModel implements FormatModel schemaType; private final WriterFunction writerFunction; private final ReaderFunction readerFunction; - private final boolean batchReader; /** * Constructs a new BaseFormatModel with the specified configuration. @@ -48,19 +47,16 @@ public abstract class BaseFormatModel implements FormatModel type, Class schemaType, WriterFunction writerFunction, - ReaderFunction readerFunction, - boolean batchReader) { + ReaderFunction readerFunction) { this.type = type; this.schemaType = schemaType; this.writerFunction = writerFunction; this.readerFunction = readerFunction; - this.batchReader = batchReader; } @Override @@ -91,16 +87,6 @@ protected ReaderFunction readerFunction() { return readerFunction; } - /** - * Returns whether the generated reader is a batch reader. - * - * @return {@code true} if the reading happens in batches, {@code false} if the reading is - * row-wise - */ - protected boolean batchReader() { - return batchReader; - } - /** * A functional interface for creating writers that can write data in a specific format. 
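For comparison with the functional interfaces above, the sketch below shows the shape of an engine-side registration. MyRow, MyRowType, MyParquetWriters and MyParquetReaders are hypothetical placeholders; only the registry call, the constructor argument order, and the lambda arities mirror GenericFormatModels and FlinkFormatModels from earlier in this series.

    // Hypothetical registration sketch; the placeholder types and helpers do not exist in Iceberg.
    import org.apache.iceberg.formats.FormatModelRegistry;
    import org.apache.iceberg.parquet.ParquetFormatModel;

    public class MyFormatModels {
      public static void register() {
        FormatModelRegistry.register(
            new ParquetFormatModel<>(
                MyRow.class,      // engine row type (placeholder)
                MyRowType.class,  // engine schema type (placeholder)
                (icebergSchema, fileSchema, engineSchema) ->
                    MyParquetWriters.buildWriter(engineSchema, fileSchema),   // hypothetical writer factory
                (icebergSchema, fileSchema, engineSchema, idToConstant) ->
                    MyParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); // hypothetical reader factory
      }

      private MyFormatModels() {}
    }

A later patch in this series ("Add creation methods") replaces these public constructors with create(...) and forDelete() factory methods, but the writer and reader lambda shapes stay the same.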
* diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index 68db2baa177b..b735d3b23701 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -41,24 +41,27 @@ public class ORCFormatModel extends BaseFormatModel, R, TypeDescription> { + private final boolean batchReader; + + public ORCFormatModel(Class type) { + this(type, null, null, null); + } public ORCFormatModel( Class type, Class schemaType, WriterFunction, S, TypeDescription> writerFunction, ReaderFunction readerFunction) { - super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); + super(type, schemaType, writerFunction, readerFunction); + this.batchReader = false; } public ORCFormatModel( Class type, Class schemaType, ReaderFunction batchReaderFunction) { - super(type, schemaType, null, batchReaderFunction, true /* batchReader */); - } - - public ORCFormatModel(Class type) { - super(type, null, null, null, false /* batchReader */); + super(type, schemaType, null, batchReaderFunction); + this.batchReader = true; } @Override @@ -73,7 +76,7 @@ public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { @Override public ReadBuilder readBuilder(InputFile inputFile) { - return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader()); + return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader); } private static class WriteBuilderWrapper implements WriteBuilder { diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 188d35e58b8a..767f600a2743 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -44,9 +44,10 @@ public class ParquetFormatModel extends BaseFormatModel, R, MessageType> { public static final String WRITER_VERSION_KEY = "parquet.writer.version"; + private final boolean batchReader; public ParquetFormatModel(Class type) { - super(type, null, null, null, false /* batchReader */); + this(type, null, null, null); } public ParquetFormatModel( @@ -54,14 +55,16 @@ public ParquetFormatModel( Class schemaType, WriterFunction, S, MessageType> writerFunction, ReaderFunction readerFunction) { - super(type, schemaType, writerFunction, readerFunction, false /* batchReader */); + super(type, schemaType, writerFunction, readerFunction); + this.batchReader = false; } public ParquetFormatModel( Class type, Class schemaType, ReaderFunction batchReaderFunction) { - super(type, schemaType, null, batchReaderFunction, true /* batchReader */); + super(type, schemaType, null, batchReaderFunction); + this.batchReader = true; } @Override @@ -76,7 +79,7 @@ public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { @Override public ReadBuilder readBuilder(InputFile inputFile) { - return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader()); + return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader); } private static class WriteBuilderWrapper implements WriteBuilder { From 8be711942b18058e1ccddc8dab1171097348707a Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Tue, 20 Jan 2026 09:51:37 +0100 Subject: [PATCH 11/15] Add creation methods --- .../arrow/vectorized/ArrowFormatModels.java | 2 +- .../apache/iceberg/avro/AvroFormatModel.java | 15 ++++++++-- 
.../iceberg/formats/BaseFormatModel.java | 2 +- .../iceberg/data/GenericFormatModels.java | 13 ++++----- .../iceberg/flink/data/FlinkFormatModels.java | 6 ++-- .../apache/iceberg/orc/ORCFormatModel.java | 29 ++++++++++++------- .../iceberg/parquet/ParquetFormatModel.java | 29 ++++++++++++------- .../spark/source/SparkFormatModels.java | 12 ++++---- 8 files changed, 67 insertions(+), 41 deletions(-) diff --git a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java index 8e814a5bddf5..d70e12be7817 100644 --- a/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java +++ b/arrow/src/main/java/org/apache/iceberg/arrow/vectorized/ArrowFormatModels.java @@ -25,7 +25,7 @@ public class ArrowFormatModels { public static void register() { FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( ColumnarBatch.class, Object.class, (schema, fileSchema, engineSchema, idToConstant) -> diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index cbaabd8b4b74..67c5e213aa8b 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -26,6 +26,7 @@ import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; @@ -39,11 +40,19 @@ public class AvroFormatModel extends BaseFormatModel, DatumReader, Schema> { - public AvroFormatModel(Class type) { - super(type, null, null, null); + public static AvroFormatModel forDelete() { + return new AvroFormatModel<>(PositionDelete.class, null, null, null); } - public AvroFormatModel( + public static AvroFormatModel create( + Class type, + Class schemaType, + WriterFunction, S, Schema> writerFunction, + ReaderFunction, S, Schema> readerFunction) { + return new AvroFormatModel<>(type, schemaType, writerFunction, readerFunction); + } + + private AvroFormatModel( Class type, Class schemaType, WriterFunction, S, Schema> writerFunction, diff --git a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java index 083f447ffcd0..1dd60d164f7d 100644 --- a/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/formats/BaseFormatModel.java @@ -48,7 +48,7 @@ public abstract class BaseFormatModel implements FormatModel type, Class schemaType, WriterFunction writerFunction, diff --git a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java index 120a5d50fac4..59782cca625a 100644 --- a/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java +++ b/data/src/main/java/org/apache/iceberg/data/GenericFormatModels.java @@ -26,7 +26,6 @@ import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.data.parquet.GenericParquetReaders; import org.apache.iceberg.data.parquet.GenericParquetWriter; -import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.formats.FormatModelRegistry; import org.apache.iceberg.orc.ORCFormatModel; import 
org.apache.iceberg.parquet.ParquetFormatModel; @@ -34,7 +33,7 @@ public class GenericFormatModels { public static void register() { FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( Record.class, Schema.class, (icebergSchema, fileSchema, engineSchema) -> @@ -42,20 +41,20 @@ public static void register() { (icebergSchema, fileSchema, engineSchema, idToConstant) -> GenericParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); - FormatModelRegistry.register(new ParquetFormatModel<>(PositionDelete.class)); + FormatModelRegistry.register(ParquetFormatModel.forDelete()); FormatModelRegistry.register( - new AvroFormatModel<>( + AvroFormatModel.create( Record.class, Schema.class, (icebergSchema, fileSchema, engineSchema) -> DataWriter.create(fileSchema), (icebergSchema, fileSchema, engineSchema, idToConstant) -> PlannedDataReader.create(icebergSchema, idToConstant))); - FormatModelRegistry.register(new AvroFormatModel<>(PositionDelete.class)); + FormatModelRegistry.register(AvroFormatModel.forDelete()); FormatModelRegistry.register( - new ORCFormatModel<>( + ORCFormatModel.create( Record.class, Schema.class, (icebergSchema, fileSchema, engineSchema) -> @@ -63,7 +62,7 @@ public static void register() { (icebergSchema, fileSchema, engineSchema, idToConstant) -> GenericOrcReader.buildReader(icebergSchema, fileSchema, idToConstant))); - FormatModelRegistry.register(new ORCFormatModel<>(PositionDelete.class)); + FormatModelRegistry.register(ORCFormatModel.forDelete()); } private GenericFormatModels() {} diff --git a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java index a35cb98c7250..0026c8a3021d 100644 --- a/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java +++ b/flink/v2.1/flink/src/main/java/org/apache/iceberg/flink/data/FlinkFormatModels.java @@ -28,7 +28,7 @@ public class FlinkFormatModels { public static void register() { FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( RowData.class, RowType.class, (icebergSchema, fileSchema, engineSchema) -> @@ -37,7 +37,7 @@ public static void register() { FlinkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( - new AvroFormatModel<>( + AvroFormatModel.create( RowData.class, RowType.class, (icebergSchema, fileSchema, engineSchema) -> new FlinkAvroWriter(engineSchema), @@ -45,7 +45,7 @@ public static void register() { FlinkPlannedAvroReader.create(icebergSchema, idToConstant))); FormatModelRegistry.register( - new ORCFormatModel<>( + ORCFormatModel.create( RowData.class, RowType.class, (icebergSchema, fileSchema, engineSchema) -> diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index b735d3b23701..15e3c746dd9c 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -27,6 +27,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.data.orc.GenericOrcWriter; import org.apache.iceberg.data.orc.GenericOrcWriters; +import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; @@ -43,25 +44,33 @@ public class ORCFormatModel extends BaseFormatModel, 
R, TypeDescription> { private final boolean batchReader; - public ORCFormatModel(Class type) { - this(type, null, null, null); + public static ORCFormatModel forDelete() { + return new ORCFormatModel<>(PositionDelete.class, null, null, null, false); } - public ORCFormatModel( + public static ORCFormatModel> create( Class type, Class schemaType, WriterFunction, S, TypeDescription> writerFunction, - ReaderFunction readerFunction) { - super(type, schemaType, writerFunction, readerFunction); - this.batchReader = false; + ReaderFunction, S, TypeDescription> readerFunction) { + return new ORCFormatModel<>(type, schemaType, writerFunction, readerFunction, false); } - public ORCFormatModel( + public static ORCFormatModel> create( Class type, Class schemaType, - ReaderFunction batchReaderFunction) { - super(type, schemaType, null, batchReaderFunction); - this.batchReader = true; + ReaderFunction, S, TypeDescription> batchReaderFunction) { + return new ORCFormatModel<>(type, schemaType, null, batchReaderFunction, true); + } + + private ORCFormatModel( + Class type, + Class schemaType, + WriterFunction, S, TypeDescription> writerFunction, + ReaderFunction readerFunction, + boolean batchReader) { + super(type, schemaType, writerFunction, readerFunction); + this.batchReader = batchReader; } @Override diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 767f600a2743..faee59b4ef32 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -27,6 +27,7 @@ import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.Schema; import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; @@ -46,25 +47,33 @@ public class ParquetFormatModel public static final String WRITER_VERSION_KEY = "parquet.writer.version"; private final boolean batchReader; - public ParquetFormatModel(Class type) { - this(type, null, null, null); + public static ParquetFormatModel forDelete() { + return new ParquetFormatModel<>(PositionDelete.class, null, null, null, false); } - public ParquetFormatModel( + public static ParquetFormatModel> create( Class type, Class schemaType, WriterFunction, S, MessageType> writerFunction, - ReaderFunction readerFunction) { - super(type, schemaType, writerFunction, readerFunction); - this.batchReader = false; + ReaderFunction, S, MessageType> readerFunction) { + return new ParquetFormatModel<>(type, schemaType, writerFunction, readerFunction, false); } - public ParquetFormatModel( + public static ParquetFormatModel> create( Class type, Class schemaType, - ReaderFunction batchReaderFunction) { - super(type, schemaType, null, batchReaderFunction); - this.batchReader = true; + ReaderFunction, S, MessageType> batchReaderFunction) { + return new ParquetFormatModel<>(type, schemaType, null, batchReaderFunction, true); + } + + private ParquetFormatModel( + Class type, + Class schemaType, + WriterFunction, S, MessageType> writerFunction, + ReaderFunction readerFunction, + boolean batchReader) { + super(type, schemaType, writerFunction, readerFunction); + this.batchReader = batchReader; } @Override diff --git 
a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java index 17ac7e0d81f5..18390971e4d3 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/source/SparkFormatModels.java @@ -37,7 +37,7 @@ public class SparkFormatModels { public static void register() { FormatModelRegistry.register( - new AvroFormatModel<>( + AvroFormatModel.create( InternalRow.class, StructType.class, (icebergSchema, fileSchema, engineSchema) -> new SparkAvroWriter(engineSchema), @@ -45,7 +45,7 @@ public static void register() { SparkPlannedAvroReader.create(icebergSchema, idToConstant))); FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( InternalRow.class, StructType.class, (icebergSchema, fileSchema, engineSchema) -> @@ -54,7 +54,7 @@ public static void register() { SparkParquetReaders.buildReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( ColumnarBatch.class, StructType.class, (icebergSchema, fileSchema, engineSchema, idToConstant) -> @@ -62,7 +62,7 @@ public static void register() { icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( - new ParquetFormatModel<>( + ParquetFormatModel.create( VectorizedSparkParquetReaders.CometColumnarBatch.class, StructType.class, (icebergSchema, fileSchema, engineSchema, idToConstant) -> @@ -70,7 +70,7 @@ public static void register() { icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( - new ORCFormatModel<>( + ORCFormatModel.create( InternalRow.class, StructType.class, (icebergSchema, fileSchema, engineSchema) -> @@ -79,7 +79,7 @@ public static void register() { new SparkOrcReader(icebergSchema, fileSchema, idToConstant))); FormatModelRegistry.register( - new ORCFormatModel<>( + ORCFormatModel.create( ColumnarBatch.class, StructType.class, (icebergSchema, fileSchema, engineSchema, idToConstant) -> From c66e90f98c995f8adc7351ec92897eb29e55be10 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 22 Jan 2026 22:38:48 +0100 Subject: [PATCH 12/15] Ryan's comments --- .../apache/iceberg/avro/AvroFormatModel.java | 9 ++++++--- .../formats/CommonWriteBuilderImpl.java | 18 +++++++++--------- .../iceberg/formats/DataWriteBuilder.java | 14 +++++++++++--- .../formats/EqualityDeleteWriteBuilder.java | 16 +++++++++------- .../apache/iceberg/formats/ReadBuilder.java | 17 ++++++++++++++--- .../apache/iceberg/formats/WriteBuilder.java | 14 +++++++++++--- .../data/RegistryBasedFileWriterFactory.java | 6 +++--- .../org/apache/iceberg/orc/ORCFormatModel.java | 4 ++-- .../org/apache/iceberg/parquet/Parquet.java | 15 +++++++-------- .../iceberg/parquet/ParquetFormatModel.java | 4 ++-- 10 files changed, 74 insertions(+), 43 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 67c5e213aa8b..15438e6f21dc 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -18,6 +18,7 @@ */ package org.apache.iceberg.avro; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.Map; import org.apache.avro.Schema; @@ -94,7 +95,7 @@ public WriteBuilder schema(org.apache.iceberg.Schema schema) { } @Override - public 
WriteBuilder inputSchema(S schema) { + public WriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @@ -152,7 +153,7 @@ public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { } @Override - public org.apache.iceberg.io.FileAppender build() throws java.io.IOException { + public org.apache.iceberg.io.FileAppender build() throws IOException { switch (content) { case DATA: internal.createContextFunc(Avro.WriteBuilder.Context::dataContext); @@ -202,7 +203,7 @@ public ReadBuilder project(org.apache.iceberg.Schema schema) { } @Override - public ReadBuilder outputSchema(S schema) { + public ReadBuilder engineProjection(S schema) { this.engineSchema = schema; return this; } @@ -210,12 +211,14 @@ public ReadBuilder outputSchema(S schema) { @Override public ReadBuilder caseSensitive(boolean caseSensitive) { // Filtering is not supported in Avro reader, so case sensitivity does not matter + // This is not an error since filtering is best-effort. return this; } @Override public ReadBuilder filter(Expression filter) { // Filtering is not supported in Avro reader + // This is not an error since filtering is best-effort. return this; } diff --git a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java index 9e5d9b1605cb..2124f76f56e9 100644 --- a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java +++ b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java @@ -172,8 +172,8 @@ public DataFileWriteBuilder schema(Schema schema) { } @Override - public DataFileWriteBuilder inputSchema(S schema) { - super.writeBuilder.inputSchema(schema); + public DataFileWriteBuilder engineSchema(S schema) { + super.writeBuilder.engineSchema(schema); return this; } @@ -203,7 +203,7 @@ public DataWriter build() throws IOException { private static class EqualityDeleteFileWriteBuilder extends CommonWriteBuilderImpl, D, S> implements EqualityDeleteWriteBuilder { - private Schema rowSchema = null; + private Schema schema = null; private int[] equalityFieldIds = null; private EqualityDeleteFileWriteBuilder( @@ -212,8 +212,8 @@ private EqualityDeleteFileWriteBuilder( } @Override - public EqualityDeleteFileWriteBuilder inputSchema(S schema) { - super.writeBuilder.inputSchema(schema); + public EqualityDeleteFileWriteBuilder engineSchema(S newSchema) { + super.writeBuilder.engineSchema(newSchema); return this; } @@ -223,8 +223,8 @@ public EqualityDeleteFileWriteBuilder self() { } @Override - public EqualityDeleteFileWriteBuilder rowSchema(Schema schema) { - this.rowSchema = schema; + public EqualityDeleteFileWriteBuilder schema(Schema newSchema) { + this.schema = newSchema; return this; } @@ -237,7 +237,7 @@ public EqualityDeleteFileWriteBuilder equalityFieldIds(int... 
fieldIds) { @Override public EqualityDeleteWriter build() throws IOException { Preconditions.checkState( - rowSchema != null, "Cannot create equality delete file without a schema"); + schema != null, "Cannot create equality delete file without a schema"); Preconditions.checkState( equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); Preconditions.checkArgument( @@ -248,7 +248,7 @@ public EqualityDeleteWriter build() throws IOException { return new EqualityDeleteWriter<>( super.writeBuilder - .schema(rowSchema) + .schema(schema) .meta("delete-type", "equality") .meta( "delete-field-ids", diff --git a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java index d81734794874..7aa6ea5462c9 100644 --- a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java @@ -40,10 +40,18 @@ public interface DataWriteBuilder extends CommonWriteBuilder schema(Schema schema); /** - * Sets the input schema accepted by the writer. If not provided derived from the {@link - * #schema(Schema)}. + * Sets the engine's representation accepted by the writer. + * + *
<p>
When provided, this schema should be consistent with the provided Iceberg schema, while + * allowing representation differences that Iceberg considers equivalent. Examples include: + * + *

<ul>
+ *   <li>representing an Iceberg {@code int} column using {@code smallint},
+ *   <li>supplying a shredded representation for a variant type, or
+ *   <li>selecting specific concrete classes for Iceberg structs.
+ * </ul>
*/ - DataWriteBuilder inputSchema(S schema); + DataWriteBuilder engineSchema(S schema); /** * Creates a data file writer configured with the current builder settings. diff --git a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java index 25b38da22159..57ee3817d95b 100644 --- a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java @@ -41,14 +41,16 @@ public interface EqualityDeleteWriteBuilder extends CommonWriteBuilder> { + /** Sets the row schema for the delete writers. */ + EqualityDeleteWriteBuilder schema(Schema schema); + /** - * Sets the input schema accepted by the writer. If not provided derived from the {@link - * #rowSchema(Schema)}. + * Sets the engine's representation accepted by the writer. + * + *
<p>
When provided, this schema should be consistent with the provided Iceberg schema, while + * allowing representation differences that Iceberg considers equivalent. */ - EqualityDeleteWriteBuilder inputSchema(S schema); - - /** Sets the row schema for the delete writers. */ - EqualityDeleteWriteBuilder rowSchema(Schema rowSchema); + EqualityDeleteWriteBuilder engineSchema(S schema); /** Sets the equality field ids for the equality delete writer. */ default EqualityDeleteWriteBuilder equalityFieldIds(List fieldIds) { @@ -65,7 +67,7 @@ default EqualityDeleteWriteBuilder equalityFieldIds(List fieldIds * based on field equality, generating proper {@link DeleteFile} metadata on completion. * *
<p>
The writer accepts input records exactly matching the input schema specified via {@link - * #rowSchema(Schema)} for deletion. + * #schema(Schema)} for deletion. * * @return a fully configured {@link EqualityDeleteWriter} instance * @throws IOException if the writer cannot be created due to I/O errors diff --git a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java index 20116d059c4b..d25009b5856e 100644 --- a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java @@ -46,11 +46,22 @@ public interface ReadBuilder { */ ReadBuilder split(long start, long length); - /** Set the projection schema. */ + /** Set the projection schema. This must be set before the reader is instantiated. */ ReadBuilder project(Schema schema); - /** Sets the expected output schema. If not provided derived from the {@link #project(Schema)}. */ - ReadBuilder outputSchema(S schema); + /** + * Sets the engine's representation of the projected schema. + * + *
<p>
When provided, this schema should be consistent with the requested Iceberg projection, while + * allowing representation differences that Iceberg considers equivalent. Examples include: + * + *

<ul>
+ *   <li>using a {@code long} to represent an Iceberg {@code int} column,
+ *   <li>requesting a shredded representation for a variant type, or
+ *   <li>selecting specific concrete classes for Iceberg structs.
+ * </ul>
+ */ + ReadBuilder engineProjection(S schema); /** * Configures whether filtering should be case-sensitive. If the reader supports filtering, it diff --git a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java index f1fee495e3da..0e9a40cfdb38 100644 --- a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java @@ -44,10 +44,18 @@ public interface WriteBuilder { WriteBuilder schema(Schema schema); /** - * Sets the input schema accepted by the writer. If not provided derived from the {@link - * #schema(Schema)}. + * Sets the engine's representation accepted by the writer. + * + *

When provided, this schema should be consistent with the provided Iceberg schema, while + * allowing representation differences that Iceberg considers equivalent. Examples include: + * + *

 + * <ul>
 + *   <li>representing an Iceberg {@code int} column using {@code smallint},
 + *   <li>supplying a shredded representation for a variant type, or
 + *   <li>selecting specific concrete classes for Iceberg structs.
 + * </ul>
*/ - WriteBuilder inputSchema(S schema); + WriteBuilder engineSchema(S schema); /** * Set a writer configuration property which affects the writer behavior. Writer builders should diff --git a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java index 57ceb2f4c565..8fb841f2fc01 100644 --- a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java @@ -111,7 +111,7 @@ public DataWriter newDataWriter( FormatModelRegistry.dataWriteBuilder(dataFileFormat, inputType, file); return builder .schema(dataSchema) - .inputSchema(inputSchema()) + .engineSchema(inputSchema()) .setAll(properties) .setAll(writerProperties) .metricsConfig(metricsConfig) @@ -143,8 +143,8 @@ public EqualityDeleteWriter newEqualityDeleteWriter( .setAll(properties) .setAll(writerProperties) .metricsConfig(metricsConfig) - .rowSchema(equalityDeleteRowSchema) - .inputSchema(equalityDeleteInputSchema()) + .schema(equalityDeleteRowSchema) + .engineSchema(equalityDeleteInputSchema()) .equalityFieldIds(equalityFieldIds) .spec(spec) .partition(partition) diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index 15e3c746dd9c..50e977e6f19d 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -109,7 +109,7 @@ public WriteBuilder schema(Schema schema) { } @Override - public WriteBuilder inputSchema(S schema) { + public WriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @@ -226,7 +226,7 @@ public ReadBuilder project(Schema schema) { } @Override - public ReadBuilder outputSchema(S schema) { + public ReadBuilder engineProjection(S schema) { this.engineSchema = schema; return this; } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java index 44f0459da531..a738f33da00d 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/Parquet.java @@ -1321,8 +1321,7 @@ public ReadBuilder createReaderFunc( return this; } - public ReadBuilder createBatchedReaderFunc( - Function> newReaderFunction) { + public ReadBuilder createBatchedReaderFunc(Function> func) { Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set batched reader function: batched reader function already set"); @@ -1332,12 +1331,12 @@ public ReadBuilder createBatchedReaderFunc( Preconditions.checkArgument( this.readerFunction == null, "Cannot set batched reader function: ReaderFunction already set"); - this.batchedReaderFunc = newReaderFunction; + this.batchedReaderFunc = func; return this; } public ReadBuilder createBatchedReaderFunc( - BiFunction> newReaderFunction) { + BiFunction> func) { Preconditions.checkArgument( this.batchedReaderFunc == null, "Cannot set batched reader function: batched reader function already set"); @@ -1347,7 +1346,7 @@ public ReadBuilder createBatchedReaderFunc( Preconditions.checkArgument( this.readerFunction == null, "Cannot set batched reader function: ReaderFunction already set"); - this.batchedReaderFuncWithSchema = newReaderFunction; + this.batchedReaderFuncWithSchema = func; return this; } @@ -1471,16 +1470,16 @@ public CloseableIterable build() { mapping = 
NameMapping.empty(); } - Function> batchedReaderBuilder = + Function> batchedFunc = batchedReaderFuncWithSchema != null ? messageType -> batchedReaderFuncWithSchema.apply(schema, messageType) : batchedReaderFunc; - if (batchedReaderBuilder != null) { + if (batchedFunc != null) { return new VectorizedParquetReader<>( file, schema, options, - batchedReaderBuilder, + batchedFunc, mapping, filter, reuseContainers, diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index faee59b4ef32..0f176d535050 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -111,7 +111,7 @@ public WriteBuilder schema(Schema schema) { } @Override - public WriteBuilder inputSchema(S schema) { + public WriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @@ -236,7 +236,7 @@ public ReadBuilder project(Schema schema) { } @Override - public ReadBuilder outputSchema(S schema) { + public ReadBuilder engineProjection(S schema) { this.engineSchema = schema; return this; } From b01586b3813e3f8e57c511310197e615fe9b277c Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Fri, 23 Jan 2026 14:09:59 +0100 Subject: [PATCH 13/15] Javadoc reword --- .../apache/iceberg/formats/DataWriteBuilder.java | 13 ++++++------- .../iceberg/formats/EqualityDeleteWriteBuilder.java | 4 ++-- .../org/apache/iceberg/formats/ReadBuilder.java | 2 +- .../org/apache/iceberg/formats/WriteBuilder.java | 13 ++++++------- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java index 7aa6ea5462c9..d9f581eff263 100644 --- a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java @@ -42,14 +42,13 @@ public interface DataWriteBuilder extends CommonWriteBuilderWhen provided, this schema should be consistent with the provided Iceberg schema, while - * allowing representation differences that Iceberg considers equivalent. Examples include: + *

Some data types require additional type information from the engine schema that cannot be + * fully expressed by the Iceberg schema or the data itself. For example, a variant type may use a + * shredded representation that relies on engine-specific metadata to map back to the Iceberg + * schema. * - *

 - * <ul>
 - *   <li>representing an Iceberg {@code int} column using {@code smallint},
 - *   <li>supplying a shredded representation for a variant type, or
 - *   <li>selecting specific concrete classes for Iceberg structs.
 - * </ul>
+ *

The engine schema must be aligned with the Iceberg schema, but may include representation + * details that Iceberg considers equivalent. */ DataWriteBuilder engineSchema(S schema); diff --git a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java index 57ee3817d95b..85ff29663d1d 100644 --- a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java @@ -47,8 +47,8 @@ public interface EqualityDeleteWriteBuilder /** * Sets the engine's representation accepted by the writer. * - *
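A usage sketch (not part of the patch) of the builder as it stands in this commit: an equality delete writer is obtained from the registry and configured the same way RegistryBasedFileWriterFactory does above. Record is the generic model, the engine schema type is assumed to be the Iceberg Schema, and outputFile, deleteRowSchema, equalityFieldIds, spec and partition are placeholders.

    EqualityDeleteWriteBuilder<Record, Schema> builder =
        FormatModelRegistry.equalityDeleteWriteBuilder(FileFormat.PARQUET, Record.class, outputFile);
    EqualityDeleteWriter<Record> deleteWriter =
        builder
            .schema(deleteRowSchema)        // Iceberg schema of the delete rows
            .engineSchema(deleteRowSchema)  // engine representation; here the Iceberg schema itself
            .equalityFieldIds(equalityFieldIds)
            .spec(spec)
            .partition(partition)
            .build();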

When provided, this schema should be consistent with the provided Iceberg schema, while - * allowing representation differences that Iceberg considers equivalent. + *

The engine schema must be aligned with the Iceberg schema, but may include representation + * details that Iceberg considers equivalent. */ EqualityDeleteWriteBuilder engineSchema(S schema); diff --git a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java index d25009b5856e..2809750970a7 100644 --- a/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/ReadBuilder.java @@ -53,7 +53,7 @@ public interface ReadBuilder { * Sets the engine's representation of the projected schema. * *

When provided, this schema should be consistent with the requested Iceberg projection, while - * allowing representation differences that Iceberg considers equivalent. Examples include: + * allowing representation differences. Examples include: * *

    *
  • using a {@code long} to represent an Iceberg {@code int} column, diff --git a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java index 0e9a40cfdb38..b3f2c43bb7c3 100644 --- a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java @@ -46,14 +46,13 @@ public interface WriteBuilder { /** * Sets the engine's representation accepted by the writer. * - *

    When provided, this schema should be consistent with the provided Iceberg schema, while - * allowing representation differences that Iceberg considers equivalent. Examples include: + *

    Some data types require additional type information from the engine schema that cannot be + * fully expressed by the Iceberg schema or the data itself. For example, a variant type may use a + * shredded representation that relies on engine-specific metadata to map back to the Iceberg + * schema. * - *

 - * <ul>
 - *   <li>representing an Iceberg {@code int} column using {@code smallint},
 - *   <li>supplying a shredded representation for a variant type, or
 - *   <li>selecting specific concrete classes for Iceberg structs.
 - * </ul>
    + *

    The engine schema must be aligned with the Iceberg schema, but may include representation + * details that Iceberg considers equivalent. */ WriteBuilder engineSchema(S schema); From 1489bd94e5ab73a1d774568e1494cd72fb127dd5 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Thu, 29 Jan 2026 11:35:29 +0100 Subject: [PATCH 14/15] Merge DataWriteBuilder, EqualityDeleteWriteBuilder, PositionDeleteWriteBuilder interfaces --- .../formats/CommonWriteBuilderImpl.java | 337 ------------------ .../iceberg/formats/DataWriteBuilder.java | 66 ---- .../formats/EqualityDeleteWriteBuilder.java | 76 ---- ...iteBuilder.java => FileWriterBuilder.java} | 54 ++- .../iceberg/formats/FormatModelRegistry.java | 294 +++++++++++++-- .../formats/PositionDeleteWriteBuilder.java | 47 --- .../data/RegistryBasedFileWriterFactory.java | 14 +- .../actions/RewriteTablePathSparkAction.java | 4 +- 8 files changed, 312 insertions(+), 580 deletions(-) delete mode 100644 core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java delete mode 100644 core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java delete mode 100644 core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java rename core/src/main/java/org/apache/iceberg/formats/{CommonWriteBuilder.java => FileWriterBuilder.java} (66%) delete mode 100644 core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java diff --git a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java b/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java deleted file mode 100644 index 2124f76f56e9..000000000000 --- a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilderImpl.java +++ /dev/null @@ -1,337 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -package org.apache.iceberg.formats; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.Objects; -import java.util.stream.Collectors; -import java.util.stream.IntStream; -import org.apache.iceberg.FileContent; -import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; -import org.apache.iceberg.encryption.EncryptionKeyMetadata; -import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; - -/** - * An internal implementation that handles all {@link CommonWriteBuilder} interface variants. - * - *

    This unified implementation serves as a backend for multiple specialized content writers: - * - *

 - * <ul>
 - *   <li>{@link DataWriteBuilder} for creating data files
 - *   <li>{@link EqualityDeleteWriteBuilder} for creating equality delete files
 - *   <li>{@link PositionDeleteWriteBuilder} for creating position delete files
 - * </ul>
 - *

    The implementation delegates to a format-specific {@link WriteBuilder} while enriching it with - * content-specific functionality. When building a writer, the implementation configures the - * underlying builder and calls its {@link WriteBuilder#build()} method to create the appropriate - * specialized writer for the requested content type. - * - * @param the concrete builder type for method chaining - * @param the type of data records the writer will accept - * @param the type of the schema for the input data - */ -abstract class CommonWriteBuilderImpl, D, S> - implements CommonWriteBuilder { - private final WriteBuilder writeBuilder; - private final String location; - private final FileFormat format; - private PartitionSpec spec = null; - private StructLike partition = null; - private EncryptionKeyMetadata keyMetadata = null; - private SortOrder sortOrder = null; - - static DataWriteBuilder forDataFile( - WriteBuilder writeBuilder, String location, FileFormat format) { - return new DataFileWriteBuilder<>(writeBuilder.content(FileContent.DATA), location, format); - } - - static EqualityDeleteWriteBuilder forEqualityDelete( - WriteBuilder writeBuilder, String location, FileFormat format) { - return new EqualityDeleteFileWriteBuilder<>( - writeBuilder.content(FileContent.EQUALITY_DELETES), location, format); - } - - @SuppressWarnings({"unchecked", "rawtypes"}) - static PositionDeleteWriteBuilder forPositionDelete( - WriteBuilder writeBuilder, String location, FileFormat format) { - return new PositionDeleteFileWriteBuilder( - (WriteBuilder) writeBuilder.content(FileContent.POSITION_DELETES), - location, - format); - } - - private CommonWriteBuilderImpl( - WriteBuilder writeBuilder, String location, FileFormat format) { - this.writeBuilder = writeBuilder; - this.location = location; - this.format = format; - } - - @Override - public B set(String property, String value) { - writeBuilder.set(property, value); - return self(); - } - - @Override - public B meta(String property, String value) { - writeBuilder.meta(property, value); - return self(); - } - - @Override - public B metricsConfig(MetricsConfig metricsConfig) { - writeBuilder.metricsConfig(metricsConfig); - return self(); - } - - @Override - public B overwrite() { - writeBuilder.overwrite(); - return self(); - } - - @Override - public B withFileEncryptionKey(ByteBuffer encryptionKey) { - writeBuilder.withFileEncryptionKey(encryptionKey); - return self(); - } - - @Override - public B withAADPrefix(ByteBuffer aadPrefix) { - writeBuilder.withAADPrefix(aadPrefix); - return self(); - } - - @Override - public B spec(PartitionSpec newSpec) { - this.spec = newSpec; - return self(); - } - - @Override - public B partition(StructLike newPartition) { - this.partition = newPartition; - return self(); - } - - @Override - public B keyMetadata(EncryptionKeyMetadata newKeyMetadata) { - this.keyMetadata = newKeyMetadata; - return self(); - } - - @Override - public B sortOrder(SortOrder newSortOrder) { - this.sortOrder = newSortOrder; - return self(); - } - - private static class DataFileWriteBuilder - extends CommonWriteBuilderImpl, D, S> - implements DataWriteBuilder { - private DataFileWriteBuilder( - WriteBuilder writeBuilder, String location, FileFormat format) { - super(writeBuilder, location, format); - } - - @Override - public DataFileWriteBuilder schema(Schema schema) { - super.writeBuilder.schema(schema); - return this; - } - - @Override - public DataFileWriteBuilder engineSchema(S schema) { - super.writeBuilder.engineSchema(schema); - 
return this; - } - - @Override - public DataFileWriteBuilder self() { - return this; - } - - @Override - public DataWriter build() throws IOException { - Preconditions.checkArgument(super.spec != null, "Cannot create data writer without spec"); - Preconditions.checkArgument( - super.spec.isUnpartitioned() || super.partition != null, - "Partition must not be null when creating data writer for partitioned spec"); - - return new DataWriter<>( - super.writeBuilder.build(), - super.format, - super.location, - super.spec, - super.partition, - super.keyMetadata, - super.sortOrder); - } - } - - private static class EqualityDeleteFileWriteBuilder - extends CommonWriteBuilderImpl, D, S> - implements EqualityDeleteWriteBuilder { - private Schema schema = null; - private int[] equalityFieldIds = null; - - private EqualityDeleteFileWriteBuilder( - WriteBuilder writeBuilder, String location, FileFormat format) { - super(writeBuilder, location, format); - } - - @Override - public EqualityDeleteFileWriteBuilder engineSchema(S newSchema) { - super.writeBuilder.engineSchema(newSchema); - return this; - } - - @Override - public EqualityDeleteFileWriteBuilder self() { - return this; - } - - @Override - public EqualityDeleteFileWriteBuilder schema(Schema newSchema) { - this.schema = newSchema; - return this; - } - - @Override - public EqualityDeleteFileWriteBuilder equalityFieldIds(int... fieldIds) { - this.equalityFieldIds = fieldIds; - return this; - } - - @Override - public EqualityDeleteWriter build() throws IOException { - Preconditions.checkState( - schema != null, "Cannot create equality delete file without a schema"); - Preconditions.checkState( - equalityFieldIds != null, "Cannot create equality delete file without delete field ids"); - Preconditions.checkArgument( - super.spec != null, "Spec must not be null when creating equality delete writer"); - Preconditions.checkArgument( - super.spec.isUnpartitioned() || super.partition != null, - "Partition must not be null for partitioned writes"); - - return new EqualityDeleteWriter<>( - super.writeBuilder - .schema(schema) - .meta("delete-type", "equality") - .meta( - "delete-field-ids", - IntStream.of(equalityFieldIds) - .mapToObj(Objects::toString) - .collect(Collectors.joining(", "))) - .build(), - super.format, - super.location, - super.spec, - super.partition, - super.keyMetadata, - super.sortOrder, - equalityFieldIds); - } - } - - @SuppressWarnings({"rawtypes", "unchecked"}) - private static class PositionDeleteFileWriteBuilder - extends CommonWriteBuilderImpl - implements PositionDeleteWriteBuilder { - - private PositionDeleteFileWriteBuilder( - WriteBuilder writeBuilder, String location, FileFormat format) { - super(writeBuilder, location, format); - } - - @Override - public PositionDeleteFileWriteBuilder self() { - return this; - } - - @Override - @SuppressWarnings("unchecked") - public PositionDeleteWriter build() throws IOException { - Preconditions.checkArgument( - super.spec != null, "Spec must not be null when creating position delete writer"); - Preconditions.checkArgument( - super.spec.isUnpartitioned() || super.partition != null, - "Partition must not be null for partitioned writes"); - - return new PositionDeleteWriter<>( - new PositionDeleteFileAppender( - super.writeBuilder.meta("delete-type", "position").build()), - super.format, - super.location, - super.spec, - super.partition, - super.keyMetadata); - } - } - - @SuppressWarnings("rawtypes") - private static class PositionDeleteFileAppender implements FileAppender { - private final 
FileAppender appender; - - PositionDeleteFileAppender(FileAppender appender) { - this.appender = appender; - } - - @Override - public void add(StructLike positionDelete) { - appender.add((PositionDelete) positionDelete); - } - - @Override - public Metrics metrics() { - return appender.metrics(); - } - - @Override - public long length() { - return appender.length(); - } - - @Override - public void close() throws IOException { - appender.close(); - } - - @Override - public List splitOffsets() { - return appender.splitOffsets(); - } - } -} diff --git a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java deleted file mode 100644 index d9f581eff263..000000000000 --- a/core/src/main/java/org/apache/iceberg/formats/DataWriteBuilder.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.formats; - -import java.io.IOException; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.io.DataWriter; - -/** - * A specialized builder for creating data content file writers. - * - *

    This builder extends the generic {@link CommonWriteBuilder} interface with functionality - * specific to creating {@link DataWriter} instances. Data writers produce table content files - * containing actual data records stored in an Iceberg table, configured according to the table's - * schema and partition specification. - * - * @param the type of data records the writer will accept - * @param the type of the schema for the input data - */ -public interface DataWriteBuilder extends CommonWriteBuilder> { - - /** Set the file schema. */ - DataWriteBuilder schema(Schema schema); - - /** - * Sets the engine's representation accepted by the writer. - * - *

    Some data types require additional type information from the engine schema that cannot be - * fully expressed by the Iceberg schema or the data itself. For example, a variant type may use a - * shredded representation that relies on engine-specific metadata to map back to the Iceberg - * schema. - * - *

    The engine schema must be aligned with the Iceberg schema, but may include representation - * details that Iceberg considers equivalent. - */ - DataWriteBuilder engineSchema(S schema); - - /** - * Creates a data file writer configured with the current builder settings. - * - *

    The returned {@link DataWriter} produces files that conform to the Iceberg table format, - * generating proper {@link DataFile} metadata on completion. The writer accepts input records - * exactly matching the Iceberg schema specified via {@link #schema(Schema)} for writing. - * - * @return a fully configured {@link DataWriter} instance - * @throws IOException if the writer cannot be created due to I/O errors - */ - DataWriter build() throws IOException; -} diff --git a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java deleted file mode 100644 index 85ff29663d1d..000000000000 --- a/core/src/main/java/org/apache/iceberg/formats/EqualityDeleteWriteBuilder.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.formats; - -import java.io.IOException; -import java.util.List; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.Schema; -import org.apache.iceberg.deletes.EqualityDeleteWriter; -import org.apache.iceberg.util.ArrayUtil; - -/** - * A specialized builder for creating equality-based delete file writers. - * - *

    This builder extends the generic {@link CommonWriteBuilder} interface with functionality - * specific to creating {@link EqualityDeleteWriter} instances. - * - *

    The builder provides methods to configure which fields should be used for equality comparison - * through {@link #equalityFieldIds(List)} or {@link #equalityFieldIds(int...)}, along with schema - * configuration for the delete records. - * - * @param the type of data records the writer will accept - * @param the type of the schema for the input data - */ -public interface EqualityDeleteWriteBuilder - extends CommonWriteBuilder> { - - /** Sets the row schema for the delete writers. */ - EqualityDeleteWriteBuilder schema(Schema schema); - - /** - * Sets the engine's representation accepted by the writer. - * - *

    The engine schema must be aligned with the Iceberg schema, but may include representation - * details that Iceberg considers equivalent. - */ - EqualityDeleteWriteBuilder engineSchema(S schema); - - /** Sets the equality field ids for the equality delete writer. */ - default EqualityDeleteWriteBuilder equalityFieldIds(List fieldIds) { - return equalityFieldIds(ArrayUtil.toIntArray(fieldIds)); - } - - /** Sets the equality field ids for the equality delete writer. */ - EqualityDeleteWriteBuilder equalityFieldIds(int... fieldIds); - - /** - * Creates an equality-based delete file writer configured with the current builder settings. - * - *

    The returned {@link EqualityDeleteWriter} produces files that identify records to be deleted - * based on field equality, generating proper {@link DeleteFile} metadata on completion. - * - *

    The writer accepts input records exactly matching the input schema specified via {@link - * #schema(Schema)} for deletion. - * - * @return a fully configured {@link EqualityDeleteWriter} instance - * @throws IOException if the writer cannot be created due to I/O errors - */ - EqualityDeleteWriter build() throws IOException; -} diff --git a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java similarity index 66% rename from core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java rename to core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java index b37e755926f1..8e1394ea24bb 100644 --- a/core/src/main/java/org/apache/iceberg/formats/CommonWriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java @@ -18,16 +18,19 @@ */ package org.apache.iceberg.formats; +import java.io.IOException; import java.nio.ByteBuffer; import java.util.Map; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; import org.apache.iceberg.SortOrder; import org.apache.iceberg.StructLike; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptionKeyMetadata; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileWriter; /** * A generic builder interface for creating specialized file writers in the Iceberg ecosystem. @@ -44,9 +47,10 @@ *

    Each concrete implementation configures the underlying file format writer while adding * content-specific metadata and behaviors. * - * @param the concrete builder type for method chaining + * @param the concrete writer type the builder produces + * @param the type of the schema for the input data */ -interface CommonWriteBuilder> { +public interface FileWriterBuilder, S> { /** * Set a writer configuration property which affects the writer behavior. @@ -55,7 +59,7 @@ interface CommonWriteBuilder> { * @param value config value * @return this for method chaining */ - B set(String property, String value); + FileWriterBuilder set(String property, String value); /** * Adds the new properties to the writer configuration. @@ -63,9 +67,9 @@ interface CommonWriteBuilder> { * @param properties a map of writer config properties * @return this for method chaining */ - default B setAll(Map properties) { + default FileWriterBuilder setAll(Map properties) { properties.forEach(this::set); - return self(); + return this; } /** @@ -75,7 +79,7 @@ default B setAll(Map properties) { * @param value config value * @return this for method chaining */ - B meta(String property, String value); + FileWriterBuilder meta(String property, String value); /** * Add the new properties to file metadata for the created file. @@ -83,40 +87,56 @@ default B setAll(Map properties) { * @param properties a map of file metadata properties * @return this for method chaining */ - default B meta(Map properties) { + default FileWriterBuilder meta(Map properties) { properties.forEach(this::meta); - return self(); + return this; } /** Sets the metrics configuration used for collecting column metrics for the created file. */ - B metricsConfig(MetricsConfig metricsConfig); + FileWriterBuilder metricsConfig(MetricsConfig metricsConfig); /** Overwrite the file if it already exists. By default, overwrite is disabled. */ - B overwrite(); + FileWriterBuilder overwrite(); /** * Sets the encryption key used for writing the file. If the writer does not support encryption, * then an exception should be thrown. */ - B withFileEncryptionKey(ByteBuffer encryptionKey); + FileWriterBuilder withFileEncryptionKey(ByteBuffer encryptionKey); /** * Sets the additional authentication data (AAD) prefix used for writing the file. If the writer * does not support encryption, then an exception should be thrown. */ - B withAADPrefix(ByteBuffer aadPrefix); + FileWriterBuilder withAADPrefix(ByteBuffer aadPrefix); /** Sets the partition specification for the Iceberg metadata. */ - B spec(PartitionSpec newSpec); + FileWriterBuilder spec(PartitionSpec newSpec); /** Sets the partition value for the Iceberg metadata. */ - B partition(StructLike partition); + FileWriterBuilder partition(StructLike partition); /** Sets the encryption key metadata for Iceberg metadata. */ - B keyMetadata(EncryptionKeyMetadata keyMetadata); + FileWriterBuilder keyMetadata(EncryptionKeyMetadata keyMetadata); /** Sets the sort order for the Iceberg metadata. */ - B sortOrder(SortOrder sortOrder); + FileWriterBuilder sortOrder(SortOrder sortOrder); - B self(); + /** Set the file schema. */ + FileWriterBuilder schema(Schema schema); + + /** + * Sets the engine's representation accepted by the writer. + * + *

    Some data types require additional type information from the engine schema that cannot be + * fully expressed by the Iceberg schema or the data itself. For example, a variant type may use a + * shredded representation that relies on engine-specific metadata to map back to the Iceberg + * schema. + * + *

    The engine schema must be aligned with the Iceberg schema, but may include representation + * details that Iceberg considers equivalent. + */ + FileWriterBuilder engineSchema(S schema); + + W build() throws IOException; } diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java index d1b2ddaac46b..2cf516a570eb 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -18,17 +18,32 @@ */ package org.apache.iceberg.formats; +import java.io.IOException; +import java.nio.ByteBuffer; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; +import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; import org.apache.iceberg.common.DynMethods; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptedOutputFile; +import org.apache.iceberg.encryption.EncryptionKeyMetadata; import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileWriter; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -42,17 +57,9 @@ * A registry that manages file-format-specific readers and writers through a unified object model * factory interface. * - *

    This registry provides access to {@link ReadBuilder}s for data consumption and various writer - * builders: - * - *

 - * <ul>
 - *   <li>{@link WriteBuilder} for basic file writing,
 - *   <li>{@link DataWriteBuilder} for data files,
 - *   <li>{@link EqualityDeleteWriteBuilder} for equality deletes,
 - *   <li>{@link PositionDeleteWriteBuilder} for position deletes.
 - * </ul>
 - *
 - * The appropriate builder is selected based on {@link FileFormat} and object model name.
 + *

    This registry provides access to {@link ReadBuilder}s for data consumption and {@link + * FileWriterBuilder}s for writing various types of Iceberg content files. The appropriate builder + * is selected based on {@link FileFormat} and object model name. * *
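A minimal sketch (not part of the patch) of the merged API described here: the registry now returns a FileWriterBuilder parameterized by the writer it produces. Record, tableSchema, spec, partition, record and encryptedOutputFile are placeholders, the generic model's engine schema type is assumed to be the Iceberg Schema, and toDataFile() is the existing DataWriter accessor.

    FileWriterBuilder<DataWriter<Record>, Schema> builder =
        FormatModelRegistry.dataWriteBuilder(FileFormat.PARQUET, Record.class, encryptedOutputFile);
    DataWriter<Record> writer =
        builder
            .schema(tableSchema)        // Iceberg table schema
            .engineSchema(tableSchema)  // engine representation; here the Iceberg schema itself
            .spec(spec)
            .partition(partition)
            .build();
    writer.write(record);
    writer.close();
    DataFile dataFile = writer.toDataFile();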

    {@link FormatModel} objects are registered through {@link #register(FormatModel)} and used for * creating readers and writers. Read builders are returned directly from the factory. Write @@ -124,8 +131,8 @@ public static synchronized void register(FormatModel formatModel) { */ public static ReadBuilder readBuilder( FileFormat format, Class type, InputFile inputFile) { - FormatModel factory = factoryFor(format, type); - return factory.readBuilder(inputFile); + FormatModel model = modelFor(format, type); + return model.readBuilder(inputFile); } /** @@ -143,11 +150,11 @@ public static ReadBuilder readBuilder( * @param the type of the input schema for the writer * @return a configured data write builder for creating a {@link DataWriter} */ - public static DataWriteBuilder dataWriteBuilder( + public static FileWriterBuilder, S> dataWriteBuilder( FileFormat format, Class type, EncryptedOutputFile outputFile) { - FormatModel factory = factoryFor(format, type); - return CommonWriteBuilderImpl.forDataFile( - factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + FormatModel model = modelFor(format, type); + return forDataFile( + model.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); } /** @@ -161,15 +168,19 @@ public static DataWriteBuilder dataWriteBuilder( * @param format the file format used for writing * @param type the input type * @param outputFile destination for the written data + * @param equalityFieldIds the field IDs that define the equality delete columns * @param the type of data records the writer will accept * @param the type of the input schema for the writer * @return a configured delete write builder for creating an {@link EqualityDeleteWriter} */ - public static EqualityDeleteWriteBuilder equalityDeleteWriteBuilder( - FileFormat format, Class type, EncryptedOutputFile outputFile) { - FormatModel factory = factoryFor(format, type); - return CommonWriteBuilderImpl.forEqualityDelete( - factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + public static FileWriterBuilder, S> equalityDeleteWriteBuilder( + FileFormat format, Class type, EncryptedOutputFile outputFile, int... 
equalityFieldIds) { + FormatModel model = modelFor(format, type); + return forEqualityDelete( + model.writeBuilder(outputFile), + outputFile.encryptingOutputFile().location(), + format, + equalityFieldIds); } /** @@ -185,11 +196,11 @@ public static EqualityDeleteWriteBuilder equalityDeleteWriteBuilder * @return a configured delete write builder for creating a {@link PositionDeleteWriter} */ @SuppressWarnings("rawtypes") - public static PositionDeleteWriteBuilder positionDeleteWriteBuilder( + public static FileWriterBuilder, ?> positionDeleteWriteBuilder( FileFormat format, EncryptedOutputFile outputFile) { - FormatModel factory = factoryFor(format, PositionDelete.class); - return CommonWriteBuilderImpl.forPositionDelete( - factory.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); + FormatModel model = modelFor(format, PositionDelete.class); + return forPositionDelete( + model.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); } @VisibleForTesting @@ -198,7 +209,7 @@ public static PositionDeleteWriteBuilder positionDeleteWriteBuilder( } @SuppressWarnings("unchecked") - private static FormatModel factoryFor(FileFormat format, Class type) { + private static FormatModel modelFor(FileFormat format, Class type) { FormatModel model = (FormatModel) MODELS.get(Pair.of(format, type)); Preconditions.checkArgument( model != null, "Format model is not registered for format %s and type %s", format, type); @@ -219,5 +230,234 @@ private static void registerSupportedFormats() { } } + private static FileWriterBuilder, S> forDataFile( + WriteBuilder writeBuilder, String location, FileFormat format) { + return new FileWriterBuilderImpl<>( + writeBuilder.content(FileContent.DATA), + location, + format, + builder -> { + Preconditions.checkArgument( + builder.spec != null, "Cannot create data writer without spec"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + "Partition must not be null when creating data writer for partitioned spec"); + + return new DataWriter<>( + builder.writeBuilder.build(), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata, + builder.sortOrder); + }); + } + + private static FileWriterBuilder, S> forEqualityDelete( + WriteBuilder writeBuilder, String location, FileFormat format, int[] equalityFieldIds) { + return new FileWriterBuilderImpl<>( + writeBuilder.content(FileContent.EQUALITY_DELETES), + location, + format, + builder -> { + Preconditions.checkState( + builder.schema != null, "Cannot create equality delete file without a schema"); + Preconditions.checkState( + equalityFieldIds != null, + "Cannot create equality delete file without delete field ids"); + Preconditions.checkArgument( + builder.spec != null, "Spec must not be null when creating equality delete writer"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + "Partition must not be null for partitioned writes"); + + return new EqualityDeleteWriter<>( + builder + .writeBuilder + .schema(builder.schema) + .meta("delete-type", "equality") + .meta( + "delete-field-ids", + IntStream.of(equalityFieldIds) + .mapToObj(Objects::toString) + .collect(Collectors.joining(", "))) + .build(), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata, + builder.sortOrder, + equalityFieldIds); + }); + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + private static FileWriterBuilder, ?> 
forPositionDelete( + WriteBuilder writeBuilder, String location, FileFormat format) { + return new FileWriterBuilderImpl<>( + (WriteBuilder) writeBuilder.content(FileContent.POSITION_DELETES), + location, + format, + builder -> { + Preconditions.checkArgument( + builder.spec != null, "Spec must not be null when creating position delete writer"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + "Partition must not be null for partitioned writes"); + + return new PositionDeleteWriter<>( + new PositionDeleteFileAppender( + builder.writeBuilder.meta("delete-type", "position").build()), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata); + }); + } + private FormatModelRegistry() {} + + private static class FileWriterBuilderImpl, D, S> + implements FileWriterBuilder { + private final WriteBuilder writeBuilder; + private final String location; + private final FileFormat format; + private final BuilderFunction builderMethod; + private Schema schema = null; + private PartitionSpec spec = null; + private StructLike partition = null; + private EncryptionKeyMetadata keyMetadata = null; + private SortOrder sortOrder = null; + + private FileWriterBuilderImpl( + WriteBuilder writeBuilder, + String location, + FileFormat format, + BuilderFunction builderMethod) { + this.writeBuilder = writeBuilder; + this.location = location; + this.format = format; + this.builderMethod = builderMethod; + } + + @Override + public FileWriterBuilderImpl set(String property, String value) { + writeBuilder.set(property, value); + return this; + } + + @Override + public FileWriterBuilderImpl meta(String property, String value) { + writeBuilder.meta(property, value); + return this; + } + + @Override + public FileWriterBuilderImpl metricsConfig(MetricsConfig metricsConfig) { + writeBuilder.metricsConfig(metricsConfig); + return this; + } + + @Override + public FileWriterBuilderImpl overwrite() { + writeBuilder.overwrite(); + return this; + } + + @Override + public FileWriterBuilderImpl withFileEncryptionKey(ByteBuffer encryptionKey) { + writeBuilder.withFileEncryptionKey(encryptionKey); + return this; + } + + @Override + public FileWriterBuilderImpl withAADPrefix(ByteBuffer aadPrefix) { + writeBuilder.withAADPrefix(aadPrefix); + return this; + } + + @Override + public FileWriterBuilderImpl schema(Schema newSchema) { + writeBuilder.schema(newSchema); + this.schema = newSchema; + return this; + } + + @Override + public FileWriterBuilderImpl engineSchema(S newSchema) { + writeBuilder.engineSchema(newSchema); + return this; + } + + @Override + public FileWriterBuilderImpl spec(PartitionSpec newSpec) { + this.spec = newSpec; + return this; + } + + @Override + public FileWriterBuilderImpl partition(StructLike newPartition) { + this.partition = newPartition; + return this; + } + + @Override + public FileWriterBuilderImpl keyMetadata(EncryptionKeyMetadata newKeyMetadata) { + this.keyMetadata = newKeyMetadata; + return this; + } + + @Override + public FileWriterBuilderImpl sortOrder(SortOrder newSortOrder) { + this.sortOrder = newSortOrder; + return this; + } + + @Override + public W build() throws IOException { + return builderMethod.apply(this); + } + } + + @FunctionalInterface + private interface BuilderFunction, D, S> { + B apply(FileWriterBuilderImpl builder) throws IOException; + } + + @SuppressWarnings("rawtypes") + private static class PositionDeleteFileAppender implements FileAppender { + private final FileAppender appender; + + 
PositionDeleteFileAppender(FileAppender appender) { + this.appender = appender; + } + + @Override + public void add(StructLike positionDelete) { + appender.add((PositionDelete) positionDelete); + } + + @Override + public Metrics metrics() { + return appender.metrics(); + } + + @Override + public long length() { + return appender.length(); + } + + @Override + public void close() throws IOException { + appender.close(); + } + + @Override + public List splitOffsets() { + return appender.splitOffsets(); + } + } } diff --git a/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java deleted file mode 100644 index ee379bfc249d..000000000000 --- a/core/src/main/java/org/apache/iceberg/formats/PositionDeleteWriteBuilder.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -package org.apache.iceberg.formats; - -import java.io.IOException; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.deletes.PositionDelete; -import org.apache.iceberg.deletes.PositionDeleteWriter; - -/** - * A specialized builder for creating position-based delete file writers. - * - *
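For completeness, a sketch (not part of the patch) of the position delete path through the same registry entry point; delete(path, position) and toDeleteFile() are the existing PositionDeleteWriter API, while encryptedOutputFile, spec, partition and dataFilePath are placeholders.

    FileWriterBuilder<PositionDeleteWriter<?>, ?> builder =
        FormatModelRegistry.positionDeleteWriteBuilder(FileFormat.PARQUET, encryptedOutputFile);
    PositionDeleteWriter<?> deleteWriter = builder.spec(spec).partition(partition).build();
    deleteWriter.delete(dataFilePath, 100L);  // marks row 100 of the referenced data file as deleted
    deleteWriter.close();
    DeleteFile deleteFile = deleteWriter.toDeleteFile();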

    This builder extends the generic {@link CommonWriteBuilder} interface with functionality - * specific to creating {@link PositionDeleteWriter} instances. - */ -public interface PositionDeleteWriteBuilder extends CommonWriteBuilder { - - /** - * Creates a position-based delete file writer configured with the current builder settings. - * - *

    The returned {@link PositionDeleteWriter} produces files that identify records to be deleted - * by their file path and position, generating proper {@link DeleteFile} metadata on completion. - * The writer expects {@link PositionDelete} records as input. - * - * @param Only kept for backwards compatibility, the writer expects {@link PositionDelete} - * records as input, and the actual row data is not used. - * @return a fully configured {@link PositionDeleteWriter} instance - * @throws IOException if the writer cannot be created due to I/O errors - */ - PositionDeleteWriter build() throws IOException; -} diff --git a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java index 8fb841f2fc01..9b1918f57cc6 100644 --- a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java @@ -33,10 +33,8 @@ import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.encryption.EncryptionKeyMetadata; -import org.apache.iceberg.formats.DataWriteBuilder; -import org.apache.iceberg.formats.EqualityDeleteWriteBuilder; +import org.apache.iceberg.formats.FileWriterBuilder; import org.apache.iceberg.formats.FormatModelRegistry; -import org.apache.iceberg.formats.PositionDeleteWriteBuilder; import org.apache.iceberg.io.DataWriter; import org.apache.iceberg.io.FileWriterFactory; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -107,7 +105,7 @@ public DataWriter newDataWriter( table != null ? MetricsConfig.forTable(table) : MetricsConfig.getDefault(); try { - DataWriteBuilder builder = + FileWriterBuilder, S> builder = FormatModelRegistry.dataWriteBuilder(dataFileFormat, inputType, file); return builder .schema(dataSchema) @@ -137,15 +135,15 @@ public EqualityDeleteWriter newEqualityDeleteWriter( table != null ? MetricsConfig.forTable(table) : MetricsConfig.getDefault(); try { - EqualityDeleteWriteBuilder builder = - FormatModelRegistry.equalityDeleteWriteBuilder(deleteFileFormat, inputType, file); + FileWriterBuilder, S> builder = + FormatModelRegistry.equalityDeleteWriteBuilder( + deleteFileFormat, inputType, file, equalityFieldIds); return builder .setAll(properties) .setAll(writerProperties) .metricsConfig(metricsConfig) .schema(equalityDeleteRowSchema) .engineSchema(equalityDeleteInputSchema()) - .equalityFieldIds(equalityFieldIds) .spec(spec) .partition(partition) .keyMetadata(keyMetadata) @@ -166,7 +164,7 @@ public PositionDeleteWriter newPositionDeleteWriter( table != null ? 
MetricsConfig.forPositionDelete(table) : MetricsConfig.forPositionDelete(); try { - PositionDeleteWriteBuilder builder = + FileWriterBuilder, ?> builder = FormatModelRegistry.positionDeleteWriteBuilder(deleteFileFormat, file); return builder .setAll(properties) diff --git a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java index 220244f9197f..37a0caba977a 100644 --- a/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java +++ b/spark/v4.0/spark/src/main/java/org/apache/iceberg/spark/actions/RewriteTablePathSparkAction.java @@ -57,8 +57,8 @@ import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptedFiles; import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.formats.FileWriterBuilder; import org.apache.iceberg.formats.FormatModelRegistry; -import org.apache.iceberg.formats.PositionDeleteWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; @@ -740,7 +740,7 @@ private static PositionDeleteWriter positionDeletesWriter( Schema rowSchema) throws IOException { if (rowSchema == null) { - PositionDeleteWriteBuilder builder = + FileWriterBuilder, ?> builder = FormatModelRegistry.positionDeleteWriteBuilder( format, EncryptedFiles.plainAsEncryptedOutput(outputFile)); return builder.partition(partition).spec(spec).build(); From 8a8a67ec7cc30a24a6ea6192da6c8da4858b92b7 Mon Sep 17 00:00:00 2001 From: Peter Vary Date: Sat, 31 Jan 2026 11:38:05 +0100 Subject: [PATCH 15/15] Ryans --- .../apache/iceberg/avro/AvroFormatModel.java | 28 +- .../iceberg/formats/FileWriterBuilder.java | 16 + .../formats/FileWriterBuilderImpl.java | 282 ++++++++++++++++++ .../apache/iceberg/formats/FormatModel.java | 4 +- .../iceberg/formats/FormatModelRegistry.java | 271 +---------------- ...iteBuilder.java => ModelWriteBuilder.java} | 28 +- .../formats/TestFormatModelRegistry.java | 2 +- .../data/RegistryBasedFileWriterFactory.java | 4 +- .../apache/iceberg/orc/ORCFormatModel.java | 26 +- .../iceberg/parquet/ParquetFormatModel.java | 28 +- 10 files changed, 371 insertions(+), 318 deletions(-) create mode 100644 core/src/main/java/org/apache/iceberg/formats/FileWriterBuilderImpl.java rename core/src/main/java/org/apache/iceberg/formats/{WriteBuilder.java => ModelWriteBuilder.java} (80%) diff --git a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java index 15438e6f21dc..351553081447 100644 --- a/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java +++ b/core/src/main/java/org/apache/iceberg/avro/AvroFormatModel.java @@ -31,8 +31,8 @@ import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; +import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; -import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; @@ -67,7 +67,7 @@ public FileFormat format() { } @Override - public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { return new 
WriteBuilderWrapper<>(outputFile, writerFunction()); } @@ -76,7 +76,7 @@ public ReadBuilder readBuilder(InputFile inputFile) { return new ReadBuilderWrapper<>(inputFile, readerFunction()); } - private static class WriteBuilderWrapper implements WriteBuilder { + private static class WriteBuilderWrapper implements ModelWriteBuilder { private final Avro.WriteBuilder internal; private final WriterFunction, S, Schema> writerFunction; private S inputSchema; @@ -89,66 +89,66 @@ private WriteBuilderWrapper( } @Override - public WriteBuilder schema(org.apache.iceberg.Schema schema) { + public ModelWriteBuilder schema(org.apache.iceberg.Schema schema) { internal.schema(schema); return this; } @Override - public WriteBuilder engineSchema(S schema) { + public ModelWriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @Override - public WriteBuilder set(String property, String value) { + public ModelWriteBuilder set(String property, String value) { internal.set(property, value); return this; } @Override - public WriteBuilder setAll(Map properties) { + public ModelWriteBuilder setAll(Map properties) { internal.setAll(properties); return this; } @Override - public WriteBuilder meta(String property, String value) { + public ModelWriteBuilder meta(String property, String value) { internal.meta(property, value); return this; } @Override - public WriteBuilder meta(Map properties) { + public ModelWriteBuilder meta(Map properties) { internal.meta(properties); return this; } @Override - public WriteBuilder content(FileContent newContent) { + public ModelWriteBuilder content(FileContent newContent) { this.content = newContent; return this; } @Override - public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + public ModelWriteBuilder metricsConfig(MetricsConfig metricsConfig) { internal.metricsConfig(metricsConfig); return this; } @Override - public WriteBuilder overwrite() { + public ModelWriteBuilder overwrite() { internal.overwrite(); return this; } @Override - public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + public ModelWriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { throw new UnsupportedOperationException("Avro does not support file encryption keys"); } @Override - public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { throw new UnsupportedOperationException("Avro does not support AAD prefix"); } diff --git a/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java index 8e1394ea24bb..7dd864795888 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilder.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.util.List; import java.util.Map; import org.apache.iceberg.MetricsConfig; import org.apache.iceberg.PartitionSpec; @@ -31,6 +32,7 @@ import org.apache.iceberg.encryption.EncryptionKeyMetadata; import org.apache.iceberg.io.DataWriter; import org.apache.iceberg.io.FileWriter; +import org.apache.iceberg.util.ArrayUtil; /** * A generic builder interface for creating specialized file writers in the Iceberg ecosystem. @@ -138,5 +140,19 @@ default FileWriterBuilder meta(Map properties) { */ FileWriterBuilder engineSchema(S schema); + /** + * Sets the equality field ids for the equality delete writer. 
Only applicable when building an + * {@link EqualityDeleteWriter}. + */ + default FileWriterBuilder equalityFieldIds(List fieldIds) { + return equalityFieldIds(ArrayUtil.toIntArray(fieldIds)); + } + + /** + * Sets the equality field ids for the equality delete writer. Only applicable when building an + * {@link EqualityDeleteWriter}. + */ + FileWriterBuilder equalityFieldIds(int... fieldIds); + W build() throws IOException; } diff --git a/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilderImpl.java b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilderImpl.java new file mode 100644 index 000000000000..bc8a2d2c3f0e --- /dev/null +++ b/core/src/main/java/org/apache/iceberg/formats/FileWriterBuilderImpl.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.formats; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Objects; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import org.apache.iceberg.FileContent; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Metrics; +import org.apache.iceberg.MetricsConfig; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StructLike; +import org.apache.iceberg.deletes.EqualityDeleteWriter; +import org.apache.iceberg.deletes.PositionDelete; +import org.apache.iceberg.deletes.PositionDeleteWriter; +import org.apache.iceberg.encryption.EncryptionKeyMetadata; +import org.apache.iceberg.io.DataWriter; +import org.apache.iceberg.io.FileAppender; +import org.apache.iceberg.io.FileWriter; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; + +class FileWriterBuilderImpl, D, S> implements FileWriterBuilder { + private final ModelWriteBuilder modelWriteBuilder; + private final String location; + private final FileFormat format; + private final BuilderFunction builderMethod; + private Schema schema = null; + private PartitionSpec spec = null; + private StructLike partition = null; + private EncryptionKeyMetadata keyMetadata = null; + private SortOrder sortOrder = null; + private int[] equalityFieldIds = null; + + static FileWriterBuilder, S> forDataFile( + ModelWriteBuilder modelWriteBuilder, String location, FileFormat format) { + return new FileWriterBuilderImpl<>( + modelWriteBuilder.content(FileContent.DATA), + location, + format, + builder -> { + Preconditions.checkState(builder.schema != null, "Invalid schema for data writer: null"); + Preconditions.checkArgument( + builder.spec != null, "Invalid partition spec for data writer: null"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + 
"Invalid partition, does not match spec: %s", + builder.spec); + + return new DataWriter<>( + builder.modelWriteBuilder.build(), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata, + builder.sortOrder); + }); + } + + static FileWriterBuilder, S> forEqualityDelete( + ModelWriteBuilder modelWriteBuilder, String location, FileFormat format) { + return new FileWriterBuilderImpl<>( + modelWriteBuilder.content(FileContent.EQUALITY_DELETES), + location, + format, + builder -> { + Preconditions.checkState( + builder.schema != null, "Invalid schema for equality delete writer: null"); + Preconditions.checkState( + builder.equalityFieldIds != null, + "Invalid delete field ids for equality delete writer: null"); + Preconditions.checkArgument( + builder.spec != null, "Invalid partition spec for equality delete writer: null"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + "Invalid partition, does not match spec: %s", + builder.spec); + + return new EqualityDeleteWriter<>( + builder + .modelWriteBuilder + .schema(builder.schema) + .meta("delete-type", "equality") + .meta( + "delete-field-ids", + IntStream.of(builder.equalityFieldIds) + .mapToObj(Objects::toString) + .collect(Collectors.joining(", "))) + .build(), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata, + builder.sortOrder, + builder.equalityFieldIds); + }); + } + + @SuppressWarnings({"unchecked", "rawtypes"}) + static FileWriterBuilder, S> forPositionDelete( + ModelWriteBuilder, S> modelWriteBuilder, + String location, + FileFormat format) { + return new FileWriterBuilderImpl<>( + modelWriteBuilder.content(FileContent.POSITION_DELETES), + location, + format, + builder -> { + Preconditions.checkArgument( + builder.spec != null, "Invalid partition spec for position delete writer: null"); + Preconditions.checkArgument( + builder.spec.isUnpartitioned() || builder.partition != null, + "Invalid partition, does not match spec: %s"); + + return new PositionDeleteWriter<>( + new PositionDeleteFileAppender<>( + builder.modelWriteBuilder.meta("delete-type", "position").build()), + builder.format, + builder.location, + builder.spec, + builder.partition, + builder.keyMetadata); + }); + } + + @FunctionalInterface + interface BuilderFunction, D, S> { + B apply(FileWriterBuilderImpl builder) throws IOException; + } + + FileWriterBuilderImpl( + ModelWriteBuilder modelWriteBuilder, + String location, + FileFormat format, + BuilderFunction builderMethod) { + this.modelWriteBuilder = modelWriteBuilder; + this.location = location; + this.format = format; + this.builderMethod = builderMethod; + } + + @Override + public FileWriterBuilderImpl set(String property, String value) { + modelWriteBuilder.set(property, value); + return this; + } + + @Override + public FileWriterBuilderImpl meta(String property, String value) { + modelWriteBuilder.meta(property, value); + return this; + } + + @Override + public FileWriterBuilderImpl metricsConfig(MetricsConfig metricsConfig) { + modelWriteBuilder.metricsConfig(metricsConfig); + return this; + } + + @Override + public FileWriterBuilderImpl overwrite() { + modelWriteBuilder.overwrite(); + return this; + } + + @Override + public FileWriterBuilderImpl withFileEncryptionKey(ByteBuffer encryptionKey) { + modelWriteBuilder.withFileEncryptionKey(encryptionKey); + return this; + } + + @Override + public FileWriterBuilderImpl withAADPrefix(ByteBuffer aadPrefix) { + 
modelWriteBuilder.withAADPrefix(aadPrefix); + return this; + } + + @Override + public FileWriterBuilderImpl schema(Schema newSchema) { + modelWriteBuilder.schema(newSchema); + this.schema = newSchema; + return this; + } + + @Override + public FileWriterBuilderImpl engineSchema(S newSchema) { + modelWriteBuilder.engineSchema(newSchema); + return this; + } + + @Override + public FileWriterBuilderImpl spec(PartitionSpec newSpec) { + this.spec = newSpec; + return this; + } + + @Override + public FileWriterBuilderImpl partition(StructLike newPartition) { + this.partition = newPartition; + return this; + } + + @Override + public FileWriterBuilderImpl keyMetadata(EncryptionKeyMetadata newKeyMetadata) { + this.keyMetadata = newKeyMetadata; + return this; + } + + @Override + public FileWriterBuilderImpl sortOrder(SortOrder newSortOrder) { + this.sortOrder = newSortOrder; + return this; + } + + @Override + public FileWriterBuilderImpl equalityFieldIds(int... fieldIds) { + this.equalityFieldIds = fieldIds; + return this; + } + + @Override + public W build() throws IOException { + return builderMethod.apply(this); + } + + private static class PositionDeleteFileAppender implements FileAppender { + private final FileAppender> appender; + + PositionDeleteFileAppender(FileAppender> appender) { + this.appender = appender; + } + + @SuppressWarnings("unchecked") + @Override + public void add(StructLike positionDelete) { + appender.add((PositionDelete) positionDelete); + } + + @Override + public Metrics metrics() { + return appender.metrics(); + } + + @Override + public long length() { + return appender.length(); + } + + @Override + public void close() throws IOException { + appender.close(); + } + + @Override + public List splitOffsets() { + return appender.splitOffsets(); + } + } +} diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModel.java b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java index c8164aba1d8f..307a12625cd3 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModel.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModel.java @@ -70,13 +70,13 @@ public interface FormatModel { /** * Creates a writer builder for data files. * - *
<p>
    The returned {@link WriteBuilder} configures and creates a writer that converts input + *
<p>
    The returned {@link ModelWriteBuilder} configures and creates a writer that converts input * objects into the file format supported by this factory. * * @param outputFile destination for the written data * @return configured writer builder */ - WriteBuilder writeBuilder(EncryptedOutputFile outputFile); + ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile); /** * Creates a file reader builder for the specified input file. diff --git a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java index 2cf516a570eb..8a848e9b3354 100644 --- a/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java +++ b/core/src/main/java/org/apache/iceberg/formats/FormatModelRegistry.java @@ -18,32 +18,17 @@ */ package org.apache.iceberg.formats; -import java.io.IOException; -import java.nio.ByteBuffer; import java.util.List; import java.util.Map; -import java.util.Objects; -import java.util.stream.Collectors; -import java.util.stream.IntStream; import org.apache.iceberg.DataFile; import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.FileContent; import org.apache.iceberg.FileFormat; -import org.apache.iceberg.Metrics; -import org.apache.iceberg.MetricsConfig; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.SortOrder; -import org.apache.iceberg.StructLike; import org.apache.iceberg.common.DynMethods; import org.apache.iceberg.deletes.EqualityDeleteWriter; import org.apache.iceberg.deletes.PositionDelete; import org.apache.iceberg.deletes.PositionDeleteWriter; import org.apache.iceberg.encryption.EncryptedOutputFile; -import org.apache.iceberg.encryption.EncryptionKeyMetadata; import org.apache.iceberg.io.DataWriter; -import org.apache.iceberg.io.FileAppender; -import org.apache.iceberg.io.FileWriter; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; @@ -67,6 +52,8 @@ * requested builder type. */ public final class FormatModelRegistry { + private FormatModelRegistry() {} + private static final Logger LOG = LoggerFactory.getLogger(FormatModelRegistry.class); // The list of classes which are used for registering the reader and writer builders private static final List CLASSES_TO_REGISTER = @@ -153,7 +140,7 @@ public static ReadBuilder readBuilder( public static FileWriterBuilder, S> dataWriteBuilder( FileFormat format, Class type, EncryptedOutputFile outputFile) { FormatModel model = modelFor(format, type); - return forDataFile( + return FileWriterBuilderImpl.forDataFile( model.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); } @@ -168,19 +155,15 @@ public static FileWriterBuilder, S> dataWriteBuilder( * @param format the file format used for writing * @param type the input type * @param outputFile destination for the written data - * @param equalityFieldIds the field IDs that define the equality delete columns * @param the type of data records the writer will accept * @param the type of the input schema for the writer * @return a configured delete write builder for creating an {@link EqualityDeleteWriter} */ public static FileWriterBuilder, S> equalityDeleteWriteBuilder( - FileFormat format, Class type, EncryptedOutputFile outputFile, int... 
equalityFieldIds) { + FileFormat format, Class type, EncryptedOutputFile outputFile) { FormatModel model = modelFor(format, type); - return forEqualityDelete( - model.writeBuilder(outputFile), - outputFile.encryptingOutputFile().location(), - format, - equalityFieldIds); + return FileWriterBuilderImpl.forEqualityDelete( + model.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); } /** @@ -195,11 +178,12 @@ public static FileWriterBuilder, S> equalityDelet * @param outputFile destination for the written data * @return a configured delete write builder for creating a {@link PositionDeleteWriter} */ - @SuppressWarnings("rawtypes") + @SuppressWarnings({"unchecked", "rawtypes"}) public static FileWriterBuilder, ?> positionDeleteWriteBuilder( FileFormat format, EncryptedOutputFile outputFile) { - FormatModel model = modelFor(format, PositionDelete.class); - return forPositionDelete( + FormatModel, ?> model = + (FormatModel, ?>) (FormatModel) modelFor(format, PositionDelete.class); + return FileWriterBuilderImpl.forPositionDelete( model.writeBuilder(outputFile), outputFile.encryptingOutputFile().location(), format); } @@ -225,239 +209,10 @@ private static void registerSupportedFormats() { } catch (NoSuchMethodException e) { // failing to register a factory is normal and does not require a stack trace LOG.info( - "Skip registration of {}. Likely the jar is not in the classpath", classToRegister); + "Unable to call register for ({}). Check for missing jars on the classpath: {}", + classToRegister, + e.getMessage()); } } } - - private static FileWriterBuilder, S> forDataFile( - WriteBuilder writeBuilder, String location, FileFormat format) { - return new FileWriterBuilderImpl<>( - writeBuilder.content(FileContent.DATA), - location, - format, - builder -> { - Preconditions.checkArgument( - builder.spec != null, "Cannot create data writer without spec"); - Preconditions.checkArgument( - builder.spec.isUnpartitioned() || builder.partition != null, - "Partition must not be null when creating data writer for partitioned spec"); - - return new DataWriter<>( - builder.writeBuilder.build(), - builder.format, - builder.location, - builder.spec, - builder.partition, - builder.keyMetadata, - builder.sortOrder); - }); - } - - private static FileWriterBuilder, S> forEqualityDelete( - WriteBuilder writeBuilder, String location, FileFormat format, int[] equalityFieldIds) { - return new FileWriterBuilderImpl<>( - writeBuilder.content(FileContent.EQUALITY_DELETES), - location, - format, - builder -> { - Preconditions.checkState( - builder.schema != null, "Cannot create equality delete file without a schema"); - Preconditions.checkState( - equalityFieldIds != null, - "Cannot create equality delete file without delete field ids"); - Preconditions.checkArgument( - builder.spec != null, "Spec must not be null when creating equality delete writer"); - Preconditions.checkArgument( - builder.spec.isUnpartitioned() || builder.partition != null, - "Partition must not be null for partitioned writes"); - - return new EqualityDeleteWriter<>( - builder - .writeBuilder - .schema(builder.schema) - .meta("delete-type", "equality") - .meta( - "delete-field-ids", - IntStream.of(equalityFieldIds) - .mapToObj(Objects::toString) - .collect(Collectors.joining(", "))) - .build(), - builder.format, - builder.location, - builder.spec, - builder.partition, - builder.keyMetadata, - builder.sortOrder, - equalityFieldIds); - }); - } - - @SuppressWarnings({"unchecked", "rawtypes"}) - private static FileWriterBuilder, 
?> forPositionDelete( - WriteBuilder writeBuilder, String location, FileFormat format) { - return new FileWriterBuilderImpl<>( - (WriteBuilder) writeBuilder.content(FileContent.POSITION_DELETES), - location, - format, - builder -> { - Preconditions.checkArgument( - builder.spec != null, "Spec must not be null when creating position delete writer"); - Preconditions.checkArgument( - builder.spec.isUnpartitioned() || builder.partition != null, - "Partition must not be null for partitioned writes"); - - return new PositionDeleteWriter<>( - new PositionDeleteFileAppender( - builder.writeBuilder.meta("delete-type", "position").build()), - builder.format, - builder.location, - builder.spec, - builder.partition, - builder.keyMetadata); - }); - } - - private FormatModelRegistry() {} - - private static class FileWriterBuilderImpl, D, S> - implements FileWriterBuilder { - private final WriteBuilder writeBuilder; - private final String location; - private final FileFormat format; - private final BuilderFunction builderMethod; - private Schema schema = null; - private PartitionSpec spec = null; - private StructLike partition = null; - private EncryptionKeyMetadata keyMetadata = null; - private SortOrder sortOrder = null; - - private FileWriterBuilderImpl( - WriteBuilder writeBuilder, - String location, - FileFormat format, - BuilderFunction builderMethod) { - this.writeBuilder = writeBuilder; - this.location = location; - this.format = format; - this.builderMethod = builderMethod; - } - - @Override - public FileWriterBuilderImpl set(String property, String value) { - writeBuilder.set(property, value); - return this; - } - - @Override - public FileWriterBuilderImpl meta(String property, String value) { - writeBuilder.meta(property, value); - return this; - } - - @Override - public FileWriterBuilderImpl metricsConfig(MetricsConfig metricsConfig) { - writeBuilder.metricsConfig(metricsConfig); - return this; - } - - @Override - public FileWriterBuilderImpl overwrite() { - writeBuilder.overwrite(); - return this; - } - - @Override - public FileWriterBuilderImpl withFileEncryptionKey(ByteBuffer encryptionKey) { - writeBuilder.withFileEncryptionKey(encryptionKey); - return this; - } - - @Override - public FileWriterBuilderImpl withAADPrefix(ByteBuffer aadPrefix) { - writeBuilder.withAADPrefix(aadPrefix); - return this; - } - - @Override - public FileWriterBuilderImpl schema(Schema newSchema) { - writeBuilder.schema(newSchema); - this.schema = newSchema; - return this; - } - - @Override - public FileWriterBuilderImpl engineSchema(S newSchema) { - writeBuilder.engineSchema(newSchema); - return this; - } - - @Override - public FileWriterBuilderImpl spec(PartitionSpec newSpec) { - this.spec = newSpec; - return this; - } - - @Override - public FileWriterBuilderImpl partition(StructLike newPartition) { - this.partition = newPartition; - return this; - } - - @Override - public FileWriterBuilderImpl keyMetadata(EncryptionKeyMetadata newKeyMetadata) { - this.keyMetadata = newKeyMetadata; - return this; - } - - @Override - public FileWriterBuilderImpl sortOrder(SortOrder newSortOrder) { - this.sortOrder = newSortOrder; - return this; - } - - @Override - public W build() throws IOException { - return builderMethod.apply(this); - } - } - - @FunctionalInterface - private interface BuilderFunction, D, S> { - B apply(FileWriterBuilderImpl builder) throws IOException; - } - - @SuppressWarnings("rawtypes") - private static class PositionDeleteFileAppender implements FileAppender { - private final FileAppender appender; - - 
PositionDeleteFileAppender(FileAppender appender) { - this.appender = appender; - } - - @Override - public void add(StructLike positionDelete) { - appender.add((PositionDelete) positionDelete); - } - - @Override - public Metrics metrics() { - return appender.metrics(); - } - - @Override - public long length() { - return appender.length(); - } - - @Override - public void close() throws IOException { - appender.close(); - } - - @Override - public List splitOffsets() { - return appender.splitOffsets(); - } - } } diff --git a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java b/core/src/main/java/org/apache/iceberg/formats/ModelWriteBuilder.java similarity index 80% rename from core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java rename to core/src/main/java/org/apache/iceberg/formats/ModelWriteBuilder.java index b3f2c43bb7c3..1a65305b2a3a 100644 --- a/core/src/main/java/org/apache/iceberg/formats/WriteBuilder.java +++ b/core/src/main/java/org/apache/iceberg/formats/ModelWriteBuilder.java @@ -28,9 +28,9 @@ /** * Builder interface for creating file writers across supported data file formats. The {@link - * FormatModel} implementations provide the appropriate {@link WriteBuilder} instances. + * FormatModel} implementations provide the appropriate {@link ModelWriteBuilder} instances. * - *
<p>
    The {@link WriteBuilder} follows the builder pattern to configure and create {@link + *
<p>
    The {@link ModelWriteBuilder} follows the builder pattern to configure and create {@link * FileAppender} instances that write data to the target output files. * *
<p>
    This interface is directly exposed to users for parameterizing when only an appender is @@ -39,9 +39,9 @@ * @param the output data type produced by the reader * @param the type of the schema for the output data type */ -public interface WriteBuilder { +public interface ModelWriteBuilder { /** Set the file schema. */ - WriteBuilder schema(Schema schema); + ModelWriteBuilder schema(Schema schema); /** * Sets the engine's representation accepted by the writer. @@ -54,7 +54,7 @@ public interface WriteBuilder { *
<p>
    The engine schema must be aligned with the Iceberg schema, but may include representation * details that Iceberg considers equivalent. */ - WriteBuilder engineSchema(S schema); + ModelWriteBuilder engineSchema(S schema); /** * Set a writer configuration property which affects the writer behavior. Writer builders should @@ -64,7 +64,7 @@ public interface WriteBuilder { * @param value config value * @return this for method chaining */ - WriteBuilder set(String property, String value); + ModelWriteBuilder set(String property, String value); /** * Sets multiple writer configuration properties that affect the writer behavior. Writer builders @@ -73,7 +73,7 @@ public interface WriteBuilder { * @param properties writer config properties to set * @return this for method chaining */ - default WriteBuilder setAll(Map properties) { + default ModelWriteBuilder setAll(Map properties) { properties.forEach(this::set); return this; } @@ -85,7 +85,7 @@ default WriteBuilder setAll(Map properties) { * @param value config value * @return this for method chaining */ - WriteBuilder meta(String property, String value); + ModelWriteBuilder meta(String property, String value); /** * Sets multiple file metadata properties in the created file. @@ -93,7 +93,7 @@ default WriteBuilder setAll(Map properties) { * @param properties file metadata properties to set * @return this for method chaining */ - default WriteBuilder meta(Map properties) { + default ModelWriteBuilder meta(Map properties) { properties.forEach(this::meta); return this; } @@ -102,25 +102,25 @@ default WriteBuilder meta(Map properties) { * Based on the target file content the generated {@link FileAppender} needs different * configuration. */ - WriteBuilder content(FileContent content); + ModelWriteBuilder content(FileContent content); /** Sets the metrics configuration used for collecting column metrics for the created file. */ - WriteBuilder metricsConfig(MetricsConfig metricsConfig); + ModelWriteBuilder metricsConfig(MetricsConfig metricsConfig); /** Overwrite the file if it already exists. By default, overwrite is disabled. */ - WriteBuilder overwrite(); + ModelWriteBuilder overwrite(); /** * Sets the encryption key used for writing the file. If the writer does not support encryption, * then an exception should be thrown. */ - WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey); + ModelWriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey); /** * Sets the additional authentication data (AAD) prefix used for writing the file. If the writer * does not support encryption, then an exception should be thrown. */ - WriteBuilder withAADPrefix(ByteBuffer aadPrefix); + ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix); /** Finalizes the configuration and builds the {@link FileAppender}. 
*/ FileAppender build() throws IOException; diff --git a/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java b/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java index 24e168d3131b..fe7a4d96f612 100644 --- a/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java +++ b/core/src/test/java/org/apache/iceberg/formats/TestFormatModelRegistry.java @@ -113,7 +113,7 @@ public Class schemaType() { } @Override - public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { return null; } diff --git a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java index 9b1918f57cc6..3d0e2e8fb030 100644 --- a/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java +++ b/data/src/main/java/org/apache/iceberg/data/RegistryBasedFileWriterFactory.java @@ -136,14 +136,14 @@ public EqualityDeleteWriter newEqualityDeleteWriter( try { FileWriterBuilder, S> builder = - FormatModelRegistry.equalityDeleteWriteBuilder( - deleteFileFormat, inputType, file, equalityFieldIds); + FormatModelRegistry.equalityDeleteWriteBuilder(deleteFileFormat, inputType, file); return builder .setAll(properties) .setAll(writerProperties) .metricsConfig(metricsConfig) .schema(equalityDeleteRowSchema) .engineSchema(equalityDeleteInputSchema()) + .equalityFieldIds(equalityFieldIds) .spec(spec) .partition(partition) .keyMetadata(keyMetadata) diff --git a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java index 50e977e6f19d..cce9538ca240 100644 --- a/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java +++ b/orc/src/main/java/org/apache/iceberg/orc/ORCFormatModel.java @@ -31,8 +31,8 @@ import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; +import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; -import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.InputFile; import org.apache.iceberg.mapping.NameMapping; @@ -79,7 +79,7 @@ public FileFormat format() { } @Override - public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { return new WriteBuilderWrapper<>(outputFile, writerFunction()); } @@ -88,7 +88,7 @@ public ReadBuilder readBuilder(InputFile inputFile) { return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader); } - private static class WriteBuilderWrapper implements WriteBuilder { + private static class WriteBuilderWrapper implements ModelWriteBuilder { private final ORC.WriteBuilder internal; private final WriterFunction, S, TypeDescription> writerFunction; private S inputSchema; @@ -103,61 +103,61 @@ private WriteBuilderWrapper( } @Override - public WriteBuilder schema(Schema schema) { + public ModelWriteBuilder schema(Schema schema) { internal.schema(schema); return this; } @Override - public WriteBuilder engineSchema(S schema) { + public ModelWriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @Override - public WriteBuilder set(String property, String value) { + public ModelWriteBuilder set(String property, String value) { internal.set(property, 
value); return this; } @Override - public WriteBuilder setAll(Map properties) { + public ModelWriteBuilder setAll(Map properties) { internal.setAll(properties); return this; } @Override - public WriteBuilder meta(String property, String value) { + public ModelWriteBuilder meta(String property, String value) { internal.metadata(property, value); return this; } @Override - public WriteBuilder content(FileContent newContent) { + public ModelWriteBuilder content(FileContent newContent) { this.content = newContent; return this; } @Override - public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + public ModelWriteBuilder metricsConfig(MetricsConfig metricsConfig) { internal.metricsConfig(metricsConfig); return this; } @Override - public WriteBuilder overwrite() { + public ModelWriteBuilder overwrite() { internal.overwrite(); return this; } @Override - public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + public ModelWriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { // ORC doesn't support file encryption throw new UnsupportedOperationException("ORC does not support file encryption keys"); } @Override - public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { // ORC doesn't support file encryption throw new UnsupportedOperationException("ORC does not support AAD prefix"); } diff --git a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java index 0f176d535050..e77ff42ae26b 100644 --- a/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java +++ b/parquet/src/main/java/org/apache/iceberg/parquet/ParquetFormatModel.java @@ -31,8 +31,8 @@ import org.apache.iceberg.encryption.EncryptedOutputFile; import org.apache.iceberg.expressions.Expression; import org.apache.iceberg.formats.BaseFormatModel; +import org.apache.iceberg.formats.ModelWriteBuilder; import org.apache.iceberg.formats.ReadBuilder; -import org.apache.iceberg.formats.WriteBuilder; import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.io.DeleteSchemaUtil; import org.apache.iceberg.io.FileAppender; @@ -82,7 +82,7 @@ public FileFormat format() { } @Override - public WriteBuilder writeBuilder(EncryptedOutputFile outputFile) { + public ModelWriteBuilder writeBuilder(EncryptedOutputFile outputFile) { return new WriteBuilderWrapper<>(outputFile, writerFunction()); } @@ -91,7 +91,7 @@ public ReadBuilder readBuilder(InputFile inputFile) { return new ReadBuilderWrapper<>(inputFile, readerFunction(), batchReader); } - private static class WriteBuilderWrapper implements WriteBuilder { + private static class WriteBuilderWrapper implements ModelWriteBuilder { private final Parquet.WriteBuilder internal; private final WriterFunction, S, MessageType> writerFunction; private S inputSchema; @@ -105,19 +105,19 @@ private WriteBuilderWrapper( } @Override - public WriteBuilder schema(Schema schema) { + public ModelWriteBuilder schema(Schema schema) { internal.schema(schema); return this; } @Override - public WriteBuilder engineSchema(S schema) { + public ModelWriteBuilder engineSchema(S schema) { this.inputSchema = schema; return this; } @Override - public WriteBuilder set(String property, String value) { + public ModelWriteBuilder set(String property, String value) { if (WRITER_VERSION_KEY.equals(property)) { internal.writerVersion(ParquetProperties.WriterVersion.valueOf(value)); } @@ -127,49 +127,49 @@ public WriteBuilder 
set(String property, String value) { } @Override - public WriteBuilder setAll(Map properties) { + public ModelWriteBuilder setAll(Map properties) { internal.setAll(properties); return this; } @Override - public WriteBuilder meta(String property, String value) { + public ModelWriteBuilder meta(String property, String value) { internal.meta(property, value); return this; } @Override - public WriteBuilder meta(Map properties) { + public ModelWriteBuilder meta(Map properties) { internal.meta(properties); return this; } @Override - public WriteBuilder content(FileContent newContent) { + public ModelWriteBuilder content(FileContent newContent) { this.content = newContent; return this; } @Override - public WriteBuilder metricsConfig(MetricsConfig metricsConfig) { + public ModelWriteBuilder metricsConfig(MetricsConfig metricsConfig) { internal.metricsConfig(metricsConfig); return this; } @Override - public WriteBuilder overwrite() { + public ModelWriteBuilder overwrite() { internal.overwrite(); return this; } @Override - public WriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { + public ModelWriteBuilder withFileEncryptionKey(ByteBuffer encryptionKey) { internal.withFileEncryptionKey(encryptionKey); return this; } @Override - public WriteBuilder withAADPrefix(ByteBuffer aadPrefix) { + public ModelWriteBuilder withAADPrefix(ByteBuffer aadPrefix) { internal.withAADPrefix(aadPrefix); return this; }
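
For context, a rough usage sketch of the registry-based write path after this change follows. It mirrors the call sites updated in this patch (RegistryBasedFileWriterFactory and RewriteTablePathSparkAction), assuming the generic data model (org.apache.iceberg.data.Record) and Parquet as the file format. The class name FormatModelRegistryUsageSketch and the table/file/partition/property variables are illustrative placeholders, not part of the patch, and generic signatures are abbreviated to what the changed call sites show.

import java.io.IOException;
import java.util.Map;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.MetricsConfig;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.Table;
import org.apache.iceberg.data.Record;
import org.apache.iceberg.deletes.EqualityDeleteWriter;
import org.apache.iceberg.deletes.PositionDeleteWriter;
import org.apache.iceberg.encryption.EncryptedOutputFile;
import org.apache.iceberg.formats.FormatModelRegistry;
import org.apache.iceberg.io.DataWriter;

class FormatModelRegistryUsageSketch {

  // Data file: the registry resolves the format model registered for Record and the
  // returned FileWriterBuilder produces a DataWriter once schema, spec and partition are set.
  DataWriter<Record> newDataWriter(
      Table table, EncryptedOutputFile file, StructLike partition, Map<String, String> properties)
      throws IOException {
    return FormatModelRegistry.dataWriteBuilder(FileFormat.PARQUET, Record.class, file)
        .setAll(properties)
        .metricsConfig(MetricsConfig.forTable(table))
        .schema(table.schema())
        .spec(table.spec())
        .partition(partition)
        .build();
  }

  // Equality delete file: after this patch the delete field ids are configured on the
  // builder via equalityFieldIds(...) instead of being passed to the registry method.
  EqualityDeleteWriter<Record> newEqualityDeleteWriter(
      Schema deleteRowSchema,
      PartitionSpec spec,
      StructLike partition,
      EncryptedOutputFile file,
      int... equalityFieldIds)
      throws IOException {
    return FormatModelRegistry.equalityDeleteWriteBuilder(FileFormat.PARQUET, Record.class, file)
        .schema(deleteRowSchema)
        .equalityFieldIds(equalityFieldIds)
        .spec(spec)
        .partition(partition)
        .build();
  }

  // Position delete file: no row schema or engine type is required, so the registry
  // resolves the PositionDelete model directly and only spec/partition are configured.
  PositionDeleteWriter<?> newPositionDeleteWriter(
      PartitionSpec spec, StructLike partition, EncryptedOutputFile file) throws IOException {
    return FormatModelRegistry.positionDeleteWriteBuilder(FileFormat.PARQUET, file)
        .spec(spec)
        .partition(partition)
        .build();
  }
}

The main behavioral shift shown above is that equality field ids move from the FormatModelRegistry.equalityDeleteWriteBuilder signature onto FileWriterBuilder.equalityFieldIds, matching the RegistryBasedFileWriterFactory change in this patch.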