From e541a7174c4ce187960f7db4f7c3755e5710170e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 10 Dec 2025 20:55:47 +0100 Subject: [PATCH 01/54] given a parquet file return data from a certain modification time --- .../xtable/parquet/ParquetDataManager.java | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java new file mode 100644 index 000000000..7cf950950 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.parquet; + +import java.io.IOException; +import java.time.Instant; + +import lombok.Builder; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.hadoop.example.GroupReadSupport; +import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; + +@Builder +public class ParquetDataManager { + private static final String TIME_FIELD = "modified_at"; + public Path outputFile; + private ParquetReader reader; + private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); + + private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) { + ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file); + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + try (ParquetWriter writer = + new ParquetWriter( + outputFile, + new GroupWriteSupport(), + parquetFileConfig.getCodec(), + (int) parquetFileConfig.getRowGroupSize(), + pageSize, + pageSize, // dictionaryPageSize + true, // enableDictionary + false, // enableValidation + ParquetWriter.DEFAULT_WRITER_VERSION, + conf)) { + Group currentGroup = null; + while ((currentGroup = (Group) reader.read()) != null) { + writer.write(currentGroup); + } + } catch (Exception e) { + + e.printStackTrace(); + } + return outputFile; + } + + public long convertTimeField(Type timeFieldType, int timeIndex, Group record) { + PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType(); + long timestampValue = record.getLong(timeIndex, 0); + + // Check the logical type to determine the unit: + LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation(); + // check the cases of Nanos and millis as well + if (logicalType != null + && logicalType.equals( + LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) { + // convert microseconds to milliseconds + timestampValue = timestampValue / 1000; + } + // continue case INT96 and BINARY (time as string) + return timestampValue; + } + + public ParquetReader readParquetDataGroupsFromTime( + String filePath, Configuration conf, Instant modifTime) { + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + MessageType schema = + metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema(); + Group record = null; + + // get the index of the modified_at field + int timeIndex = schema.getFieldIndex(TIME_FIELD); + Type timeFieldType = null; + long recordTime = 0L; + int isAfterModificationTime = 0; + while ((reader.read()) != null) { + // TODO check if TIME_FIELD needs to be extracted or can use default one + timeFieldType = schema.getType(TIME_FIELD); + recordTime = convertTimeField(timeFieldType, timeIndex, record); + isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime); + if (isAfterModificationTime >= 0) { + break; + } + } + // write the remaining reader using a writer + + } catch (IOException e) { + e.printStackTrace(); + } + + return reader; + } +} + +class ParquetFileConfig { + private final MessageType schema; + private final long rowGroupSize; + private final CompressionCodecName codec; + + public ParquetFileConfig(Configuration conf, Path file) { + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { + throw new IllegalStateException("Baseline Parquet file has no row groups."); + } + long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); + this.schema = schema; + this.rowGroupSize = rowGroupSize; + this.codec = codec; + } + + public MessageType getSchema() { + return schema; + } + + public long getRowGroupSize() { + return rowGroupSize; + } + + public CompressionCodecName getCodec() { + return codec; + } +} From 15e282a04a56d6d28cbfb6f3d3fb9b9afe040417 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 13 Dec 2025 18:20:00 +0100 Subject: [PATCH 02/54] create the path based on the partition then inject the file to append into the parquet table --- .../xtable/parquet/ParquetDataManager.java | 135 ++++++------------ .../xtable/parquet/ParquetFileConfig.java | 57 ++++++++ 2 files changed, 100 insertions(+), 92 deletions(-) create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 7cf950950..a80ab8a61 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -20,6 +20,9 @@ import java.io.IOException; import java.time.Instant; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.UUID; import lombok.Builder; @@ -30,22 +33,51 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.parquet.hadoop.metadata.CompressionCodecName; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Type; @Builder public class ParquetDataManager { - private static final String TIME_FIELD = "modified_at"; - public Path outputFile; - private ParquetReader reader; private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) { - ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file); + public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) { + ParquetReader reader = null; + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + } catch (IOException e) { + e.printStackTrace(); + } + return reader; + } + + private Path appendNewParquetFile( + Configuration conf, + String rootPath, + Path fileToAppend, + String partitionColumn, + Instant modifTime) { + Path finalFile = null; + // construct the file path to inject into the existing partitioned file + String partitionValue = + DateTimeFormatter.ISO_LOCAL_DATE.format( + modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); + String partitionDir = partitionColumn + partitionValue; + String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; + Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); + // return its reader for convenience of writing + ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); + // append/write it in the right partition + finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); + return finalFile; + } + + private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { + ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); + return parquetFileConfig; + } + + private Path writeNewParquetFile( + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) { + ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; try (ParquetWriter writer = new ParquetWriter( @@ -69,85 +101,4 @@ private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path } return outputFile; } - - public long convertTimeField(Type timeFieldType, int timeIndex, Group record) { - PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType(); - long timestampValue = record.getLong(timeIndex, 0); - - // Check the logical type to determine the unit: - LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation(); - // check the cases of Nanos and millis as well - if (logicalType != null - && logicalType.equals( - LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) { - // convert microseconds to milliseconds - timestampValue = timestampValue / 1000; - } - // continue case INT96 and BINARY (time as string) - return timestampValue; - } - - public ParquetReader readParquetDataGroupsFromTime( - String filePath, Configuration conf, Instant modifTime) { - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - MessageType schema = - metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema(); - Group record = null; - - // get the index of the modified_at field - int timeIndex = schema.getFieldIndex(TIME_FIELD); - Type timeFieldType = null; - long recordTime = 0L; - int isAfterModificationTime = 0; - while ((reader.read()) != null) { - // TODO check if TIME_FIELD needs to be extracted or can use default one - timeFieldType = schema.getType(TIME_FIELD); - recordTime = convertTimeField(timeFieldType, timeIndex, record); - isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime); - if (isAfterModificationTime >= 0) { - break; - } - } - // write the remaining reader using a writer - - } catch (IOException e) { - e.printStackTrace(); - } - - return reader; - } -} - -class ParquetFileConfig { - private final MessageType schema; - private final long rowGroupSize; - private final CompressionCodecName codec; - - public ParquetFileConfig(Configuration conf, Path file) { - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); - MessageType schema = metadata.getFileMetaData().getSchema(); - if (metadata.getBlocks().isEmpty()) { - throw new IllegalStateException("Baseline Parquet file has no row groups."); - } - long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); - CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); - this.schema = schema; - this.rowGroupSize = rowGroupSize; - this.codec = codec; - } - - public MessageType getSchema() { - return schema; - } - - public long getRowGroupSize() { - return rowGroupSize; - } - - public CompressionCodecName getCodec() { - return codec; - } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java new file mode 100644 index 000000000..6cdf1d4f3 --- /dev/null +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.parquet; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.CompressionCodecName; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; + +class ParquetFileConfig { + private final MessageType schema; + private final long rowGroupSize; + private final CompressionCodecName codec; + + public ParquetFileConfig(Configuration conf, Path file) { + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { + throw new IllegalStateException("Baseline Parquet file has no row groups."); + } + long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); + this.schema = schema; + this.rowGroupSize = rowGroupSize; + this.codec = codec; + } + + public MessageType getSchema() { + return schema; + } + + public long getRowGroupSize() { + return rowGroupSize; + } + + public CompressionCodecName getCodec() { + return codec; + } +} From 2ee71c938d8f1eed47d83a2f00cbd47bfabee448 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 14 Dec 2025 21:19:42 +0100 Subject: [PATCH 03/54] Handle case of path construction with file partitioned over many fields, interfacing with ConversionSource --- .../xtable/parquet/ParquetDataManager.java | 40 +++++++++++++++---- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a80ab8a61..9a38c775a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -22,7 +22,9 @@ import java.time.Instant; import java.time.ZoneId; import java.time.format.DateTimeFormatter; +import java.util.List; import java.util.UUID; +import java.util.stream.Collectors; import lombok.Builder; @@ -34,6 +36,11 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.xtable.model.schema.InternalPartitionField; +import org.apache.xtable.model.stat.PartitionValue; +import org.apache.xtable.model.stat.Range; +import org.apache.xtable.model.storage.InternalDataFile; + @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); @@ -49,18 +56,37 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // partition fields are already computed, given a parquet file InternalDataFile must be derived + // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( Configuration conf, String rootPath, - Path fileToAppend, - String partitionColumn, - Instant modifTime) { + InternalDataFile internalParquetFile, + List partitionFields) { Path finalFile = null; + String partitionDir = ""; + List partitionValues = internalParquetFile.getPartitionValues(); + Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified()); + Path fileToAppend = new Path(internalParquetFile.toString()); // construct the file path to inject into the existing partitioned file - String partitionValue = - DateTimeFormatter.ISO_LOCAL_DATE.format( - modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - String partitionDir = partitionColumn + partitionValue; + if (partitionValues == null || partitionValues.isEmpty()) { + String partitionValue = + DateTimeFormatter.ISO_LOCAL_DATE.format( + modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); + partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; + } else { + // handle multiple partitioning case (year and month etc.) + partitionDir = + partitionValues.stream() + .map( + pv -> { + Range epochValueObject = pv.getRange(); + // epochValueObject is always sure to be long + String valueStr = String.valueOf(epochValueObject.getMaxValue()); + return pv.getPartitionField() + "=" + valueStr; + }) + .collect(Collectors.joining("/")); + } String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing From 6032e5f62aa74347a3403ed544320d4fff918d07 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 14 Dec 2025 22:03:44 +0100 Subject: [PATCH 04/54] test append Parquet file into table init --- .../parquet/TestParquetDataManager.java | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java new file mode 100644 index 000000000..2365b5fef --- /dev/null +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.xtable.parquet; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +public class TestParquetDataManager { + @TempDir static java.nio.file.Path tempDir = Paths.get("./"); + + @Test + public void testAppendParquetFile() throws IOException { + SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + + List data = + Arrays.asList( + RowFactory.create(101, "A", 2025, 12), + RowFactory.create(102, "B", 2025, 12), + RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7)); + + Dataset df = spark.createDataFrame(data, schema); + java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test"); + File file = path.toFile(); + String outputPath = file.getPath(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // TODO create an InternalDataFile and Partition Fields for testing purposes + + // TODO test appendNewParquetFile() + + // TODO validate the final table + spark.stop(); + } +} From f6fdc72be71ea937de7a06e0d95aa6fa893879c2 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 15 Dec 2025 21:05:06 +0100 Subject: [PATCH 05/54] add function to test schema equivalence before appending --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 9a38c775a..f0e8a22cd 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -56,6 +56,12 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // check required before appending the file + private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); + ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); + return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( From a94c3f36c9587cfe62ff7330ef5537e03aff8751 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 16 Dec 2025 12:59:02 +0100 Subject: [PATCH 06/54] construct path to inject to based on partitions --- .../xtable/parquet/ParquetDataManager.java | 175 +++++++++++++++--- .../parquet/TestParquetDataManager.java | 9 +- 2 files changed, 151 insertions(+), 33 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index f0e8a22cd..a19d8f585 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,16 +19,18 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.Instant; -import java.time.ZoneId; +import java.time.*; import java.time.format.DateTimeFormatter; -import java.util.List; -import java.util.UUID; +import java.time.temporal.ChronoUnit; +import java.util.*; import java.util.stream.Collectors; +import java.util.stream.Stream; import lombok.Builder; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetReader; @@ -36,10 +38,7 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.xtable.model.schema.InternalPartitionField; -import org.apache.xtable.model.stat.PartitionValue; -import org.apache.xtable.model.stat.Range; -import org.apache.xtable.model.storage.InternalDataFile; +import org.apache.iceberg.PartitionField; @Builder public class ParquetDataManager { @@ -56,48 +55,170 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf return reader; } + // TODO check if footer of added file can cause problems (catalog) when reading the full table + // after appending // check required before appending the file private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } + + Instant parsePartitionDirToStartTime(String partitionDir, List partitionFields) { + Map partitionValues = new HashMap<>(); + String[] parts = partitionDir.split("/"); + for (String part : parts) { + String[] keyValue = part.split("="); + if (keyValue.length == 2) { + partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]); + } + } + int year = 1970; + int month = 1; + int day = 1; + int hour = 0; + int minute = 0; + int second = 0; + + for (PartitionField field : partitionFields) { + String fieldName = field.name().toLowerCase(); + String value = partitionValues.get(fieldName); + + if (value != null) { + try { + int intValue = Integer.parseInt(value); + + switch (fieldName) { + case "year": + year = intValue; + break; + case "month": + month = intValue; + break; + case "day": + day = intValue; + break; + case "hour": + hour = intValue; + break; + case "minute": + minute = intValue; + break; + case "second": + second = intValue; + break; + default: + break; + } + } catch (NumberFormatException e) { + System.err.println( + "Warning: Invalid number format for partition field '" + fieldName + "': " + value); + } + } + } + + try { + LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second); + return localDateTime.toInstant(ZoneOffset.UTC); + } catch (java.time.DateTimeException e) { + throw new IllegalArgumentException( + "Invalid partition directory date components: " + partitionDir, e); + } + } + + private ChronoUnit getGranularityUnit(List partitionFields) { + if (partitionFields == null || partitionFields.isEmpty()) { + return ChronoUnit.DAYS; + } + // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') + String lastFieldName = partitionFields.get(partitionFields.size() - 1).name(); + + if (lastFieldName.equalsIgnoreCase("year")) { + return ChronoUnit.YEARS; + } else if (lastFieldName.equalsIgnoreCase("month")) { + return ChronoUnit.MONTHS; + } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) { + return ChronoUnit.DAYS; + } else if (lastFieldName.equalsIgnoreCase("hour")) { + return ChronoUnit.HOURS; + } else if (lastFieldName.equalsIgnoreCase("minute")) { + return ChronoUnit.MINUTES; + } else { + return ChronoUnit.DAYS; + } + } + + private String findTargetPartitionFolder( + Instant modifTime, + List partitionFields, + // could be retrieved from getParquetFiles() of ParquetConversionSource + Stream parquetFiles) + throws IOException { + + long modifTimeMillis = modifTime.toEpochMilli(); + final ZoneId ZONE = ZoneId.systemDefault(); + ChronoUnit partitionUnit = getGranularityUnit(partitionFields); + List allFolders = + parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); + + // sort the folders by their start time + allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields))); + + for (int i = 0; i < allFolders.size(); i++) { + String currentFolder = allFolders.get(i); + + // start time of the current folder + Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields); + long currentStartMillis = currentStartTime.toEpochMilli(); + + // determine the end time (which is the start time of the next logical partition) + long nextStartMillis; + if (i + 1 < allFolders.size()) { + nextStartMillis = + parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli(); + } else { + ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); + ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); + nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); + } + if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { + return currentFolder; + } + } + return ""; + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( Configuration conf, String rootPath, - InternalDataFile internalParquetFile, - List partitionFields) { + FileStatus parquetFile, + Stream parquetFiles, + List partitionFields) { Path finalFile = null; String partitionDir = ""; - List partitionValues = internalParquetFile.getPartitionValues(); - Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified()); - Path fileToAppend = new Path(internalParquetFile.toString()); + Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); + Path fileToAppend = parquetFile.getPath(); // construct the file path to inject into the existing partitioned file - if (partitionValues == null || partitionValues.isEmpty()) { + if (partitionFields == null || partitionFields.isEmpty()) { String partitionValue = DateTimeFormatter.ISO_LOCAL_DATE.format( modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; + partitionDir = partitionFields.get(0).name() + partitionValue; } else { - // handle multiple partitioning case (year and month etc.) - partitionDir = - partitionValues.stream() - .map( - pv -> { - Range epochValueObject = pv.getRange(); - // epochValueObject is always sure to be long - String valueStr = String.valueOf(epochValueObject.getMaxValue()); - return pv.getPartitionField() + "=" + valueStr; - }) - .collect(Collectors.joining("/")); + // handle multiple partitioning case (year and month etc.), find the target partition dir + try { + partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles); + } catch (IOException e) { + e.printStackTrace(); + } } + // construct the path String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); - // append/write it in the right partition + // then inject/append/write it in the right partition finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); return finalFile; } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java index 2365b5fef..9db52a70d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java @@ -18,8 +18,8 @@ package org.apache.xtable.parquet; -import java.io.File; import java.io.IOException; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.List; @@ -32,10 +32,8 @@ import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; public class TestParquetDataManager { - @TempDir static java.nio.file.Path tempDir = Paths.get("./"); @Test public void testAppendParquetFile() throws IOException { @@ -59,9 +57,8 @@ public void testAppendParquetFile() throws IOException { RowFactory.create(301, "D", 2024, 7)); Dataset df = spark.createDataFrame(data, schema); - java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test"); - File file = path.toFile(); - String outputPath = file.getPath(); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); + String outputPath = fixedPath.toFile().getName(); df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); From f8bdbfe2ff26cb816e8d57f566f3de5aaf6b6831 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 17 Dec 2025 00:25:01 +0100 Subject: [PATCH 07/54] fix imports --- .../xtable/parquet/ParquetDataManager.java | 32 ++++++++++++------- ...Manager.java => ITParquetDataManager.java} | 2 +- 2 files changed, 22 insertions(+), 12 deletions(-) rename xtable-core/src/test/java/org/apache/xtable/parquet/{TestParquetDataManager.java => ITParquetDataManager.java} (98%) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a19d8f585..23c590642 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,10 +19,18 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.*; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZoneOffset; +import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; -import java.util.*; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -38,7 +46,7 @@ import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; -import org.apache.iceberg.PartitionField; +import org.apache.xtable.model.schema.InternalPartitionField; @Builder public class ParquetDataManager { @@ -64,7 +72,8 @@ private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fi return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } - Instant parsePartitionDirToStartTime(String partitionDir, List partitionFields) { + Instant parsePartitionDirToStartTime( + String partitionDir, List partitionFields) { Map partitionValues = new HashMap<>(); String[] parts = partitionDir.split("/"); for (String part : parts) { @@ -80,8 +89,8 @@ Instant parsePartitionDirToStartTime(String partitionDir, List p int minute = 0; int second = 0; - for (PartitionField field : partitionFields) { - String fieldName = field.name().toLowerCase(); + for (InternalPartitionField field : partitionFields) { + String fieldName = field.getSourceField().getName().toLowerCase(); String value = partitionValues.get(fieldName); if (value != null) { @@ -126,12 +135,13 @@ Instant parsePartitionDirToStartTime(String partitionDir, List p } } - private ChronoUnit getGranularityUnit(List partitionFields) { + private ChronoUnit getGranularityUnit(List partitionFields) { if (partitionFields == null || partitionFields.isEmpty()) { return ChronoUnit.DAYS; } // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') - String lastFieldName = partitionFields.get(partitionFields.size() - 1).name(); + String lastFieldName = + partitionFields.get(partitionFields.size() - 1).getSourceField().getName(); if (lastFieldName.equalsIgnoreCase("year")) { return ChronoUnit.YEARS; @@ -150,7 +160,7 @@ private ChronoUnit getGranularityUnit(List partitionFields) { private String findTargetPartitionFolder( Instant modifTime, - List partitionFields, + List partitionFields, // could be retrieved from getParquetFiles() of ParquetConversionSource Stream parquetFiles) throws IOException { @@ -194,7 +204,7 @@ private Path appendNewParquetFile( String rootPath, FileStatus parquetFile, Stream parquetFiles, - List partitionFields) { + List partitionFields) { Path finalFile = null; String partitionDir = ""; Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); @@ -204,7 +214,7 @@ private Path appendNewParquetFile( String partitionValue = DateTimeFormatter.ISO_LOCAL_DATE.format( modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = partitionFields.get(0).name() + partitionValue; + partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; } else { // handle multiple partitioning case (year and month etc.), find the target partition dir try { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java similarity index 98% rename from xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java rename to xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 9db52a70d..30d044578 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -33,7 +33,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.Test; -public class TestParquetDataManager { +public class ITParquetDataManager { @Test public void testAppendParquetFile() throws IOException { From c04a983733bc67614c224caca696333ba96867eb Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Wed, 17 Dec 2025 20:57:24 +0100 Subject: [PATCH 08/54] refactoring (lombok, logs, javadocs and function and approach commenting) --- .../xtable/parquet/ParquetDataManager.java | 32 +++++++++++++---- .../xtable/parquet/ParquetFileConfig.java | 35 ++++++++----------- 2 files changed, 39 insertions(+), 28 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 23c590642..ca0496684 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -35,6 +35,7 @@ import java.util.stream.Stream; import lombok.Builder; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -48,17 +49,27 @@ import org.apache.xtable.model.schema.InternalPartitionField; +/** + * Manages Parquet file operations including reading, writing, and partition discovery and path + * construction. + * + *

This class provides functions to handle Parquet metadata, validate schemas during appends, and + * calculate target partition directories based on file modification times and defined partition + * fields. + */ +@Log4j2 @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) { + public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { ParquetReader reader = null; Path file = new Path(filePath); try { reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); } catch (IOException e) { - e.printStackTrace(); + log.error("Unexpected error during Parquet read: {}", filePath, e); + throw new IOException("Unexpected error reading Parquet file", e); } return reader; } @@ -158,6 +169,9 @@ private ChronoUnit getGranularityUnit(List partitionFiel } } + // find the target partition by comparing the modficationTime of the file to add with the current + // tables + // if the modifTime falls between two partitions then we know it belongs to the first private String findTargetPartitionFolder( Instant modifTime, List partitionFields, @@ -167,6 +181,7 @@ private String findTargetPartitionFolder( long modifTimeMillis = modifTime.toEpochMilli(); final ZoneId ZONE = ZoneId.systemDefault(); + // find the partition granularity (days, hours...) ChronoUnit partitionUnit = getGranularityUnit(partitionFields); List allFolders = parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); @@ -189,6 +204,7 @@ private String findTargetPartitionFolder( } else { ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); + // to evaluate the partition date value and make comparable with the modifTime in Instant nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); } if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { @@ -204,7 +220,8 @@ private Path appendNewParquetFile( String rootPath, FileStatus parquetFile, Stream parquetFiles, - List partitionFields) { + List partitionFields) + throws IOException { Path finalFile = null; String partitionDir = ""; Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); @@ -227,7 +244,7 @@ private Path appendNewParquetFile( String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // return its reader for convenience of writing - ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf); + ParquetReader reader = getParquetReader(fileToAppend.getName(), conf); // then inject/append/write it in the right partition finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); return finalFile; @@ -239,7 +256,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp } private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) { + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) + throws IOException { ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; try (ParquetWriter writer = @@ -259,8 +277,8 @@ private Path writeNewParquetFile( writer.write(currentGroup); } } catch (Exception e) { - - e.printStackTrace(); + log.error("Unexpected error during Parquet write: {}", outputFile, e); + throw new IOException("Failed to complete Parquet write operation", e); } return outputFile; } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 6cdf1d4f3..e4b645f5d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,40 +18,33 @@ package org.apache.xtable.parquet; +import lombok.AccessLevel; +import lombok.Getter; +import lombok.experimental.FieldDefaults; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; +@Getter +@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) class ParquetFileConfig { - private final MessageType schema; - private final long rowGroupSize; - private final CompressionCodecName codec; + MessageType schema; + long rowGroupSize; + CompressionCodecName codec; public ParquetFileConfig(Configuration conf, Path file) { ParquetMetadata metadata = ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); - MessageType schema = metadata.getFileMetaData().getSchema(); + if (metadata.getBlocks().isEmpty()) { - throw new IllegalStateException("Baseline Parquet file has no row groups."); + throw new IllegalStateException("Parquet file contains no row groups."); } - long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); - CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); - this.schema = schema; - this.rowGroupSize = rowGroupSize; - this.codec = codec; - } - - public MessageType getSchema() { - return schema; - } - - public long getRowGroupSize() { - return rowGroupSize; - } - public CompressionCodecName getCodec() { - return codec; + this.schema = metadata.getFileMetaData().getSchema(); + this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } From 5f2541e9c81d91b8b2a1cc3fe986457a7e3bb96c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 18:03:56 +0100 Subject: [PATCH 09/54] use appendFile to append a file into a table while tracking the appends using the FileStatus' modifTime attribute --- .../xtable/parquet/ParquetDataManager.java | 139 ++++++++++++------ 1 file changed, 93 insertions(+), 46 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index ca0496684..19777be81 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -39,13 +39,20 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.example.data.Group; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.util.HadoopInputFile; +import org.apache.parquet.hadoop.util.HadoopOutputFile; +import org.apache.parquet.io.InputFile; +import org.apache.parquet.schema.MessageType; import org.apache.xtable.model.schema.InternalPartitionField; @@ -61,27 +68,8 @@ @Builder public class ParquetDataManager { private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - - public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { - ParquetReader reader = null; - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - } catch (IOException e) { - log.error("Unexpected error during Parquet read: {}", filePath, e); - throw new IOException("Unexpected error reading Parquet file", e); - } - return reader; - } - - // TODO check if footer of added file can cause problems (catalog) when reading the full table - // after appending - // check required before appending the file - private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { - ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); - ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); - return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); - } + private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB + private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB Instant parsePartitionDirToStartTime( String partitionDir, List partitionFields) { @@ -213,6 +201,34 @@ private String findTargetPartitionFolder( } return ""; } + + private Path writeNewParquetFile( + Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) + throws IOException { + ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); + int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; + try (ParquetWriter writer = + new ParquetWriter( + outputFile, + new GroupWriteSupport(), + parquetFileConfig.getCodec(), + (int) parquetFileConfig.getRowGroupSize(), + pageSize, + pageSize, // dictionaryPageSize + true, // enableDictionary + false, // enableValidation + ParquetWriter.DEFAULT_WRITER_VERSION, + conf)) { + Group currentGroup = null; + while ((currentGroup = (Group) reader.read()) != null) { + writer.write(currentGroup); + } + } catch (Exception e) { + log.error("Unexpected error during Parquet write: {}", outputFile, e); + throw new IOException("Failed to complete Parquet write operation", e); + } + return outputFile; + } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) private Path appendNewParquetFile( @@ -250,36 +266,67 @@ private Path appendNewParquetFile( return finalFile; } + public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { + ParquetReader reader = null; + Path file = new Path(filePath); + try { + reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); + } catch (IOException e) { + log.error("Unexpected error during Parquet read: {}", filePath, e); + throw new IOException("Unexpected error reading Parquet file", e); + } + return reader; + } + + /* Alternative Approach (without path construction) using Parquet API to append a file */ + + // after appending check required before appending the file + private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); + ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); + return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); + } + private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) - throws IOException { - ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); - int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; - try (ParquetWriter writer = - new ParquetWriter( - outputFile, - new GroupWriteSupport(), - parquetFileConfig.getCodec(), - (int) parquetFileConfig.getRowGroupSize(), - pageSize, - pageSize, // dictionaryPageSize - true, // enableDictionary - false, // enableValidation - ParquetWriter.DEFAULT_WRITER_VERSION, - conf)) { - Group currentGroup = null; - while ((currentGroup = (Group) reader.read()) != null) { - writer.write(currentGroup); - } - } catch (Exception e) { - log.error("Unexpected error during Parquet write: {}", outputFile, e); - throw new IOException("Failed to complete Parquet write operation", e); + // Method2 to append a file into a table + public void mergeParquetFiles( + Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { + Configuration conf = new Configuration(); + ParquetFileWriter writer = + new ParquetFileWriter( + HadoopOutputFile.fromPath(outputPath, conf), + schema, + ParquetFileWriter.Mode.CREATE, + DEFAULT_BLOCK_SIZE, + 0); + // write the initial table with the appended file to add into the outputPath + writer.start(); + HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); + if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { + writer.appendFile(inputFile); + writer.appendFile(inputFileToAppend); } - return outputFile; + Map combinedMeta = new HashMap<>(); + // track the append date and save it in the footer + int appendCount = + Integer.parseInt( + ParquetFileReader.readFooter(conf, filePath) + .getFileMetaData() + .getKeyValueMetaData() + .getOrDefault("total_appends", "0")); + String newKey = "append_date_" + (appendCount + 1); + // get the equivalent fileStatus from the file-to-append Path + FileSystem fs = FileSystem.get(conf); + FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save its modification time (for later sync related retrieval) in the metadata of the output + // table + combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); + combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + writer.end(combinedMeta); } } From 47e707688d3ced9ee34c477c988224dc5c663ff4 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 19:01:06 +0100 Subject: [PATCH 10/54] find the files that satisfy to the time condition --- .../xtable/parquet/ParquetDataManager.java | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 19777be81..777b829be 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -26,11 +26,7 @@ import java.time.ZonedDateTime; import java.time.format.DateTimeFormatter; import java.time.temporal.ChronoUnit; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -47,8 +43,8 @@ import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetReader; import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.example.GroupReadSupport; import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.apache.parquet.io.InputFile; @@ -231,7 +227,7 @@ private Path writeNewParquetFile( } // partition fields are already computed, given a parquet file InternalDataFile must be derived // (e.g., using createInternalDataFileFromParquetFile()) - private Path appendNewParquetFile( + private void appendNewParquetFile( Configuration conf, String rootPath, FileStatus parquetFile, @@ -259,26 +255,12 @@ private Path appendNewParquetFile( // construct the path String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); - // return its reader for convenience of writing - ParquetReader reader = getParquetReader(fileToAppend.getName(), conf); // then inject/append/write it in the right partition - finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile); - return finalFile; + MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); + AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); } - public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException { - ParquetReader reader = null; - Path file = new Path(filePath); - try { - reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build(); - } catch (IOException e) { - log.error("Unexpected error during Parquet read: {}", filePath, e); - throw new IOException("Unexpected error reading Parquet file", e); - } - return reader; - } - - /* Alternative Approach (without path construction) using Parquet API to append a file */ + /* Use Parquet API to append to a file */ // after appending check required before appending the file private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { @@ -292,8 +274,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - // Method2 to append a file into a table - public void mergeParquetFiles( + // append a file into a table + public void AppendNewParquetFiles( Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); ParquetFileWriter writer = @@ -329,4 +311,35 @@ public void mergeParquetFiles( combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); writer.end(combinedMeta); } + // Find and retrieve the file paths that satisfy the time condition + // Each partition folder contains one merged_file in turn containing many appends + public List findFilesAfterModifTime( + Configuration conf, Path directoryPath, long targetModifTime) throws IOException { + List results = new ArrayList<>(); + FileSystem fs = directoryPath.getFileSystem(conf); + + FileStatus[] statuses = + fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + + for (FileStatus status : statuses) { + Path filePath = status.getPath(); + + try { + ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath); + Map meta = footer.getFileMetaData().getKeyValueMetaData(); + + int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0")); + if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) { + results.add(filePath); + break; + } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { + continue; + } + } catch (Exception e) { + log.error("Could not read metadata for: {}", filePath, e); + throw new IOException("Could not read metadata for", e); + } + } + return results; + } } From fbb09ecbc284c7c1eaa96a22934b91134f0e9750 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Thu, 1 Jan 2026 19:16:18 +0100 Subject: [PATCH 11/54] treat appends as separate files to add in the target partition folder --- .../xtable/parquet/ParquetDataManager.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 777b829be..cc1f473d0 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -34,10 +34,7 @@ import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.*; import org.apache.parquet.example.data.Group; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetFileWriter; @@ -257,7 +254,15 @@ private void appendNewParquetFile( Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); // then inject/append/write it in the right partition MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); - AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); + appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); + + // OR + /*FileSystem fs = outputFile.getFileSystem(conf); + if (!fs.exists(outputFile.getParent())) { + fs.mkdirs(outputFile.getParent()); + } + // copy a seperate file into the target partition folder + FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/ } /* Use Parquet API to append to a file */ @@ -275,7 +280,7 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp } // append a file into a table - public void AppendNewParquetFiles( + public void appendNewParquetFiles( Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); ParquetFileWriter writer = @@ -335,11 +340,17 @@ public List findFilesAfterModifTime( } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { continue; } + // OR + /*if (status.getPath().getName().endsWith(".parquet") && + status.getModificationTime() > targetModifTime) { + results.add(status.getPath()); + }*/ } catch (Exception e) { log.error("Could not read metadata for: {}", filePath, e); throw new IOException("Could not read metadata for", e); } } + return results; } } From fe19a6055c92079a938ff2feb83d105e1b6c720e Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 20:10:35 +0100 Subject: [PATCH 12/54] update approach: selective block compaction --- .../xtable/parquet/ParquetDataManager.java | 295 +++++------------- .../xtable/parquet/ParquetFileConfig.java | 2 +- 2 files changed, 73 insertions(+), 224 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index cc1f473d0..563c6ca3d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -19,36 +19,23 @@ package org.apache.xtable.parquet; import java.io.IOException; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; -import java.time.temporal.ChronoUnit; import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; import lombok.Builder; import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; -import org.apache.parquet.example.data.Group; +import org.apache.parquet.format.converter.ParquetMetadataConverter; import org.apache.parquet.hadoop.ParquetFileReader; import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetReader; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.hadoop.example.GroupWriteSupport; +import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; import org.apache.parquet.hadoop.util.HadoopOutputFile; import org.apache.parquet.io.InputFile; import org.apache.parquet.schema.MessageType; -import org.apache.xtable.model.schema.InternalPartitionField; - /** * Manages Parquet file operations including reading, writing, and partition discovery and path * construction. @@ -64,207 +51,6 @@ public class ParquetDataManager { private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB - Instant parsePartitionDirToStartTime( - String partitionDir, List partitionFields) { - Map partitionValues = new HashMap<>(); - String[] parts = partitionDir.split("/"); - for (String part : parts) { - String[] keyValue = part.split("="); - if (keyValue.length == 2) { - partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]); - } - } - int year = 1970; - int month = 1; - int day = 1; - int hour = 0; - int minute = 0; - int second = 0; - - for (InternalPartitionField field : partitionFields) { - String fieldName = field.getSourceField().getName().toLowerCase(); - String value = partitionValues.get(fieldName); - - if (value != null) { - try { - int intValue = Integer.parseInt(value); - - switch (fieldName) { - case "year": - year = intValue; - break; - case "month": - month = intValue; - break; - case "day": - day = intValue; - break; - case "hour": - hour = intValue; - break; - case "minute": - minute = intValue; - break; - case "second": - second = intValue; - break; - default: - break; - } - } catch (NumberFormatException e) { - System.err.println( - "Warning: Invalid number format for partition field '" + fieldName + "': " + value); - } - } - } - - try { - LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second); - return localDateTime.toInstant(ZoneOffset.UTC); - } catch (java.time.DateTimeException e) { - throw new IllegalArgumentException( - "Invalid partition directory date components: " + partitionDir, e); - } - } - - private ChronoUnit getGranularityUnit(List partitionFields) { - if (partitionFields == null || partitionFields.isEmpty()) { - return ChronoUnit.DAYS; - } - // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month') - String lastFieldName = - partitionFields.get(partitionFields.size() - 1).getSourceField().getName(); - - if (lastFieldName.equalsIgnoreCase("year")) { - return ChronoUnit.YEARS; - } else if (lastFieldName.equalsIgnoreCase("month")) { - return ChronoUnit.MONTHS; - } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) { - return ChronoUnit.DAYS; - } else if (lastFieldName.equalsIgnoreCase("hour")) { - return ChronoUnit.HOURS; - } else if (lastFieldName.equalsIgnoreCase("minute")) { - return ChronoUnit.MINUTES; - } else { - return ChronoUnit.DAYS; - } - } - - // find the target partition by comparing the modficationTime of the file to add with the current - // tables - // if the modifTime falls between two partitions then we know it belongs to the first - private String findTargetPartitionFolder( - Instant modifTime, - List partitionFields, - // could be retrieved from getParquetFiles() of ParquetConversionSource - Stream parquetFiles) - throws IOException { - - long modifTimeMillis = modifTime.toEpochMilli(); - final ZoneId ZONE = ZoneId.systemDefault(); - // find the partition granularity (days, hours...) - ChronoUnit partitionUnit = getGranularityUnit(partitionFields); - List allFolders = - parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList()); - - // sort the folders by their start time - allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields))); - - for (int i = 0; i < allFolders.size(); i++) { - String currentFolder = allFolders.get(i); - - // start time of the current folder - Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields); - long currentStartMillis = currentStartTime.toEpochMilli(); - - // determine the end time (which is the start time of the next logical partition) - long nextStartMillis; - if (i + 1 < allFolders.size()) { - nextStartMillis = - parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli(); - } else { - ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE); - ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit); - // to evaluate the partition date value and make comparable with the modifTime in Instant - nextStartMillis = nextStartZDT.toInstant().toEpochMilli(); - } - if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) { - return currentFolder; - } - } - return ""; - } - - private Path writeNewParquetFile( - Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) - throws IOException { - ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend); - int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE; - try (ParquetWriter writer = - new ParquetWriter( - outputFile, - new GroupWriteSupport(), - parquetFileConfig.getCodec(), - (int) parquetFileConfig.getRowGroupSize(), - pageSize, - pageSize, // dictionaryPageSize - true, // enableDictionary - false, // enableValidation - ParquetWriter.DEFAULT_WRITER_VERSION, - conf)) { - Group currentGroup = null; - while ((currentGroup = (Group) reader.read()) != null) { - writer.write(currentGroup); - } - } catch (Exception e) { - log.error("Unexpected error during Parquet write: {}", outputFile, e); - throw new IOException("Failed to complete Parquet write operation", e); - } - return outputFile; - } - // partition fields are already computed, given a parquet file InternalDataFile must be derived - // (e.g., using createInternalDataFileFromParquetFile()) - private void appendNewParquetFile( - Configuration conf, - String rootPath, - FileStatus parquetFile, - Stream parquetFiles, - List partitionFields) - throws IOException { - Path finalFile = null; - String partitionDir = ""; - Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime()); - Path fileToAppend = parquetFile.getPath(); - // construct the file path to inject into the existing partitioned file - if (partitionFields == null || partitionFields.isEmpty()) { - String partitionValue = - DateTimeFormatter.ISO_LOCAL_DATE.format( - modifTime.atZone(ZoneId.systemDefault()).toLocalDate()); - partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue; - } else { - // handle multiple partitioning case (year and month etc.), find the target partition dir - try { - partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles); - } catch (IOException e) { - e.printStackTrace(); - } - } - // construct the path - String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet"; - Path outputFile = new Path(new Path(rootPath, partitionDir), fileName); - // then inject/append/write it in the right partition - MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema(); - appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema); - - // OR - /*FileSystem fs = outputFile.getFileSystem(conf); - if (!fs.exists(outputFile.getParent())) { - fs.mkdirs(outputFile.getParent()); - } - // copy a seperate file into the target partition folder - FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/ - } - /* Use Parquet API to append to a file */ // after appending check required before appending the file @@ -279,23 +65,25 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - // append a file into a table - public void appendNewParquetFiles( - Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException { + // append a file into a table (merges two files into one .parquet under a partition folder) + public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) + throws IOException { Configuration conf = new Configuration(); + long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize(); ParquetFileWriter writer = new ParquetFileWriter( - HadoopOutputFile.fromPath(outputPath, conf), + HadoopOutputFile.fromPath(filePath, conf), + // HadoopOutputFile.fromPath(outputPath, conf), schema, ParquetFileWriter.Mode.CREATE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath writer.start(); - HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - writer.appendFile(inputFile); + // writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); @@ -310,12 +98,73 @@ public void appendNewParquetFiles( // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); - // save its modification time (for later sync related retrieval) in the metadata of the output + // save block indexes and modification time (for later sync related retrieval) in the metadata + // of the output // table combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + combinedMeta.put( + "index_start_block_of_append_" + String.valueOf(appendCount + 1), + String.valueOf(firstBlockIndex)); writer.end(combinedMeta); } + // selective compaction of parquet blocks + public List formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime) + throws IOException { + List finalPaths = new ArrayList<>(); + FileSystem fs = directoryPath.getFileSystem(conf); + + FileStatus[] statuses = + fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + + for (FileStatus status : statuses) { + + Path filePath = status.getPath(); + ParquetMetadata bigFileFooter = + ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER); + MessageType schema = bigFileFooter.getFileMetaData().getSchema(); + int totalAppends = + Integer.parseInt( + bigFileFooter + .getFileMetaData() + .getKeyValueMetaData() + .getOrDefault("total_appends", "0")); + for (int i = 1; i < totalAppends; i++) { + int startBlock = + Integer.valueOf( + bigFileFooter + .getFileMetaData() + .getKeyValueMetaData() + .get("index_start_block_of_append_" + totalAppends)); + int endBlock = + Integer.valueOf( + bigFileFooter + .getFileMetaData() + .getKeyValueMetaData() + .get("index_end_block_of_append_" + (totalAppends + 1))); + Path targetSyncFilePath = + new Path( + status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); + try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { + writer.start(); + for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { + BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); + if (j >= startBlock + && j <= endBlock + && status.getModificationTime() >= targetModifTime) { + List blockList = + Collections.singletonList(blockMetadata); + FSDataInputStream targetInputStream = fs.open(status.getPath()); + writer.appendRowGroups(targetInputStream, blockList, false); + } + } + writer.end(new HashMap<>()); + finalPaths.add(targetSyncFilePath); + } + } + } + return finalPaths; + } // Find and retrieve the file paths that satisfy the time condition // Each partition folder contains one merged_file in turn containing many appends public List findFilesAfterModifTime( diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index e4b645f5d..64c9e9cfc 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) { } this.schema = metadata.getFileMetaData().getSchema(); - this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize(); + this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize() this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } From da7f300b18172b21f968ba4583d2fbe98dc0488c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 20:50:01 +0100 Subject: [PATCH 13/54] update approach: added a basic test to check data selection using modificationTime selector --- .../xtable/parquet/ParquetDataManager.java | 41 +------------ .../xtable/parquet/ITParquetDataManager.java | 58 +++++++++++++++++-- 2 files changed, 56 insertions(+), 43 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 563c6ca3d..75f3c4f80 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -109,8 +109,8 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType writer.end(combinedMeta); } // selective compaction of parquet blocks - public List formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime) - throws IOException { + public static List formNewTargetFiles( + Configuration conf, Path directoryPath, long targetModifTime) throws IOException { List finalPaths = new ArrayList<>(); FileSystem fs = directoryPath.getFileSystem(conf); @@ -165,41 +165,4 @@ public List formNewTargetFiles(Configuration conf, Path directoryPath, lon } return finalPaths; } - // Find and retrieve the file paths that satisfy the time condition - // Each partition folder contains one merged_file in turn containing many appends - public List findFilesAfterModifTime( - Configuration conf, Path directoryPath, long targetModifTime) throws IOException { - List results = new ArrayList<>(); - FileSystem fs = directoryPath.getFileSystem(conf); - - FileStatus[] statuses = - fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); - - for (FileStatus status : statuses) { - Path filePath = status.getPath(); - - try { - ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath); - Map meta = footer.getFileMetaData().getKeyValueMetaData(); - - int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0")); - if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) { - results.add(filePath); - break; - } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) { - continue; - } - // OR - /*if (status.getPath().getName().endsWith(".parquet") && - status.getModificationTime() > targetModifTime) { - results.add(status.getPath()); - }*/ - } catch (Exception e) { - log.error("Could not read metadata for: {}", filePath, e); - throw new IOException("Could not read metadata for", e); - } - } - - return results; - } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 30d044578..c084eb6df 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,12 +18,17 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.List; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -39,7 +44,7 @@ public class ITParquetDataManager { public void testAppendParquetFile() throws IOException { SparkSession spark = SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); - + Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( new StructField[] { @@ -62,11 +67,56 @@ public void testAppendParquetFile() throws IOException { df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - // TODO create an InternalDataFile and Partition Fields for testing purposes + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = + new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test"); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the file + updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); + long newModifTime = System.currentTimeMillis() - 5000; + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); + + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + List resultingFiles = + ParquetDataManager.formNewTargetFiles( + conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime); + // check if resultingFiles contains the append data only (through the partition names) + for (org.apache.hadoop.fs.Path p : resultingFiles) { + String pathString = p.toString(); + // should be TRUE + boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); - // TODO test appendNewParquetFile() + // should be FALSE + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } + // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) - // TODO validate the final table spark.stop(); } + + private void updateModificationTimeRecursive( + FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { + org.apache.hadoop.fs.RemoteIterator it = + fs.listFiles(path, true); + while (it.hasNext()) { + org.apache.hadoop.fs.LocatedFileStatus status = it.next(); + if (status.getPath().getName().endsWith(".parquet")) { + fs.setTimes(status.getPath(), time, -1); + } + } + } } From a8730b7c33fd4aabe9066a855b630bfb0e08bdda Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 21:18:28 +0100 Subject: [PATCH 14/54] fix append based on partition value --- .../xtable/parquet/ParquetDataManager.java | 30 +++++++++++++++---- .../xtable/parquet/ParquetFileConfig.java | 4 +-- .../xtable/parquet/ITParquetDataManager.java | 2 ++ 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 75f3c4f80..6585fda41 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,25 +65,45 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } + public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { + Configuration conf = new Configuration(); + + // e.g., "year=2024/month=12" + String targetPartition = getPartitionPath(targetPath); + String sourcePartition = getPartitionPath(sourcePath); + + if (!targetPartition.equals(sourcePartition)) { + throw new IllegalArgumentException("Partition Mismatch! Cannot merge " + + sourcePartition + " into " + targetPartition); + } + + // append files within the same partition foldr + appendNewParquetFiles(targetPath, sourcePath, schema); + } + + private String getPartitionPath(Path path) { + // Returns the parent directory name (e.g., country=US) + // For nested partitions, you might need path.getParent().getParent()... + return path.getParent().getName(); + } // append a file into a table (merges two files into one .parquet under a partition folder) public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); - long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize(); + long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); ParquetFileWriter writer = new ParquetFileWriter( HadoopOutputFile.fromPath(filePath, conf), - // HadoopOutputFile.fromPath(outputPath, conf), schema, ParquetFileWriter.Mode.CREATE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath writer.start(); - // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); + HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - // writer.appendFile(inputFile); + writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); @@ -105,7 +125,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); combinedMeta.put( "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex)); + String.valueOf(firstBlockIndex+1)); writer.end(combinedMeta); } // selective compaction of parquet blocks diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 64c9e9cfc..33ee08d38 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -32,7 +32,7 @@ @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) class ParquetFileConfig { MessageType schema; - long rowGroupSize; + long rowGroupIndex; CompressionCodecName codec; public ParquetFileConfig(Configuration conf, Path file) { @@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) { } this.schema = metadata.getFileMetaData().getSchema(); - this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize() + this.rowGroupIndex = metadata.getBlocks().size(); this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index c084eb6df..3aff654a1 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -79,6 +79,8 @@ public void testAppendParquetFile() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); + // TODO create the folders manually for a new partition value as appendFile works only within the same partition value + long newModifTime = System.currentTimeMillis() - 5000; List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); From d19ccbf2ebe776f69dbf65e560c8bcf4433f8599 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 22:46:38 +0100 Subject: [PATCH 15/54] fix test with basic example where partitions are not considered --- .../xtable/parquet/ParquetDataManager.java | 9 ++++--- .../xtable/parquet/ITParquetDataManager.java | 26 ++++++++----------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 6585fda41..01fb6b75f 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,7 +65,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp return parquetFileConfig; } - public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { + public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) + throws IOException { Configuration conf = new Configuration(); // e.g., "year=2024/month=12" @@ -73,8 +74,8 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy String sourcePartition = getPartitionPath(sourcePath); if (!targetPartition.equals(sourcePartition)) { - throw new IllegalArgumentException("Partition Mismatch! Cannot merge " + - sourcePartition + " into " + targetPartition); + throw new IllegalArgumentException( + "Partition Mismatch! Cannot merge " + sourcePartition + " into " + targetPartition); } // append files within the same partition foldr @@ -125,7 +126,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); combinedMeta.put( "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex+1)); + String.valueOf(firstBlockIndex + 1)); writer.end(combinedMeta); } // selective compaction of parquet blocks diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3aff654a1..61a9a9cf5 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,7 +18,6 @@ package org.apache.xtable.parquet; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; @@ -55,11 +54,9 @@ public void testAppendParquetFile() throws IOException { }); List data = - Arrays.asList( - RowFactory.create(101, "A", 2025, 12), - RowFactory.create(102, "B", 2025, 12), - RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7)); + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); + /* RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7));*/ Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); @@ -69,20 +66,20 @@ public void testAppendParquetFile() throws IOException { // test find files to sync long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = - new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test"); + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); // set the modification time to the file updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); // create new file to append using Spark List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7)); + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - // TODO create the folders manually for a new partition value as appendFile works only within the same partition value + // TODO create the folders manually for a new partition value as appendFile works only within + // the same partition value long newModifTime = System.currentTimeMillis() - 5000; - List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7"); + List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); @@ -91,19 +88,18 @@ public void testAppendParquetFile() throws IOException { } } List resultingFiles = - ParquetDataManager.formNewTargetFiles( - conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime); + ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); // should be TRUE - boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); + boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); // should be FALSE boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) From aecb204ede4ad15851e1c403866e5cb88366be2a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 23:06:36 +0100 Subject: [PATCH 16/54] fix test with basic example where partitions are not considered2 --- .../xtable/parquet/ITParquetDataManager.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 61a9a9cf5..ad5a41f2a 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -60,7 +60,8 @@ public void testAppendParquetFile() throws IOException { Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); - String outputPath = fixedPath.toFile().getName(); + String outputPath = fixedPath.toString(); + ; df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); @@ -79,25 +80,30 @@ public void testAppendParquetFile() throws IOException { // the same partition value long newModifTime = System.currentTimeMillis() - 5000; - List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - - for (String partition : newPartitions) { + // TODO many partitions case + // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); + /* for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); if (fs.exists(partitionPath)) { updateModificationTimeRecursive(fs, partitionPath, newModifTime); } - } + }*/ + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); + updateModificationTimeRecursive(fs, partitionPath, newModifTime); List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE + // should be TRUE (test for many partitions) boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - // should be FALSE + // should be FALSE (test for many partitions) boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); - + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } From 0ec8cbb7ed740cbe72687d02878742167bca9ac8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Fri, 2 Jan 2026 23:22:34 +0100 Subject: [PATCH 17/54] fix test with basic example where partitions are not considered3 --- .../xtable/parquet/ParquetDataManager.java | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 01fb6b75f..e342cdacb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -122,11 +122,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType // save block indexes and modification time (for later sync related retrieval) in the metadata // of the output // table - combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime())); - combinedMeta.put("total_appends", String.valueOf(appendCount + 1)); + int currentAppendIdx = appendCount + 1; + long startBlock = firstBlockIndex; + long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); + long endBlock = startBlock + blocksAdded; + + combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); + combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); + combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock)); combinedMeta.put( - "index_start_block_of_append_" + String.valueOf(appendCount + 1), - String.valueOf(firstBlockIndex + 1)); + "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); } // selective compaction of parquet blocks @@ -156,13 +161,13 @@ public static List formNewTargetFiles( bigFileFooter .getFileMetaData() .getKeyValueMetaData() - .get("index_start_block_of_append_" + totalAppends)); + .get("index_start_block_of_append_" + i)); int endBlock = Integer.valueOf( bigFileFooter .getFileMetaData() .getKeyValueMetaData() - .get("index_end_block_of_append_" + (totalAppends + 1))); + .get("index_end_block_of_append_" + i)); Path targetSyncFilePath = new Path( status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); From 9cb75dfd13c1180b38095e6d3ab5b0f80d97f2c8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 11:29:34 +0100 Subject: [PATCH 18/54] test with time of last append is now --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index ad5a41f2a..ac2a94887 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -79,7 +79,7 @@ public void testAppendParquetFile() throws IOException { // TODO create the folders manually for a new partition value as appendFile works only within // the same partition value - long newModifTime = System.currentTimeMillis() - 5000; + long newModifTime = System.currentTimeMillis() - 50000; // TODO many partitions case // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); /* for (String partition : newPartitions) { @@ -90,7 +90,7 @@ public void testAppendParquetFile() throws IOException { }*/ org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - updateModificationTimeRecursive(fs, partitionPath, newModifTime); + // updateModificationTimeRecursive(fs, partitionPath, newModifTime); List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) From 9e125f242ccd56fa1770ae27dbd6c0e0db4bc3e1 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 12:07:27 +0100 Subject: [PATCH 19/54] test appendFile with Parquet: TODO test with multiple partitions 1) append and 2) filter for sync --- .../xtable/parquet/ParquetDataManager.java | 14 +-- .../xtable/parquet/ITParquetDataManager.java | 101 +++++++++++++++++- 2 files changed, 106 insertions(+), 9 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index e342cdacb..5e8bb25bb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -54,13 +54,14 @@ public class ParquetDataManager { /* Use Parquet API to append to a file */ // after appending check required before appending the file - private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) { + private static boolean checkIfSchemaIsSame( + Configuration conf, Path fileToAppend, Path fileFromTable) { ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); } - private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { + private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } @@ -88,7 +89,7 @@ private String getPartitionPath(Path path) { return path.getParent().getName(); } // append a file into a table (merges two files into one .parquet under a partition folder) - public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) + public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); @@ -133,15 +134,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType combinedMeta.put( "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); + return filePath; } // selective compaction of parquet blocks public static List formNewTargetFiles( - Configuration conf, Path directoryPath, long targetModifTime) throws IOException { + Configuration conf, Path partitionPath, long targetModifTime) throws IOException { List finalPaths = new ArrayList<>(); - FileSystem fs = directoryPath.getFileSystem(conf); + FileSystem fs = partitionPath.getFileSystem(conf); FileStatus[] statuses = - fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet")); + fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet")); for (FileStatus status : statuses) { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index ac2a94887..b26460a32 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -28,6 +28,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.parquet.schema.LogicalTypeAnnotation; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -40,9 +44,9 @@ public class ITParquetDataManager { @Test - public void testAppendParquetFile() throws IOException { + public void testFormParquetFileSinglePartition() throws IOException { SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate(); Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -107,7 +111,98 @@ public void testAppendParquetFile() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file) + spark.stop(); + } + + @Test + public void testAppendParquetFileSinglePartition() throws IOException { + SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + Configuration conf = spark.sparkContext().hadoopConfiguration(); + MessageType schemaParquet = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("id") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("value") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("year") + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("month") + .named("parquet_schema"); + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); + /* RowFactory.create(201, "C", 2025, 11), + RowFactory.create(301, "D", 2024, 7));*/ + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + // update modifTime for file to append + updateModificationTimeRecursive( + fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); + org.apache.hadoop.fs.Path outputFile = + ParquetDataManager.appendNewParquetFiles( + new org.apache.hadoop.fs.Path(outputPath), + new org.apache.hadoop.fs.Path(finalAppendFilePath), + schemaParquet); + + // TODO many partitions case + // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); + /* for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + }*/ + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); + // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + + // test whether the appended data can be selectively filtered for incr sync (e.g. get me the + // data to sync only) + List resultingFiles = + ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime); + // check if resultingFiles contains the append data only (through the partition names) + for (org.apache.hadoop.fs.Path p : resultingFiles) { + String pathString = p.toString(); + // should be TRUE (test for many partitions) + boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); + + // should be FALSE (test for many partitions) + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } spark.stop(); } From 233ca770baccb5a6fd00d800bafb1029a7cb5720 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 13:10:02 +0100 Subject: [PATCH 20/54] merge recursively one partition files --- .../xtable/parquet/ParquetDataManager.java | 47 ++++++++++++++++++- .../xtable/parquet/ITParquetDataManager.java | 5 +- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 5e8bb25bb..6384f5c01 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -88,7 +88,7 @@ private String getPartitionPath(Path path) { // For nested partitions, you might need path.getParent().getParent()... return path.getParent().getName(); } - // append a file into a table (merges two files into one .parquet under a partition folder) + // append a file (merges two files into one .parquet under a partition folder) public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { Configuration conf = new Configuration(); @@ -193,4 +193,49 @@ public static List formNewTargetFiles( } return finalPaths; } + + public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema) + throws IOException { + Configuration conf = new Configuration(); + FileSystem fs = sourcePath.getFileSystem(conf); + + List targetFiles = collectParquetFiles(fs, targetPath); + + List sourceFiles = collectParquetFiles(fs, sourcePath); + + if (targetFiles.isEmpty()) { + throw new IOException("Target directory contains no parquet files to append to."); + } + if (sourceFiles.isEmpty()) { + return targetPath; + } + Path masterTargetFile = targetFiles.get(0); + + for (int i = 1; i < targetFiles.size(); i++) { + appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); + } + for (Path sourceFile : sourceFiles) { + appendNewParquetFiles(masterTargetFile, sourceFile, schema); + } + + return masterTargetFile; + } + + private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { + List parquetFiles = new ArrayList<>(); + if (!fs.exists(root)) return parquetFiles; + + if (fs.getFileStatus(root).isDirectory()) { + RemoteIterator it = fs.listFiles(root, true); + while (it.hasNext()) { + Path p = it.next().getPath(); + if (p.getName().endsWith(".parquet")) { + parquetFiles.add(p); + } + } + } else { + parquetFiles.add(root); + } + return parquetFiles; + } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index b26460a32..3d7619cf3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -167,11 +167,14 @@ public void testAppendParquetFileSinglePartition() throws IOException { // update modifTime for file to append updateModificationTimeRecursive( fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); + // recursively append all files under a specific partition path (e.g. "year=2026/month=12") + // (assuming that outputPath and finalAppendFilePath are same partitions for different files) org.apache.hadoop.fs.Path outputFile = - ParquetDataManager.appendNewParquetFiles( + ParquetDataManager.appendPartitionedData( new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); + // can create big file which can be split if reaches a certain threshold size // TODO many partitions case // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); From b4cba5a1c358689a8953e7b8d7916d2d2d51abc7 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 13:39:56 +0100 Subject: [PATCH 21/54] fix paths for files to append --- .../main/java/org/apache/xtable/parquet/ParquetDataManager.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 6384f5c01..da88a50e2 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -97,7 +97,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa new ParquetFileWriter( HadoopOutputFile.fromPath(filePath, conf), schema, - ParquetFileWriter.Mode.CREATE, + ParquetFileWriter.Mode.OVERWRITE, DEFAULT_BLOCK_SIZE, 0); // write the initial table with the appended file to add into the outputPath From a564b29a7a11659b9ac647996cebfe1f7a34ca51 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 14:08:27 +0100 Subject: [PATCH 22/54] fix bug of appending file path --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index da88a50e2..fbf546d37 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -93,9 +93,10 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); + Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp"); ParquetFileWriter writer = new ParquetFileWriter( - HadoopOutputFile.fromPath(filePath, conf), + HadoopOutputFile.fromPath(tempPath, conf), schema, ParquetFileWriter.Mode.OVERWRITE, DEFAULT_BLOCK_SIZE, @@ -134,6 +135,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa combinedMeta.put( "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); + fs.delete(filePath, false); + fs.rename(tempPath, filePath); return filePath; } // selective compaction of parquet blocks From d1ceafba5cef0f8a916a994d44e74bbecfb45f64 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 14:38:15 +0100 Subject: [PATCH 23/54] fix bug of schema --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3d7619cf3..6c490bcbb 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -119,6 +119,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { SparkSession spark = SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); Configuration conf = spark.sparkContext().hadoopConfiguration(); + // In testAppendParquetFileSinglePartition MessageType schemaParquet = Types.buildMessage() .required(PrimitiveType.PrimitiveTypeName.INT32) @@ -126,10 +127,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { .required(PrimitiveType.PrimitiveTypeName.BINARY) .as(LogicalTypeAnnotation.stringType()) .named("value") - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("year") - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("month") .named("parquet_schema"); StructType schema = DataTypes.createStructType( From 649250fbfb939367ce4108e7e6bef267911859b8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 15:10:27 +0100 Subject: [PATCH 24/54] make sure returned files are non empty when iterating in the partitions --- .../org/apache/xtable/parquet/ParquetDataManager.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index fbf546d37..2650e4791 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -226,14 +226,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { List parquetFiles = new ArrayList<>(); + Configuration conf = fs.getConf(); if (!fs.exists(root)) return parquetFiles; if (fs.getFileStatus(root).isDirectory()) { RemoteIterator it = fs.listFiles(root, true); while (it.hasNext()) { Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet")) { - parquetFiles.add(p); + if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) { + ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); + if (!footer.getBlocks().isEmpty()) { + parquetFiles.add(p); + } } } } else { From 24ff828b03af7d03cdecb3556fbe71254802d682 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 15:32:58 +0100 Subject: [PATCH 25/54] make sure returned files are non empty when iterating in the partitions 2 --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 2650e4791..991879d05 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -233,9 +233,11 @@ private static List collectParquetFiles(FileSystem fs, Path root) throws I RemoteIterator it = fs.listFiles(root, true); while (it.hasNext()) { Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) { + if (p.getName().endsWith(".parquet") + && !p.toString().contains("/.") + && !p.toString().contains("/_")) { ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); - if (!footer.getBlocks().isEmpty()) { + if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) { parquetFiles.add(p); } } From 463d2eea37664ce1aab0414c8ac366f665abb1e6 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 16:20:21 +0100 Subject: [PATCH 26/54] discard empty files when appending --- .../org/apache/xtable/parquet/ParquetDataManager.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 991879d05..942bf95e8 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -215,9 +215,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa Path masterTargetFile = targetFiles.get(0); for (int i = 1; i < targetFiles.size(); i++) { + long len = fs.getFileStatus(targetFiles.get(i)).getLen(); + System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len); appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); } for (Path sourceFile : sourceFiles) { + ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile); + if (sourceFooter.getBlocks().isEmpty()) { + System.out.println("SKIPPING: " + sourceFile + " has no data."); + continue; + } + long len = fs.getFileStatus(sourceFile).getLen(); + System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); appendNewParquetFiles(masterTargetFile, sourceFile, schema); } From 71d1c34f97f12db45840277a7f7f4cfa095e8222 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 16:45:59 +0100 Subject: [PATCH 27/54] discard empty files when appending --- .../apache/xtable/parquet/ParquetDataManager.java | 13 ++++++++++++- .../apache/xtable/parquet/ITParquetDataManager.java | 7 ++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 942bf95e8..76efa93fb 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -212,7 +212,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa if (sourceFiles.isEmpty()) { return targetPath; } - Path masterTargetFile = targetFiles.get(0); + Path masterTargetFile = null; + for (Path p : targetFiles) { + ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p); + if (!metadata.getBlocks().isEmpty()) { + masterTargetFile = p; + break; + } + } + + if (masterTargetFile == null) { + throw new IOException("Target directory contains no files with valid data blocks."); + } for (int i = 1; i < targetFiles.size(); i++) { long len = fs.getFileStatus(targetFiles.get(i)).getLen(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 6c490bcbb..bd419c885 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -159,7 +159,12 @@ public void testAppendParquetFileSinglePartition() throws IOException { List futureDataToSync = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); long newModifTime = System.currentTimeMillis() - 50000; // update modifTime for file to append updateModificationTimeRecursive( From 3319a91429eec91faeaabb3dce4d0eac95b5b3a5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 17:08:36 +0100 Subject: [PATCH 28/54] one spark session for both tests --- .../xtable/parquet/ITParquetDataManager.java | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index bd419c885..d12e3016d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -39,14 +39,22 @@ import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.types.StructType; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; public class ITParquetDataManager { + private static SparkSession spark; + + @BeforeAll + public static void setup() { + spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); + } @Test public void testFormParquetFileSinglePartition() throws IOException { - SparkSession spark = - SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate(); + /*SparkSession spark = + SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -111,13 +119,13 @@ public void testFormParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - spark.stop(); + // spark.stop(); } @Test public void testAppendParquetFileSinglePartition() throws IOException { - SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate(); + /*SparkSession spark = + SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition MessageType schemaParquet = @@ -209,7 +217,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - spark.stop(); + // spark.stop(); } private void updateModificationTimeRecursive( @@ -223,4 +231,11 @@ private void updateModificationTimeRecursive( } } } + + @AfterAll + public static void tearDown() { + if (spark != null) { + spark.stop(); + } + } } From 013ffe4c000b68992460679180eb448ef63c1974 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 17:41:31 +0100 Subject: [PATCH 29/54] bug fix --- .../xtable/parquet/ParquetDataManager.java | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 76efa93fb..471925c1d 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -93,6 +93,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa throws IOException { Configuration conf = new Configuration(); long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); + ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath); + Map existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData(); Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp"); ParquetFileWriter writer = new ParquetFileWriter( @@ -110,14 +112,11 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa writer.appendFile(inputFileToAppend); } Map combinedMeta = new HashMap<>(); + if (existingMeta != null) { + combinedMeta.putAll(existingMeta); + } // track the append date and save it in the footer - int appendCount = - Integer.parseInt( - ParquetFileReader.readFooter(conf, filePath) - .getFileMetaData() - .getKeyValueMetaData() - .getOrDefault("total_appends", "0")); - String newKey = "append_date_" + (appendCount + 1); + int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); @@ -127,7 +126,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa int currentAppendIdx = appendCount + 1; long startBlock = firstBlockIndex; long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); - long endBlock = startBlock + blocksAdded; + long endBlock = startBlock + blocksAdded - 1; combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); @@ -160,7 +159,7 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .getOrDefault("total_appends", "0")); - for (int i = 1; i < totalAppends; i++) { + for (int i = 1; i <= totalAppends; i++) { int startBlock = Integer.valueOf( bigFileFooter From b939a0ac1229ca6d8fd5cec695d98a0f31c5d395 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 18:03:43 +0100 Subject: [PATCH 30/54] bug fix --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 471925c1d..bf0035630 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -172,9 +172,10 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .get("index_end_block_of_append_" + i)); - Path targetSyncFilePath = - new Path( - status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock)); + String newFileName = + String.format( + "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); + Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { From b6d8ddccb49eb83e687f327fee9e3e80f356421c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 3 Jan 2026 18:36:47 +0100 Subject: [PATCH 31/54] cleanups + TODO: partitions match and merge --- .../apache/xtable/parquet/ParquetDataManager.java | 6 ++---- .../apache/xtable/parquet/ITParquetDataManager.java | 12 ------------ 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index bf0035630..7263612e6 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,10 +65,10 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - + // TODO use this method to match the partition folders and merge them one by one (if one exists + // but not in other file then create it) public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException { - Configuration conf = new Configuration(); // e.g., "year=2024/month=12" String targetPartition = getPartitionPath(targetPath); @@ -84,8 +84,6 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy } private String getPartitionPath(Path path) { - // Returns the parent directory name (e.g., country=US) - // For nested partitions, you might need path.getParent().getParent()... return path.getParent().getName(); } // append a file (merges two files into one .parquet under a partition folder) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index d12e3016d..420618b92 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -53,8 +53,6 @@ public static void setup() { @Test public void testFormParquetFileSinglePartition() throws IOException { - /*SparkSession spark = - SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = DataTypes.createStructType( @@ -67,9 +65,6 @@ public void testFormParquetFileSinglePartition() throws IOException { List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - /* RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7));*/ - Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); String outputPath = fixedPath.toString(); @@ -119,13 +114,10 @@ public void testFormParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - // spark.stop(); } @Test public void testAppendParquetFileSinglePartition() throws IOException { - /*SparkSession spark = - SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/ Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition MessageType schemaParquet = @@ -146,8 +138,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { }); List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - /* RowFactory.create(201, "C", 2025, 11), - RowFactory.create(301, "D", 2024, 7));*/ Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); @@ -216,8 +206,6 @@ public void testAppendParquetFileSinglePartition() throws IOException { assertTrue(isNewData, "Path should belong to appended data: " + pathString); // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } - - // spark.stop(); } private void updateModificationTimeRecursive( From c18ab1c23a16d12accb00e40842261e4931778a8 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 15:57:08 +0100 Subject: [PATCH 32/54] added test for many partitions, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 46 ++++++ .../xtable/parquet/ITParquetDataManager.java | 149 +++++++++++++----- 2 files changed, 156 insertions(+), 39 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 7263612e6..cfd3d66c3 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -265,4 +265,50 @@ private static List collectParquetFiles(FileSystem fs, Path root) throws I } return parquetFiles; } + + public static List mergeDatasetsByPartition( + Path targetRoot, Path sourceRoot, MessageType schema) throws IOException { + List finalPartitionPaths = new ArrayList<>(); + Configuration conf = new Configuration(); + FileSystem fs = targetRoot.getFileSystem(conf); + + Map sourcePartitions = mapPartitions(fs, sourceRoot); + + Map targetPartitions = mapPartitions(fs, targetRoot); + + for (Map.Entry entry : sourcePartitions.entrySet()) { + String partitionKey = entry.getKey(); + Path sourcePartitionPath = entry.getValue(); + + if (targetPartitions.containsKey(partitionKey)) { + Path targetPartitionPath = targetPartitions.get(partitionKey); + + finalPartitionPaths.add( + appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema)); + } else { + // TODO logic for when a partition exists in source but not in target + // simply merge the files in the respective partitions using appendPartitionedData() that + // takes only one Path/dir + + } + } + return finalPartitionPaths; + } + + private static Map mapPartitions(FileSystem fs, Path root) throws IOException { + Map partitionMap = new HashMap<>(); + RemoteIterator it = fs.listFiles(root, true); + + while (it.hasNext()) { + Path filePath = it.next().getPath(); + if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) { + Path parent = filePath.getParent(); + String relative = parent.toString().replace(root.toString(), ""); + if (relative.startsWith("/")) relative = relative.substring(1); + + partitionMap.put(relative, parent); + } + } + return partitionMap; + } } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 420618b92..fe2bcc069 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,11 +18,13 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -41,6 +43,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; public class ITParquetDataManager { @@ -52,6 +55,7 @@ public static void setup() { } @Test + @Disabled("This test already passed CI") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -68,8 +72,6 @@ public void testFormParquetFileSinglePartition() throws IOException { Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); String outputPath = fixedPath.toString(); - ; - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync @@ -83,40 +85,26 @@ public void testFormParquetFileSinglePartition() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - // TODO create the folders manually for a new partition value as appendFile works only within - // the same partition value long newModifTime = System.currentTimeMillis() - 50000; - // TODO many partitions case - // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - /* for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - }*/ - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + List resultingFiles = ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE (test for many partitions) - boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - // should be FALSE (test for many partitions) - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + boolean isNewData = pathString.contains("year=2026"); + long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } } @Test + @Disabled("This test already passed CI") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition @@ -174,19 +162,7 @@ public void testAppendParquetFileSinglePartition() throws IOException { new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); - // can create big file which can be split if reaches a certain threshold size - - // TODO many partitions case - // List newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7"); - /* for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - }*/ - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12"); - // updateModificationTimeRecursive(fs, partitionPath, newModifTime); + // TODO can create big file which can be split if reaches a certain threshold size // test whether the appended data can be selectively filtered for incr sync (e.g. get me the // data to sync only) @@ -195,16 +171,111 @@ public void testAppendParquetFileSinglePartition() throws IOException { // check if resultingFiles contains the append data only (through the partition names) for (org.apache.hadoop.fs.Path p : resultingFiles) { String pathString = p.toString(); - // should be TRUE (test for many partitions) - boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027"); - - // should be FALSE (test for many partitions) - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + boolean isNewData = pathString.contains("year=2026"); long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); - // assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } + } + + @Test + public void testAppendParquetFileMultiplePartition() throws IOException { + Configuration conf = spark.sparkContext().hadoopConfiguration(); + + MessageType schemaParquet = + Types.buildMessage() + .required(PrimitiveType.PrimitiveTypeName.INT32) + .named("id") + .required(PrimitiveType.PrimitiveTypeName.BINARY) + .as(LogicalTypeAnnotation.stringType()) + .named("value") + .named("parquet_schema"); + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + // update modifTime for file to append + // many partitions case + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, targetModifTime); + } + } + // create new file to append using Spark + List futureDataToSync = + Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + // recursively append all files under a specific partition path (e.g. "year=2026/month=12") + // (assuming that outputPath and finalAppendFilePath are same partitions for different files) + // the result contains all partition paths with the parquet files compacted + List outputPaths = + ParquetDataManager.mergeDatasetsByPartition( + new org.apache.hadoop.fs.Path(outputPath), + new org.apache.hadoop.fs.Path(finalAppendFilePath), + schemaParquet); + // TODO can create big file which can be split if reaches a certain threshold size + // implement bin-packing function for large files + + // test whether the appended data can be selectively filtered for incr sync (e.g. get me the + // data to sync only) + List> finalPaths = new ArrayList<>(); + for (org.apache.hadoop.fs.Path resPath : outputPaths) { + List resultingFiles = + ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime); + finalPaths.add(resultingFiles); + } + // check if resultingFiles contains the append data only + for (List fPaths : finalPaths) { + for (org.apache.hadoop.fs.Path p : fPaths) { + String pathString = p.toString(); + // should be TRUE + boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); + // should be FALSE + boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); + long modTime = fs.getFileStatus(p).getModificationTime(); + // test for one partition value + assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(isNewData, "Path should belong to appended data: " + pathString); + assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); + } } } From b7c613e0def55e32b121e65ceef45da241b9d5e3 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 17:39:04 +0100 Subject: [PATCH 33/54] selecting data bug fix, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index cfd3d66c3..9fd6fd925 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -170,6 +170,9 @@ public static List formNewTargetFiles( .getFileMetaData() .getKeyValueMetaData() .get("index_end_block_of_append_" + i)); + long modifTime = + Long.parseLong( + bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i)); String newFileName = String.format( "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); @@ -178,9 +181,7 @@ public static List formNewTargetFiles( writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - if (j >= startBlock - && j <= endBlock - && status.getModificationTime() >= targetModifTime) { + if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) { List blockList = Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); From 2a75f4925daedc63f6186fc34033478e16ba5621 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 17:59:33 +0100 Subject: [PATCH 34/54] run all tests, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index fe2bcc069..de49a1d86 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -55,7 +55,6 @@ public static void setup() { } @Test - @Disabled("This test already passed CI") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -104,7 +103,6 @@ public void testFormParquetFileSinglePartition() throws IOException { } @Test - @Disabled("This test already passed CI") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition From f1538b07e0dc79ec9e36d2cc437f4209c01a0f3d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 18:49:33 +0100 Subject: [PATCH 35/54] run all tests, TODO integrate functions into ParquetConversionSource --- .../apache/xtable/parquet/ITParquetDataManager.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index de49a1d86..0049af0bb 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -69,7 +69,7 @@ public void testFormParquetFileSinglePartition() throws IOException { List data = Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0"); String outputPath = fixedPath.toString(); df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); @@ -126,8 +126,8 @@ public void testAppendParquetFileSinglePartition() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); @@ -201,8 +201,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test"); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); From cdedeaecf8a467229edf758166f62e74ddb25bab Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 19:08:47 +0100 Subject: [PATCH 36/54] spotless:apply, TODO integrate functions into ParquetConversionSource --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 1 - 1 file changed, 1 deletion(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 0049af0bb..e47ddcf11 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -43,7 +43,6 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; public class ITParquetDataManager { From e87312031363e60f20029c7e6827414ea6420fb5 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 4 Jan 2026 23:50:19 +0100 Subject: [PATCH 37/54] bug fixes, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 48 +++++++++---------- .../xtable/parquet/ITParquetDataManager.java | 27 +++++++++-- 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 9fd6fd925..bac6a5e74 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -65,27 +65,7 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); return parquetFileConfig; } - // TODO use this method to match the partition folders and merge them one by one (if one exists - // but not in other file then create it) - public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) - throws IOException { - - // e.g., "year=2024/month=12" - String targetPartition = getPartitionPath(targetPath); - String sourcePartition = getPartitionPath(sourcePath); - - if (!targetPartition.equals(sourcePartition)) { - throw new IllegalArgumentException( - "Partition Mismatch! Cannot merge " + sourcePartition + " into " + targetPartition); - } - - // append files within the same partition foldr - appendNewParquetFiles(targetPath, sourcePath, schema); - } - - private String getPartitionPath(Path path) { - return path.getParent().getName(); - } + // TODO add safe guards for possible empty parquet files // append a file (merges two files into one .parquet under a partition folder) public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) throws IOException { @@ -105,6 +85,12 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa writer.start(); HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); + // get the equivalent fileStatus from the file-to-append Path + FileSystem fs = FileSystem.get(conf); + FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save modif Time (not needed for real case scenario) + // long prevModifTime = fileStatus.getModificationTime(); + if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); @@ -115,9 +101,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa } // track the append date and save it in the footer int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); - // get the equivalent fileStatus from the file-to-append Path - FileSystem fs = FileSystem.get(conf); - FileStatus fileStatus = fs.getFileStatus(fileToAppend); + // save block indexes and modification time (for later sync related retrieval) in the metadata // of the output // table @@ -133,9 +117,13 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); fs.delete(filePath, false); + // restore modifTime not needed actually (only for test purposes) + // the append happens here so the time must NOT updated manually + // fs.setTimes(tempPath, prevModifTime, -1); fs.rename(tempPath, filePath); return filePath; } + // TODO add safe guards for possible empty parquet files // selective compaction of parquet blocks public static List formNewTargetFiles( Configuration conf, Path partitionPath, long targetModifTime) throws IOException { @@ -178,6 +166,7 @@ public static List formNewTargetFiles( "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { + boolean wasDataAppended = false; writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); @@ -186,10 +175,17 @@ public static List formNewTargetFiles( Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); writer.appendRowGroups(targetInputStream, blockList, false); + wasDataAppended = true; } } + // manually restore the modifTime after using appendRowGroups() writer.end(new HashMap<>()); - finalPaths.add(targetSyncFilePath); + if (wasDataAppended) { + fs.setTimes(targetSyncFilePath, modifTime, -1); + finalPaths.add(targetSyncFilePath); + } else { + fs.delete(targetSyncFilePath, false); + } } } } @@ -236,7 +232,9 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa continue; } long len = fs.getFileStatus(sourceFile).getLen(); + long len_2 = fs.getFileStatus(masterTargetFile).getLen(); System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); + System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2); appendNewParquetFiles(masterTargetFile, sourceFile, schema); } diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index e47ddcf11..83d2c0a91 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -43,6 +43,7 @@ import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; public class ITParquetDataManager { @@ -54,6 +55,7 @@ public static void setup() { } @Test + @Disabled("") public void testFormParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); StructType schema = @@ -102,6 +104,7 @@ public void testFormParquetFileSinglePartition() throws IOException { } @Test + @Disabled("") public void testAppendParquetFileSinglePartition() throws IOException { Configuration conf = spark.sparkContext().hadoopConfiguration(); // In testAppendParquetFileSinglePartition @@ -197,7 +200,11 @@ public void testAppendParquetFileMultiplePartition() throws IOException { DataTypes.createStructField("month", DataTypes.IntegerType, false) }); List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11)); + Arrays.asList( + RowFactory.create(100, "A", 2026, 12), + RowFactory.create(101, "AA", 2026, 12), + RowFactory.create(102, "CB", 2027, 11), + RowFactory.create(103, "BA", 2027, 11)); Dataset df = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); @@ -205,7 +212,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync long targetModifTime = System.currentTimeMillis() - 360000; @@ -224,10 +231,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException { } // create new file to append using Spark List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11)); + Arrays.asList( + RowFactory.create(101, "A", 2026, 12), + RowFactory.create(301, "D", 2027, 11), + RowFactory.create(302, "DA", 2027, 11)); Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); dfToSync - .coalesce(1) + .coalesce(1) // since we are exec this test on low data volume .write() .partitionBy("year", "month") .mode("overwrite") @@ -248,6 +258,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException { new org.apache.hadoop.fs.Path(outputPath), new org.apache.hadoop.fs.Path(finalAppendFilePath), schemaParquet); + // reset time FOR TESTING PURPOSES ONLY + // (in real, newModifTime time is the appending time as set by the appendRow() in the + // DataManager) + // THIS IS NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED + for (org.apache.hadoop.fs.Path oPath : outputPaths) { + fs.setTimes(oPath, newModifTime, -1); + } // TODO can create big file which can be split if reaches a certain threshold size // implement bin-packing function for large files @@ -269,7 +286,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); long modTime = fs.getFileStatus(p).getModificationTime(); // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); + assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString); assertTrue(isNewData, "Path should belong to appended data: " + pathString); assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); } From 4d0f245c02512840d16c1567a40201efea7ca678 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 5 Jan 2026 09:51:17 +0100 Subject: [PATCH 38/54] bug fixes, TODO integrate functions into ParquetConversionSource --- .../xtable/parquet/ParquetDataManager.java | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index bac6a5e74..e0f8daee2 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -88,9 +88,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa // get the equivalent fileStatus from the file-to-append Path FileSystem fs = FileSystem.get(conf); FileStatus fileStatus = fs.getFileStatus(fileToAppend); - // save modif Time (not needed for real case scenario) - // long prevModifTime = fileStatus.getModificationTime(); - if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { writer.appendFile(inputFile); writer.appendFile(inputFileToAppend); @@ -106,9 +103,23 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa // of the output // table int currentAppendIdx = appendCount + 1; - long startBlock = firstBlockIndex; - long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size(); - long endBlock = startBlock + blocksAdded - 1; + List initialBlocks = existingFooter.getBlocks(); + long totalInitialCompressedSize = 0; + // first the blocks start at 0 + for (BlockMetaData block : initialBlocks) { + totalInitialCompressedSize += block.getCompressedSize(); + } + long startBlock = 0; + if (appendCount > 0) { + startBlock = totalInitialCompressedSize; + } + + List addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks(); + long totalAddedCompressedSize = 0; + for (BlockMetaData block : addedBlocks) { + totalAddedCompressedSize += block.getCompressedSize(); + } + long endBlock = totalInitialCompressedSize + totalAddedCompressedSize; combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); @@ -117,9 +128,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); writer.end(combinedMeta); fs.delete(filePath, false); - // restore modifTime not needed actually (only for test purposes) - // the append happens here so the time must NOT updated manually - // fs.setTimes(tempPath, prevModifTime, -1); fs.rename(tempPath, filePath); return filePath; } @@ -170,7 +178,11 @@ public static List formNewTargetFiles( writer.start(); for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) { + long currentBlockStart = blockMetadata.getStartingPos(); + long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize(); + if (currentBlockStart >= startBlock + && currentBlockEnd <= endBlock + && modifTime >= targetModifTime) { List blockList = Collections.singletonList(blockMetadata); FSDataInputStream targetInputStream = fs.open(status.getPath()); From 219656e02194c807c93868e7ab42c96d4e6fc136 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 22:29:29 +0100 Subject: [PATCH 39/54] handle lightweight metadata in the conversionSource --- .../parquet/ParquetConversionSource.java | 70 ++-- .../xtable/parquet/ParquetDataManager.java | 298 ++---------------- .../xtable/parquet/ParquetFileConfig.java | 20 +- .../xtable/parquet/ITParquetDataManager.java | 275 ---------------- 4 files changed, 90 insertions(+), 573 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index e7eff9616..26d758026 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -56,6 +56,10 @@ public class ParquetConversionSource implements ConversionSource { private static final ParquetSchemaExtractor schemaExtractor = ParquetSchemaExtractor.getInstance(); + private static final ParquetMetadataExtractor metadataExtractor = + ParquetMetadataExtractor.getInstance(); + private static final ParquetDataManager parquetDataManagerExtractor = + ParquetDataManager.getInstance(); private static final ParquetMetadataExtractor parquetMetadataExtractor = ParquetMetadataExtractor.getInstance(); @@ -69,7 +73,7 @@ public class ParquetConversionSource implements ConversionSource { private final String basePath; @NonNull private final Configuration hadoopConf; - private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) { + private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) { ParquetMetadata parquetMetadata = parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata); @@ -87,15 +91,41 @@ private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) .layoutStrategy(dataLayoutStrategy) .partitioningFields(partitionFields) .readSchema(schema) - .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime())) + .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime())) .build(); } + public Stream getConfigsFromStream( + Stream fileStream, Configuration conf) { + + return fileStream.map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + .modifTime(fileStatus.getModificationTime()) + .rowGroupIndex(0L) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); + } + @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - LocatedFileStatus file = getParquetFileAt(parquetFiles, modificationTime); + Stream parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf); + ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime); return createInternalTableFromFile(file); } @@ -156,7 +186,10 @@ public TableChange getTableChangeForCommit(Long modificationTime) { parquetFiles .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime) .collect(Collectors.toList()); - InternalTable internalTable = getMostRecentTable(parquetFiles); + List tableChangesAfterMetadata = + parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( + hadoopConf, parquetFiles, modificationTime); + InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (FileStatus tableStatus : tableChangesAfter) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus); addedInternalDataFiles.add(currentDataFile); @@ -168,15 +201,15 @@ public TableChange getTableChangeForCommit(Long modificationTime) { .build(); } - private InternalTable getMostRecentTable(Stream parquetFiles) { - LocatedFileStatus latestFile = getMostRecentParquetFile(parquetFiles); + private InternalTable getMostRecentTable(Stream parquetFiles) { + ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - return getMostRecentTable(parquetFiles); + return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf)); } /** @@ -189,29 +222,30 @@ public InternalSnapshot getCurrentSnapshot() { // to avoid consume the stream call the method twice to return the same stream of parquet files Stream internalDataFiles = getInternalDataFiles(getParquetFiles(hadoopConf, basePath)); - InternalTable table = getMostRecentTable(getParquetFiles(hadoopConf, basePath)); + Stream parquetFilesMetadata = + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + InternalTable table = getMostRecentTable(parquetFilesMetadata); return InternalSnapshot.builder() .table(table) .sourceIdentifier( - getCommitIdentifier( - getMostRecentParquetFile(getParquetFiles(hadoopConf, basePath)) - .getModificationTime())) + getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } - private LocatedFileStatus getMostRecentParquetFile(Stream parquetFiles) { + private ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { return parquetFiles - .max(Comparator.comparing(FileStatus::getModificationTime)) + .max(Comparator.comparing(ParquetFileConfig::getModifTime)) .orElseThrow(() -> new IllegalStateException("No files found")); } - private LocatedFileStatus getParquetFileAt( - Stream parquetFiles, long modificationTime) { - return parquetFiles - .filter(fileStatus -> fileStatus.getModificationTime() == modificationTime) + private ParquetFileConfig getParquetFileAt( + Stream parquetConfigs, long targetTime) { + + return parquetConfigs + .filter(config -> config.getModifTime() >= targetTime) .findFirst() - .orElseThrow(() -> new IllegalStateException("No file found at " + modificationTime)); + .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } private Stream getParquetFiles(Configuration hadoopConf, String basePath) { diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index e0f8daee2..5d09352ea 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -18,23 +18,14 @@ package org.apache.xtable.parquet; -import java.io.IOException; import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; -import lombok.Builder; import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; -import org.apache.parquet.format.converter.ParquetMetadataConverter; -import org.apache.parquet.hadoop.ParquetFileReader; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.metadata.BlockMetaData; -import org.apache.parquet.hadoop.metadata.ParquetMetadata; -import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.hadoop.util.HadoopOutputFile; -import org.apache.parquet.io.InputFile; -import org.apache.parquet.schema.MessageType; /** * Manages Parquet file operations including reading, writing, and partition discovery and path @@ -45,281 +36,30 @@ * fields. */ @Log4j2 -@Builder public class ParquetDataManager { - private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance(); - private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB - private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB + public static final ParquetDataManager INSTANCE = new ParquetDataManager(); - /* Use Parquet API to append to a file */ - - // after appending check required before appending the file - private static boolean checkIfSchemaIsSame( - Configuration conf, Path fileToAppend, Path fileFromTable) { - ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend); - ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable); - return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema()); - } - - private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) { - ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend); - return parquetFileConfig; - } - // TODO add safe guards for possible empty parquet files - // append a file (merges two files into one .parquet under a partition folder) - public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema) - throws IOException { - Configuration conf = new Configuration(); - long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex(); - ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath); - Map existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData(); - Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp"); - ParquetFileWriter writer = - new ParquetFileWriter( - HadoopOutputFile.fromPath(tempPath, conf), - schema, - ParquetFileWriter.Mode.OVERWRITE, - DEFAULT_BLOCK_SIZE, - 0); - // write the initial table with the appended file to add into the outputPath - writer.start(); - HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf); - InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf); - // get the equivalent fileStatus from the file-to-append Path - FileSystem fs = FileSystem.get(conf); - FileStatus fileStatus = fs.getFileStatus(fileToAppend); - if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) { - writer.appendFile(inputFile); - writer.appendFile(inputFileToAppend); - } - Map combinedMeta = new HashMap<>(); - if (existingMeta != null) { - combinedMeta.putAll(existingMeta); - } - // track the append date and save it in the footer - int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0")); - - // save block indexes and modification time (for later sync related retrieval) in the metadata - // of the output - // table - int currentAppendIdx = appendCount + 1; - List initialBlocks = existingFooter.getBlocks(); - long totalInitialCompressedSize = 0; - // first the blocks start at 0 - for (BlockMetaData block : initialBlocks) { - totalInitialCompressedSize += block.getCompressedSize(); - } - long startBlock = 0; - if (appendCount > 0) { - startBlock = totalInitialCompressedSize; - } - - List addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks(); - long totalAddedCompressedSize = 0; - for (BlockMetaData block : addedBlocks) { - totalAddedCompressedSize += block.getCompressedSize(); - } - long endBlock = totalInitialCompressedSize + totalAddedCompressedSize; - - combinedMeta.put("total_appends", String.valueOf(currentAppendIdx)); - combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock)); - combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock)); - combinedMeta.put( - "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime())); - writer.end(combinedMeta); - fs.delete(filePath, false); - fs.rename(tempPath, filePath); - return filePath; - } - // TODO add safe guards for possible empty parquet files - // selective compaction of parquet blocks - public static List formNewTargetFiles( - Configuration conf, Path partitionPath, long targetModifTime) throws IOException { - List finalPaths = new ArrayList<>(); - FileSystem fs = partitionPath.getFileSystem(conf); - - FileStatus[] statuses = - fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet")); - - for (FileStatus status : statuses) { - - Path filePath = status.getPath(); - ParquetMetadata bigFileFooter = - ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER); - MessageType schema = bigFileFooter.getFileMetaData().getSchema(); - int totalAppends = - Integer.parseInt( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .getOrDefault("total_appends", "0")); - for (int i = 1; i <= totalAppends; i++) { - int startBlock = - Integer.valueOf( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .get("index_start_block_of_append_" + i)); - int endBlock = - Integer.valueOf( - bigFileFooter - .getFileMetaData() - .getKeyValueMetaData() - .get("index_end_block_of_append_" + i)); - long modifTime = - Long.parseLong( - bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i)); - String newFileName = - String.format( - "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock); - Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName); - try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) { - boolean wasDataAppended = false; - writer.start(); - for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) { - BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j); - long currentBlockStart = blockMetadata.getStartingPos(); - long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize(); - if (currentBlockStart >= startBlock - && currentBlockEnd <= endBlock - && modifTime >= targetModifTime) { - List blockList = - Collections.singletonList(blockMetadata); - FSDataInputStream targetInputStream = fs.open(status.getPath()); - writer.appendRowGroups(targetInputStream, blockList, false); - wasDataAppended = true; - } - } - // manually restore the modifTime after using appendRowGroups() - writer.end(new HashMap<>()); - if (wasDataAppended) { - fs.setTimes(targetSyncFilePath, modifTime, -1); - finalPaths.add(targetSyncFilePath); - } else { - fs.delete(targetSyncFilePath, false); - } - } - } - } - return finalPaths; - } - - public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema) - throws IOException { - Configuration conf = new Configuration(); - FileSystem fs = sourcePath.getFileSystem(conf); - - List targetFiles = collectParquetFiles(fs, targetPath); - - List sourceFiles = collectParquetFiles(fs, sourcePath); - - if (targetFiles.isEmpty()) { - throw new IOException("Target directory contains no parquet files to append to."); - } - if (sourceFiles.isEmpty()) { - return targetPath; - } - Path masterTargetFile = null; - for (Path p : targetFiles) { - ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p); - if (!metadata.getBlocks().isEmpty()) { - masterTargetFile = p; - break; - } - } - - if (masterTargetFile == null) { - throw new IOException("Target directory contains no files with valid data blocks."); - } - - for (int i = 1; i < targetFiles.size(); i++) { - long len = fs.getFileStatus(targetFiles.get(i)).getLen(); - System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len); - appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema); - } - for (Path sourceFile : sourceFiles) { - ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile); - if (sourceFooter.getBlocks().isEmpty()) { - System.out.println("SKIPPING: " + sourceFile + " has no data."); - continue; - } - long len = fs.getFileStatus(sourceFile).getLen(); - long len_2 = fs.getFileStatus(masterTargetFile).getLen(); - System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len); - System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2); - appendNewParquetFiles(masterTargetFile, sourceFile, schema); - } - - return masterTargetFile; - } - - private static List collectParquetFiles(FileSystem fs, Path root) throws IOException { - List parquetFiles = new ArrayList<>(); - Configuration conf = fs.getConf(); - if (!fs.exists(root)) return parquetFiles; - - if (fs.getFileStatus(root).isDirectory()) { - RemoteIterator it = fs.listFiles(root, true); - while (it.hasNext()) { - Path p = it.next().getPath(); - if (p.getName().endsWith(".parquet") - && !p.toString().contains("/.") - && !p.toString().contains("/_")) { - ParquetMetadata footer = ParquetFileReader.readFooter(conf, p); - if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) { - parquetFiles.add(p); - } - } - } - } else { - parquetFiles.add(root); - } - return parquetFiles; + public static ParquetDataManager getInstance() { + return INSTANCE; } - public static List mergeDatasetsByPartition( - Path targetRoot, Path sourceRoot, MessageType schema) throws IOException { - List finalPartitionPaths = new ArrayList<>(); - Configuration conf = new Configuration(); - FileSystem fs = targetRoot.getFileSystem(conf); - - Map sourcePartitions = mapPartitions(fs, sourceRoot); - - Map targetPartitions = mapPartitions(fs, targetRoot); - - for (Map.Entry entry : sourcePartitions.entrySet()) { - String partitionKey = entry.getKey(); - Path sourcePartitionPath = entry.getValue(); + public List getParquetFilesMetadataInRange( + Configuration conf, Stream parquetFiles, long startTime, long endTime) { - if (targetPartitions.containsKey(partitionKey)) { - Path targetPartitionPath = targetPartitions.get(partitionKey); - - finalPartitionPaths.add( - appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema)); - } else { - // TODO logic for when a partition exists in source but not in target - // simply merge the files in the respective partitions using appendPartitionedData() that - // takes only one Path/dir - - } - } - return finalPartitionPaths; + return parquetFiles + .filter( + file -> + file.getModificationTime() >= startTime && file.getModificationTime() <= endTime) + .map(file -> new ParquetFileConfig(conf, file.getPath())) + .collect(Collectors.toList()); } - private static Map mapPartitions(FileSystem fs, Path root) throws IOException { - Map partitionMap = new HashMap<>(); - RemoteIterator it = fs.listFiles(root, true); - - while (it.hasNext()) { - Path filePath = it.next().getPath(); - if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) { - Path parent = filePath.getParent(); - String relative = parent.toString().replace(root.toString(), ""); - if (relative.startsWith("/")) relative = relative.substring(1); + public List getParquetFilesMetadataAfterTime( + Configuration conf, Stream parquetFiles, long syncTime) { - partitionMap.put(relative, parent); - } - } - return partitionMap; + return parquetFiles + .filter(file -> file.getModificationTime() >= syncTime) + .map(file -> new ParquetFileConfig(conf, file.getPath())) + .collect(Collectors.toList()); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 33ee08d38..9e7755011 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,31 +18,49 @@ package org.apache.xtable.parquet; +import java.io.IOException; + import lombok.AccessLevel; +import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) +@Builder class ParquetFileConfig { MessageType schema; + ParquetMetadata metadata; long rowGroupIndex; + long modifTime; + long size; CompressionCodecName codec; + Path path; public ParquetFileConfig(Configuration conf, Path file) { + long modifTime = -1L; ParquetMetadata metadata = ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); if (metadata.getBlocks().isEmpty()) { throw new IllegalStateException("Parquet file contains no row groups."); } - + try { + modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime(); + } catch (IOException e) { + e.printStackTrace(); + } + this.path = file; + this.modifTime = modifTime; + this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); + this.metadata = metadata; this.schema = metadata.getFileMetaData().getSchema(); this.rowGroupIndex = metadata.getBlocks().size(); this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 83d2c0a91..8d8a16a2d 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,33 +18,9 @@ package org.apache.xtable.parquet; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.parquet.schema.LogicalTypeAnnotation; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.PrimitiveType; -import org.apache.parquet.schema.Types; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.types.DataTypes; -import org.apache.spark.sql.types.StructField; -import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Disabled; -import org.junit.jupiter.api.Test; public class ITParquetDataManager { private static SparkSession spark; @@ -54,257 +30,6 @@ public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); } - @Test - @Disabled("") - public void testFormParquetFileSinglePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - - List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0"); - String outputPath = fixedPath.toString(); - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the file - updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); - // create new file to append using Spark - List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath); - - long newModifTime = System.currentTimeMillis() - 50000; - - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime); - // check if resultingFiles contains the append data only (through the partition names) - for (org.apache.hadoop.fs.Path p : resultingFiles) { - String pathString = p.toString(); - - boolean isNewData = pathString.contains("year=2026"); - - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - } - } - - @Test - @Disabled("") - public void testAppendParquetFileSinglePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - // In testAppendParquetFileSinglePartition - MessageType schemaParquet = - Types.buildMessage() - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("id") - .required(PrimitiveType.PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.stringType()) - .named("value") - .named("parquet_schema"); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - List data = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12)); - - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1"); - String outputPath = fixedPath.toString(); - String finalAppendFilePath = appendFilePath.toString(); - - df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the table file - updateModificationTimeRecursive(fs, hdfsPath, targetModifTime); - // create new file to append using Spark - List futureDataToSync = - Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync - .coalesce(1) - .write() - .partitionBy("year", "month") - .mode("overwrite") - .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; - // update modifTime for file to append - updateModificationTimeRecursive( - fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime); - // recursively append all files under a specific partition path (e.g. "year=2026/month=12") - // (assuming that outputPath and finalAppendFilePath are same partitions for different files) - org.apache.hadoop.fs.Path outputFile = - ParquetDataManager.appendPartitionedData( - new org.apache.hadoop.fs.Path(outputPath), - new org.apache.hadoop.fs.Path(finalAppendFilePath), - schemaParquet); - // TODO can create big file which can be split if reaches a certain threshold size - - // test whether the appended data can be selectively filtered for incr sync (e.g. get me the - // data to sync only) - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime); - // check if resultingFiles contains the append data only (through the partition names) - for (org.apache.hadoop.fs.Path p : resultingFiles) { - String pathString = p.toString(); - boolean isNewData = pathString.contains("year=2026"); - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - } - } - - @Test - public void testAppendParquetFileMultiplePartition() throws IOException { - Configuration conf = spark.sparkContext().hadoopConfiguration(); - - MessageType schemaParquet = - Types.buildMessage() - .required(PrimitiveType.PrimitiveTypeName.INT32) - .named("id") - .required(PrimitiveType.PrimitiveTypeName.BINARY) - .as(LogicalTypeAnnotation.stringType()) - .named("value") - .named("parquet_schema"); - StructType schema = - DataTypes.createStructType( - new StructField[] { - DataTypes.createStructField("id", DataTypes.IntegerType, false), - DataTypes.createStructField("value", DataTypes.StringType, false), - DataTypes.createStructField("year", DataTypes.IntegerType, false), - DataTypes.createStructField("month", DataTypes.IntegerType, false) - }); - List data = - Arrays.asList( - RowFactory.create(100, "A", 2026, 12), - RowFactory.create(101, "AA", 2026, 12), - RowFactory.create(102, "CB", 2027, 11), - RowFactory.create(103, "BA", 2027, 11)); - - Dataset df = spark.createDataFrame(data, schema); - Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); - Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); - String outputPath = fixedPath.toString(); - String finalAppendFilePath = appendFilePath.toString(); - - df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); - - // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; - org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); - FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); - // set the modification time to the table file - // update modifTime for file to append - // many partitions case - List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(outputPath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, targetModifTime); - } - } - // create new file to append using Spark - List futureDataToSync = - Arrays.asList( - RowFactory.create(101, "A", 2026, 12), - RowFactory.create(301, "D", 2027, 11), - RowFactory.create(302, "DA", 2027, 11)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); - dfToSync - .coalesce(1) // since we are exec this test on low data volume - .write() - .partitionBy("year", "month") - .mode("overwrite") - .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - } - // recursively append all files under a specific partition path (e.g. "year=2026/month=12") - // (assuming that outputPath and finalAppendFilePath are same partitions for different files) - // the result contains all partition paths with the parquet files compacted - List outputPaths = - ParquetDataManager.mergeDatasetsByPartition( - new org.apache.hadoop.fs.Path(outputPath), - new org.apache.hadoop.fs.Path(finalAppendFilePath), - schemaParquet); - // reset time FOR TESTING PURPOSES ONLY - // (in real, newModifTime time is the appending time as set by the appendRow() in the - // DataManager) - // THIS IS NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED - for (org.apache.hadoop.fs.Path oPath : outputPaths) { - fs.setTimes(oPath, newModifTime, -1); - } - // TODO can create big file which can be split if reaches a certain threshold size - // implement bin-packing function for large files - - // test whether the appended data can be selectively filtered for incr sync (e.g. get me the - // data to sync only) - List> finalPaths = new ArrayList<>(); - for (org.apache.hadoop.fs.Path resPath : outputPaths) { - List resultingFiles = - ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime); - finalPaths.add(resultingFiles); - } - // check if resultingFiles contains the append data only - for (List fPaths : finalPaths) { - for (org.apache.hadoop.fs.Path p : fPaths) { - String pathString = p.toString(); - // should be TRUE - boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027"); - // should be FALSE - boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025"); - long modTime = fs.getFileStatus(p).getModificationTime(); - // test for one partition value - assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString); - assertTrue(isNewData, "Path should belong to appended data: " + pathString); - assertFalse(isOldData, "Path should NOT belong to old data: " + pathString); - } - } - } - - private void updateModificationTimeRecursive( - FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { - org.apache.hadoop.fs.RemoteIterator it = - fs.listFiles(path, true); - while (it.hasNext()) { - org.apache.hadoop.fs.LocatedFileStatus status = it.next(); - if (status.getPath().getName().endsWith(".parquet")) { - fs.setTimes(status.getPath(), time, -1); - } - } - } - @AfterAll public static void tearDown() { if (spark != null) { From ff809d7b6aa294b234ec679f693dc8f058d25898 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 22:36:47 +0100 Subject: [PATCH 40/54] fix CI --- .../main/java/org/apache/xtable/parquet/ParquetFileConfig.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 9e7755011..7a038124a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -21,6 +21,7 @@ import java.io.IOException; import lombok.AccessLevel; +import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; @@ -35,6 +36,7 @@ @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @Builder +@AllArgsConstructor class ParquetFileConfig { MessageType schema; ParquetMetadata metadata; From e06368f9d92a3ac4553352dd01e65e1464c7cc39 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:05:15 +0100 Subject: [PATCH 41/54] fix CI, refactoring and isIncrementalSyncSafeFrom implementation --- .../parquet/ParquetConversionSource.java | 78 ++++++++++++++----- 1 file changed, 57 insertions(+), 21 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 26d758026..935a5c942 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -24,11 +24,11 @@ import java.nio.file.Paths; import java.time.Instant; import java.util.*; -import java.util.stream.Collectors; import java.util.stream.Stream; import lombok.Builder; import lombok.NonNull; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; @@ -52,6 +52,7 @@ import org.apache.xtable.spi.extractor.ConversionSource; @Builder +@Log4j2 public class ParquetConversionSource implements ConversionSource { private static final ParquetSchemaExtractor schemaExtractor = @@ -129,13 +130,13 @@ public InternalTable getTable(Long modificationTime) { return createInternalTableFromFile(file); } - private Stream getInternalDataFiles(Stream parquetFiles) { + private Stream getInternalDataFiles(Stream parquetFiles) { return parquetFiles.map( file -> InternalDataFile.builder() .physicalPath(file.getPath().toString()) .fileFormat(FileFormat.APACHE_PARQUET) - .fileSizeBytes(file.getLen()) + .fileSizeBytes(file.getSize()) .partitionValues( partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( @@ -144,14 +145,14 @@ private Stream getInternalDataFiles(Stream hadoopConf, file.getPath()), file.getPath().toString())), HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) - .lastModified(file.getModificationTime()) + .lastModified(file.getModifTime()) .columnStats( parquetStatsExtractor.getColumnStatsForaFile( parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()))) .build()); } - private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parquetFile) { + private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) { return InternalDataFile.builder() .physicalPath(parquetFile.getPath().toString()) .partitionValues( @@ -162,8 +163,8 @@ private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parque hadoopConf, parquetFile.getPath()), parquetFile.getPath().toString())), basePath)) - .lastModified(parquetFile.getModificationTime()) - .fileSizeBytes(parquetFile.getLen()) + .lastModified(parquetFile.getModifTime()) + .fileSizeBytes(parquetFile.getSize()) .columnStats( parquetStatsExtractor.getColumnStatsForaFile( parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath()))) @@ -182,16 +183,15 @@ public TableChange getTableChangeForCommit(Long modificationTime) { Stream parquetFiles = getParquetFiles(hadoopConf, basePath); Set addedInternalDataFiles = new HashSet<>(); - List tableChangesAfter = - parquetFiles - .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime) - .collect(Collectors.toList()); + List filesMetadata = + parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( + hadoopConf, parquetFiles, modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( hadoopConf, parquetFiles, modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); - for (FileStatus tableStatus : tableChangesAfter) { - InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus); + for (ParquetFileConfig fileMetadata : filesMetadata) { + InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); addedInternalDataFiles.add(currentDataFile); } @@ -219,16 +219,18 @@ public InternalTable getCurrentTable() { */ @Override public InternalSnapshot getCurrentSnapshot() { - // to avoid consume the stream call the method twice to return the same stream of parquet files Stream internalDataFiles = - getInternalDataFiles(getParquetFiles(hadoopConf, basePath)); - Stream parquetFilesMetadata = - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); - InternalTable table = getMostRecentTable(parquetFilesMetadata); + getInternalDataFiles( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + InternalTable table = + getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); return InternalSnapshot.builder() .table(table) .sourceIdentifier( - getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime())) + getCommitIdentifier( + getMostRecentParquetFile( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + .getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } @@ -262,8 +264,42 @@ private Stream getParquetFiles(Configuration hadoopConf, Stri } @Override - public boolean isIncrementalSyncSafeFrom(Instant instant) { - return false; + public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { + Stream parquetFilesMetadata = + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + LongSummaryStatistics stats = + parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); + + if (stats.getCount() == 0) { + log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName); + return false; + } + + long earliestModTime = stats.getMin(); + long latestModTime = stats.getMax(); + + if (timeInMillis.toEpochMilli() > latestModTime) { + log.warn( + "Instant {} is in the future relative to the data. Latest file time: {}", + timeInMillis.toEpochMilli(), + Instant.ofEpochMilli(latestModTime)); + return false; + } + + if (earliestModTime > timeInMillis.toEpochMilli()) { + log.warn( + "Incremental sync is not safe. Earliest available metadata (time={}) is newer " + + "than requested instant {}. Data history has been truncated.", + Instant.ofEpochMilli(earliestModTime), + timeInMillis.toEpochMilli()); + return false; + } + + log.info( + "Incremental sync is safe from instant {} for table {}", + timeInMillis.toEpochMilli(), + tableName); + return true; } @Override From 7cdccd01e9012b6af74ce6b88d171555d784d80a Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:44:36 +0100 Subject: [PATCH 42/54] added test --- .../xtable/parquet/ITParquetDataManager.java | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 8d8a16a2d..db17958ac 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,18 +18,132 @@ package org.apache.xtable.parquet; +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { private static SparkSession spark; + private ParquetConversionSource conversionSource; @BeforeAll public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); } + @Test + public void testAppendParquetFileMultiplePartition() throws IOException { + + Configuration conf = spark.sparkContext().hadoopConfiguration(); + + StructType schema = + DataTypes.createStructType( + new StructField[] { + DataTypes.createStructField("id", DataTypes.IntegerType, false), + DataTypes.createStructField("value", DataTypes.StringType, false), + DataTypes.createStructField("year", DataTypes.IntegerType, false), + DataTypes.createStructField("month", DataTypes.IntegerType, false) + }); + List data = + Arrays.asList( + RowFactory.create(100, "A", 2026, 12), + RowFactory.create(101, "AA", 2026, 12), + RowFactory.create(102, "CB", 2027, 11), + RowFactory.create(103, "BA", 2027, 11)); + + Dataset df = spark.createDataFrame(data, schema); + Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); + Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); + String outputPath = fixedPath.toString(); + String finalAppendFilePath = appendFilePath.toString(); + + df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); + + // test find files to sync + long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); + FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); + // set the modification time to the table file + // update modifTime for file to append + // many partitions case + List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, targetModifTime); + } + } + // create new file to append using Spark + List futureDataToSync = + Arrays.asList( + RowFactory.create(101, "A", 2026, 12), + RowFactory.create(301, "D", 2027, 11), + RowFactory.create(302, "DA", 2027, 11)); + Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + dfToSync + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("overwrite") + .parquet(finalAppendFilePath); + long newModifTime = System.currentTimeMillis() - 50000; + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); + if (fs.exists(partitionPath)) { + updateModificationTimeRecursive(fs, partitionPath, newModifTime); + } + } + Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); + dfWithNewTimes + .coalesce(1) + .write() + .partitionBy("year", "month") + .mode("append") + .parquet(outputPath); + fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); + // conversionSource operations + InternalTable result = conversionSource.getTable(newModifTime); + assertEquals(newModifTime, result.getLatestCommitTime()); + assertNotNull(result); + assertEquals("parquet_table_test_2", result.getName()); + assertEquals(TableFormat.PARQUET, result.getTableFormat()); + assertNotNull(result.getReadSchema()); + } + + private void updateModificationTimeRecursive( + FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException { + org.apache.hadoop.fs.RemoteIterator it = + fs.listFiles(path, true); + while (it.hasNext()) { + org.apache.hadoop.fs.LocatedFileStatus status = it.next(); + if (status.getPath().getName().endsWith(".parquet")) { + fs.setTimes(status.getPath(), time, -1); + } + } + } + @AfterAll public static void tearDown() { if (spark != null) { From b919146226e12768522e27ae4c0c3e1af43cbe35 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 12 Jan 2026 23:52:08 +0100 Subject: [PATCH 43/54] added test: init conversionSource instance --- .../org/apache/xtable/parquet/ITParquetDataManager.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index db17958ac..30aa2f6e2 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -124,6 +124,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .parquet(outputPath); fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations + conversionSource = + ParquetConversionSource.builder() + .tableName("parquet_table_test_2") + .basePath(fixedPath.toString()) + .hadoopConf(new Configuration()) // Required due to @NonNull + .partitionValueExtractor(null) + .partitionSpecExtractor(null) + .build(); InternalTable result = conversionSource.getTable(newModifTime); assertEquals(newModifTime, result.getLatestCommitTime()); assertNotNull(result); From 8ff5aa63d6fb7aaefe6fa8888ea78afa2defe60c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 00:36:34 +0100 Subject: [PATCH 44/54] added test: init conversionSource instance: fix error --- .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 30aa2f6e2..192e7ab3b 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -127,8 +127,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { conversionSource = ParquetConversionSource.builder() .tableName("parquet_table_test_2") - .basePath(fixedPath.toString()) - .hadoopConf(new Configuration()) // Required due to @NonNull + .basePath(fixedPath.toAbsolutePath().toUri().toString()) + .hadoopConf(new Configuration()) .partitionValueExtractor(null) .partitionSpecExtractor(null) .build(); From db9d7aba0816c8d472b98344818f6526f7683b85 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 02:44:29 +0100 Subject: [PATCH 45/54] added test: fix error --- .../parquet/ParquetConversionSource.java | 13 ++-- .../xtable/parquet/ITParquetDataManager.java | 67 ++++++++++++++++--- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 935a5c942..7cc66dcea 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -159,15 +159,11 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionValueExtractor.extractPartitionValues( partitionSpecExtractor.spec( partitionValueExtractor.extractSchemaForParquetPartitions( - parquetMetadataExtractor.readParquetMetadata( - hadoopConf, parquetFile.getPath()), - parquetFile.getPath().toString())), + parquetFile.getMetadata(), parquetFile.getPath().toString())), basePath)) .lastModified(parquetFile.getModifTime()) .fileSizeBytes(parquetFile.getSize()) - .columnStats( - parquetStatsExtractor.getColumnStatsForaFile( - parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath()))) + .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) .build(); } @@ -180,15 +176,14 @@ public CommitsBacklog getCommitsBacklog(InstantsForIncrementalSync syncIns @Override public TableChange getTableChangeForCommit(Long modificationTime) { - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); Set addedInternalDataFiles = new HashSet<>(); List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, parquetFiles, modificationTime); + hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, parquetFiles, modificationTime); + hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 192e7ab3b..15bccd721 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -23,11 +23,15 @@ import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; +import java.time.Instant; import java.util.Arrays; import java.util.List; +import java.util.Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.RowFactory; @@ -39,16 +43,24 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.apache.xtable.conversion.SourceTable; +import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { private static SparkSession spark; - private ParquetConversionSource conversionSource; + private static ParquetConversionSourceProvider conversionSourceProvider; + public static final String PARTITION_FIELD_SPEC_CONFIG = + "xtable.parquet.source.partition_field_spec_config"; @BeforeAll public static void setup() { spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate(); + conversionSourceProvider = new ParquetConversionSourceProvider(); + Configuration hadoopConf = new Configuration(); + hadoopConf.set("fs.defaultFS", "file:///"); + conversionSourceProvider.init(hadoopConf); } @Test @@ -80,13 +92,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync - long targetModifTime = System.currentTimeMillis() - 360000; + org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath); FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf); // set the modification time to the table file // update modifTime for file to append // many partitions case List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); + long targetModifTime = System.currentTimeMillis() - 360000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(outputPath, partition); @@ -107,14 +120,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .partitionBy("year", "month") .mode("overwrite") .parquet(finalAppendFilePath); - long newModifTime = System.currentTimeMillis() - 50000; + /*long newModifTime = System.currentTimeMillis() - 50000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); if (fs.exists(partitionPath)) { updateModificationTimeRecursive(fs, partitionPath, newModifTime); } - } + }*/ Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); dfWithNewTimes .coalesce(1) @@ -124,20 +137,52 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .parquet(outputPath); fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations - conversionSource = - ParquetConversionSource.builder() - .tableName("parquet_table_test_2") + Properties sourceProperties = new Properties(); + String partitionConfig = "id:MONTH:year=yyyy/month=MM"; + sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); + SourceTable tableConfig = + SourceTable.builder() + .name("parquet_table_test_2") .basePath(fixedPath.toAbsolutePath().toUri().toString()) - .hadoopConf(new Configuration()) - .partitionValueExtractor(null) - .partitionSpecExtractor(null) + .additionalProperties(sourceProperties) + .formatName(TableFormat.PARQUET) .build(); + + ParquetConversionSource conversionSource = + conversionSourceProvider.getConversionSourceInstance(tableConfig); + + long newModifTime = System.currentTimeMillis() - 50000; + + for (String partition : newPartitions) { + org.apache.hadoop.fs.Path partitionPath = + new org.apache.hadoop.fs.Path(outputPath, partition); + + RemoteIterator it = fs.listFiles(partitionPath, false); + while (it.hasNext()) { + LocatedFileStatus fileStatus = it.next(); + + if (fileStatus.getModificationTime() > newModifTime) { + fs.setTimes(fileStatus.getPath(), newModifTime, -1); + } else { + + fs.setTimes(fileStatus.getPath(), targetModifTime, -1); + } + } + + fs.setTimes(partitionPath, newModifTime, -1); + } + InternalTable result = conversionSource.getTable(newModifTime); - assertEquals(newModifTime, result.getLatestCommitTime()); + assertEquals( + Instant.ofEpochMilli(newModifTime).toString(), result.getLatestCommitTime().toString()); assertNotNull(result); assertEquals("parquet_table_test_2", result.getName()); assertEquals(TableFormat.PARQUET, result.getTableFormat()); assertNotNull(result.getReadSchema()); + // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); + InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); + // assertNotNull(changes); + assertNotNull(snapshot); } private void updateModificationTimeRecursive( From 324d703207ea0c89615711ccfd92fbd85cfab39c Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 12:43:59 +0100 Subject: [PATCH 46/54] more tests + bug fixes and reformatting --- .../org/apache/xtable/parquet/ParquetConversionSource.java | 7 ++++++- .../org/apache/xtable/parquet/ITParquetDataManager.java | 5 +++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 7cc66dcea..24b86cac5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -160,7 +160,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionSpecExtractor.spec( partitionValueExtractor.extractSchemaForParquetPartitions( parquetFile.getMetadata(), parquetFile.getPath().toString())), - basePath)) + HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) .lastModified(parquetFile.getModifTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) @@ -191,6 +191,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { } return TableChange.builder() + .sourceIdentifier( + getCommitIdentifier( + getMostRecentParquetFile( + getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + .getModifTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) .build(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 15bccd721..90f28808f 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -46,6 +46,7 @@ import org.apache.xtable.conversion.SourceTable; import org.apache.xtable.model.InternalSnapshot; import org.apache.xtable.model.InternalTable; +import org.apache.xtable.model.TableChange; import org.apache.xtable.model.storage.TableFormat; public class ITParquetDataManager { @@ -179,10 +180,10 @@ public void testAppendParquetFileMultiplePartition() throws IOException { assertEquals("parquet_table_test_2", result.getName()); assertEquals(TableFormat.PARQUET, result.getTableFormat()); assertNotNull(result.getReadSchema()); - // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); InternalSnapshot snapshot = conversionSource.getCurrentSnapshot(); - // assertNotNull(changes); assertNotNull(snapshot); + TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); + assertNotNull(changes); } private void updateModificationTimeRecursive( From 3c453e69355f4ad0c0d2f487d45ca912fd620f54 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 13:17:28 +0100 Subject: [PATCH 47/54] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 855894ec2..be4c56dcb 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ 0.2.0-SNAPSHOT 2025-01-01T00:00:00Z - 8 + 17 3.13.0 1.11.4 2.22.0 From e4c0b4cc2470d5c425193a85c87d95c552e170ad Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 13:20:42 +0100 Subject: [PATCH 48/54] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index be4c56dcb..855894ec2 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ 0.2.0-SNAPSHOT 2025-01-01T00:00:00Z - 17 + 8 3.13.0 1.11.4 2.22.0 From 4315282bb23b0caec4aac5b508f631065c75bd47 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 15:53:08 +0100 Subject: [PATCH 49/54] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 855894ec2..ba5205b65 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 3.2.12.Final + 3.2.9.Final 4.9.3 0.16 From 14179291503de7ad51b48657f4b8c0023c054a34 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 16:18:49 +0100 Subject: [PATCH 50/54] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ba5205b65..739ac5cb8 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 3.2.9.Final + 2.16.12.Final 4.9.3 0.16 From 7620c013a93c00da6ac9f783eaa62c61b8bd271d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Tue, 13 Jan 2026 16:46:16 +0100 Subject: [PATCH 51/54] CI fix --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 739ac5cb8..855894ec2 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ 2.12.0 quarkus-bom io.quarkus.platform - 2.16.12.Final + 3.2.12.Final 4.9.3 0.16 From cd66151d9e1fa9368c385e42c747bb2f4ef75fea Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sat, 17 Jan 2026 00:35:00 +0100 Subject: [PATCH 52/54] CI fix --- .../parquet/ParquetConversionSource.java | 114 ++++++------------ .../xtable/parquet/ParquetDataManager.java | 61 ++++++++++ .../xtable/parquet/ITParquetDataManager.java | 10 +- 3 files changed, 98 insertions(+), 87 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index 24b86cac5..ad5422e0a 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -18,10 +18,6 @@ package org.apache.xtable.parquet; -import java.io.IOException; -import java.net.URI; -import java.net.URISyntaxException; -import java.nio.file.Paths; import java.time.Instant; import java.util.*; import java.util.stream.Stream; @@ -33,11 +29,9 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.util.functional.RemoteIterators; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; -import org.apache.xtable.exception.ReadException; import org.apache.xtable.hudi.*; import org.apache.xtable.hudi.HudiPathUtils; import org.apache.xtable.model.*; @@ -96,37 +90,15 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) .build(); } - public Stream getConfigsFromStream( - Stream fileStream, Configuration conf) { - - return fileStream.map( - fileStatus -> { - Path path = fileStatus.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(fileStatus.getLen()) - .modifTime(fileStatus.getModificationTime()) - .rowGroupIndex(0L) - .codec( - metadata.getBlocks().isEmpty() - ? null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - }); - } - @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - Stream parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf); - ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime); + Stream parquetFiles = + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + Stream parquetFilesMetadata = + parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf); + ParquetFileConfig file = + parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime); return createInternalTableFromFile(file); } @@ -180,10 +152,14 @@ public TableChange getTableChangeForCommit(Long modificationTime) { List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); + hadoopConf, + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime); + hadoopConf, + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + modificationTime); InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); @@ -193,8 +169,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { return TableChange.builder() .sourceIdentifier( getCommitIdentifier( - getMostRecentParquetFile( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + parquetDataManagerExtractor + .getMostRecentParquetFile( + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + hadoopConf)) .getModifTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) @@ -202,71 +181,48 @@ public TableChange getTableChangeForCommit(Long modificationTime) { } private InternalTable getMostRecentTable(Stream parquetFiles) { - ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles); + ParquetFileConfig latestFile = + parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { - Stream parquetFiles = getParquetFiles(hadoopConf, basePath); - return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf)); + Stream parquetFiles = + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + return getMostRecentTable( + parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf)); } - /** - * get current snapshot - * - * @return - */ @Override public InternalSnapshot getCurrentSnapshot() { Stream internalDataFiles = getInternalDataFiles( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); InternalTable table = - getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)); + getMostRecentTable( + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); return InternalSnapshot.builder() .table(table) .sourceIdentifier( getCommitIdentifier( - getMostRecentParquetFile( - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf)) + parquetDataManagerExtractor + .getMostRecentParquetFile( + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), + hadoopConf)) .getModifTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } - private ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { - return parquetFiles - .max(Comparator.comparing(ParquetFileConfig::getModifTime)) - .orElseThrow(() -> new IllegalStateException("No files found")); - } - - private ParquetFileConfig getParquetFileAt( - Stream parquetConfigs, long targetTime) { - - return parquetConfigs - .filter(config -> config.getModifTime() >= targetTime) - .findFirst() - .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); - } - - private Stream getParquetFiles(Configuration hadoopConf, String basePath) { - try { - FileSystem fs = FileSystem.get(hadoopConf); - URI uriBasePath = new URI(basePath); - String parentPath = Paths.get(uriBasePath).toString(); - RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); - return RemoteIterators.toList(iterator).stream() - .filter(file -> file.getPath().getName().endsWith("parquet")); - } catch (IOException | URISyntaxException e) { - throw new ReadException("Unable to read files from file system", e); - } - } - @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { Stream parquetFilesMetadata = - getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf); + parquetDataManagerExtractor.getConfigsFromStream( + parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf); LongSummaryStatistics stats = parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index 5d09352ea..a018e037c 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -18,6 +18,10 @@ package org.apache.xtable.parquet; +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -26,6 +30,10 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; +import org.apache.hadoop.util.functional.RemoteIterators; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; + +import org.apache.xtable.exception.ReadException; /** * Manages Parquet file operations including reading, writing, and partition discovery and path @@ -43,6 +51,59 @@ public static ParquetDataManager getInstance() { return INSTANCE; } + public ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { + return parquetFiles + .max(Comparator.comparing(ParquetFileConfig::getModifTime)) + .orElseThrow(() -> new IllegalStateException("No files found")); + } + + public ParquetFileConfig getParquetFileAt( + Stream parquetConfigs, long targetTime) { + + return parquetConfigs + .filter(config -> config.getModifTime() >= targetTime) + .findFirst() + .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); + } + + public Stream getParquetFiles(Configuration hadoopConf, String basePath) { + try { + FileSystem fs = FileSystem.get(hadoopConf); + URI uriBasePath = new URI(basePath); + String parentPath = Paths.get(uriBasePath).toString(); + RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); + return RemoteIterators.toList(iterator).stream() + .filter(file -> file.getPath().getName().endsWith("parquet")); + } catch (IOException | URISyntaxException e) { + throw new ReadException("Unable to read files from file system", e); + } + } + + public Stream getConfigsFromStream( + Stream fileStream, Configuration conf) { + + return fileStream.map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + .modifTime(fileStatus.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); + } + public List getParquetFilesMetadataInRange( Configuration conf, Stream parquetFiles, long startTime, long endTime) { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 90f28808f..3aed202e3 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -121,14 +121,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { .partitionBy("year", "month") .mode("overwrite") .parquet(finalAppendFilePath); - /*long newModifTime = System.currentTimeMillis() - 50000; - for (String partition : newPartitions) { - org.apache.hadoop.fs.Path partitionPath = - new org.apache.hadoop.fs.Path(finalAppendFilePath, partition); - if (fs.exists(partitionPath)) { - updateModificationTimeRecursive(fs, partitionPath, newModifTime); - } - }*/ + Dataset dfWithNewTimes = spark.read().parquet(finalAppendFilePath); dfWithNewTimes .coalesce(1) @@ -139,6 +132,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations Properties sourceProperties = new Properties(); + // TODO use timestamp col instead (as done in ITParquetConversionSource) String partitionConfig = "id:MONTH:year=yyyy/month=MM"; sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); SourceTable tableConfig = From 4e4c5cba21ae6396f9626f27fe60139d7313815d Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Sun, 18 Jan 2026 21:24:12 +0100 Subject: [PATCH 53/54] merge ready fixes & tests + refactoring & reformatting --- .../xtable/parquet/ParquetDataManager.java | 8 +++----- .../xtable/parquet/ITParquetDataManager.java | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index a018e037c..ff59f156b 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -36,12 +36,10 @@ import org.apache.xtable.exception.ReadException; /** - * Manages Parquet file operations including reading, writing, and partition discovery and path - * construction. + * Manages Parquet File's Metadata * - *

This class provides functions to handle Parquet metadata, validate schemas during appends, and - * calculate target partition directories based on file modification times and defined partition - * fields. + *

This class provides functions to handle Parquet metadata, creating metadata objects from + * parquet files and filtering the files based on the modification times. */ @Log4j2 public class ParquetDataManager { diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index 3aed202e3..bb32a5024 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -18,6 +18,7 @@ package org.apache.xtable.parquet; +import static org.apache.spark.sql.functions.expr; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; @@ -84,12 +85,12 @@ public void testAppendParquetFileMultiplePartition() throws IOException { RowFactory.create(102, "CB", 2027, 11), RowFactory.create(103, "BA", 2027, 11)); - Dataset df = spark.createDataFrame(data, schema); + Dataset dfInit = spark.createDataFrame(data, schema); Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2"); Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2"); String outputPath = fixedPath.toString(); String finalAppendFilePath = appendFilePath.toString(); - + Dataset df = dfInit.withColumn("full_date", expr("make_date(year, month, 1)")); df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath); // test find files to sync @@ -101,6 +102,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { // many partitions case List newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11"); long targetModifTime = System.currentTimeMillis() - 360000; + long newModifTime = System.currentTimeMillis() - 50000; + long testTime = System.currentTimeMillis() - 90000; // between two prev times for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(outputPath, partition); @@ -114,7 +117,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException { RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11), RowFactory.create(302, "DA", 2027, 11)); - Dataset dfToSync = spark.createDataFrame(futureDataToSync, schema); + Dataset dfToSyncInit = spark.createDataFrame(futureDataToSync, schema); + Dataset dfToSync = dfToSyncInit.withColumn("full_date", expr("make_date(year, month, 1)")); dfToSync .coalesce(1) .write() @@ -132,8 +136,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true); // conversionSource operations Properties sourceProperties = new Properties(); - // TODO use timestamp col instead (as done in ITParquetConversionSource) - String partitionConfig = "id:MONTH:year=yyyy/month=MM"; + String partitionConfig = "full_date:MONTH:year=yyyy/month=MM"; sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig); SourceTable tableConfig = SourceTable.builder() @@ -146,7 +149,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { ParquetConversionSource conversionSource = conversionSourceProvider.getConversionSourceInstance(tableConfig); - long newModifTime = System.currentTimeMillis() - 50000; + // long newModifTime = System.currentTimeMillis() - 50000; for (String partition : newPartitions) { org.apache.hadoop.fs.Path partitionPath = @@ -178,6 +181,10 @@ public void testAppendParquetFileMultiplePartition() throws IOException { assertNotNull(snapshot); TableChange changes = conversionSource.getTableChangeForCommit(newModifTime); assertNotNull(changes); + Instant instantBeforeFirstSnapshot = + Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); + assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime); + assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); } private void updateModificationTimeRecursive( From 212e50a941ae18b6cb7823d4675f1a0c79fa0c96 Mon Sep 17 00:00:00 2001 From: Selim Soufargi Date: Mon, 19 Jan 2026 21:44:53 +0100 Subject: [PATCH 54/54] optimization: read metadata once for target file, read parquet files once, reformatting --- .../parquet/ParquetConversionSource.java | 111 +++++++++--------- .../xtable/parquet/ParquetDataManager.java | 100 ++++++++++------ .../xtable/parquet/ParquetFileConfig.java | 25 ++-- .../xtable/parquet/ITParquetDataManager.java | 2 +- 4 files changed, 134 insertions(+), 104 deletions(-) diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java index ad5422e0a..10eccdf40 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java @@ -68,6 +68,16 @@ public class ParquetConversionSource implements ConversionSource { private final String basePath; @NonNull private final Configuration hadoopConf; + private List parquetFiles; + + // helper method to ensure files are loaded once + private List getOrLoadParquetFiles() { + if (this.parquetFiles == null) { + this.parquetFiles = parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); + } + return this.parquetFiles; + } + private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) { ParquetMetadata parquetMetadata = parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath()); @@ -86,42 +96,38 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) .layoutStrategy(dataLayoutStrategy) .partitioningFields(partitionFields) .readSchema(schema) - .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime())) + .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime())) .build(); } @Override public InternalTable getTable(Long modificationTime) { // get parquetFile at specific time modificationTime - Stream parquetFiles = - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); - Stream parquetFilesMetadata = - parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf); - ParquetFileConfig file = - parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime); + LocatedFileStatus parquetFile = + parquetDataManagerExtractor.getParquetDataFileAt(getOrLoadParquetFiles(), modificationTime); + ParquetFileConfig file = parquetDataManagerExtractor.getConfigFromFile(parquetFile, hadoopConf); return createInternalTableFromFile(file); } private Stream getInternalDataFiles(Stream parquetFiles) { return parquetFiles.map( - file -> - InternalDataFile.builder() - .physicalPath(file.getPath().toString()) - .fileFormat(FileFormat.APACHE_PARQUET) - .fileSizeBytes(file.getSize()) - .partitionValues( - partitionValueExtractor.extractPartitionValues( - partitionSpecExtractor.spec( - partitionValueExtractor.extractSchemaForParquetPartitions( - parquetMetadataExtractor.readParquetMetadata( - hadoopConf, file.getPath()), - file.getPath().toString())), - HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) - .lastModified(file.getModifTime()) - .columnStats( - parquetStatsExtractor.getColumnStatsForaFile( - parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()))) - .build()); + file -> { + ParquetMetadata metadata = + parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath()); + return InternalDataFile.builder() + .physicalPath(file.getPath().toString()) + .fileFormat(FileFormat.APACHE_PARQUET) + .fileSizeBytes(file.getSize()) + .partitionValues( + partitionValueExtractor.extractPartitionValues( + partitionSpecExtractor.spec( + partitionValueExtractor.extractSchemaForParquetPartitions( + metadata, file.getPath().toString())), + HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath()))) + .lastModified(file.getModificationTime()) + .columnStats(parquetStatsExtractor.getColumnStatsForaFile(metadata)) + .build(); + }); } private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) { @@ -133,7 +139,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig partitionValueExtractor.extractSchemaForParquetPartitions( parquetFile.getMetadata(), parquetFile.getPath().toString())), HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath()))) - .lastModified(parquetFile.getModifTime()) + .lastModified(parquetFile.getModificationTime()) .fileSizeBytes(parquetFile.getSize()) .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata())) .build(); @@ -152,15 +158,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) { List filesMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - modificationTime); + hadoopConf, getOrLoadParquetFiles(), modificationTime); List tableChangesAfterMetadata = parquetDataManagerExtractor.getParquetFilesMetadataAfterTime( - hadoopConf, - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - modificationTime); - InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream()); + hadoopConf, getOrLoadParquetFiles(), modificationTime); + InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterMetadata.stream()); for (ParquetFileConfig fileMetadata : filesMetadata) { InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata); addedInternalDataFiles.add(currentDataFile); @@ -170,50 +172,44 @@ public TableChange getTableChangeForCommit(Long modificationTime) { .sourceIdentifier( getCommitIdentifier( parquetDataManagerExtractor - .getMostRecentParquetFile( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - hadoopConf)) - .getModifTime())) + .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) + .getModificationTime())) .tableAsOfChange(internalTable) .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build()) .build(); } - private InternalTable getMostRecentTable(Stream parquetFiles) { + private InternalTable getMostRecentTable( + List parquetFiles, Configuration conf) { + ParquetFileConfig latestFile = + parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles, conf); + return createInternalTableFromFile(latestFile); + } + + private InternalTable getMostRecentTableConfig(Stream parquetFiles) { ParquetFileConfig latestFile = - parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles); + parquetDataManagerExtractor.getMostRecentParquetFileConfig(parquetFiles); return createInternalTableFromFile(latestFile); } @Override public InternalTable getCurrentTable() { - Stream parquetFiles = - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath); - return getMostRecentTable( - parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf)); + return getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); } @Override public InternalSnapshot getCurrentSnapshot() { Stream internalDataFiles = getInternalDataFiles( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); - InternalTable table = - getMostRecentTable( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf)); + parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf)); + InternalTable table = getMostRecentTable(getOrLoadParquetFiles(), hadoopConf); return InternalSnapshot.builder() .table(table) .sourceIdentifier( getCommitIdentifier( parquetDataManagerExtractor - .getMostRecentParquetFile( - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), - hadoopConf)) - .getModifTime())) + .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf) + .getModificationTime())) .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles)) .build(); } @@ -221,10 +217,9 @@ public InternalSnapshot getCurrentSnapshot() { @Override public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) { Stream parquetFilesMetadata = - parquetDataManagerExtractor.getConfigsFromStream( - parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf); + parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf); LongSummaryStatistics stats = - parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics(); + parquetFilesMetadata.mapToLong(ParquetFileConfig::getModificationTime).summaryStatistics(); if (stats.getCount() == 0) { log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName); diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java index ff59f156b..c6372c198 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java @@ -49,57 +49,89 @@ public static ParquetDataManager getInstance() { return INSTANCE; } - public ParquetFileConfig getMostRecentParquetFile(Stream parquetFiles) { + public ParquetFileConfig getMostRecentParquetFile( + List parquetFiles, Configuration conf) { + LocatedFileStatus file = + parquetFiles.stream() + .max(Comparator.comparing(LocatedFileStatus::getModificationTime)) + .orElseThrow(() -> new IllegalStateException("No files found")); + return getConfigFromFile(file, conf); + } + + public ParquetFileConfig getMostRecentParquetFileConfig(Stream parquetFiles) { return parquetFiles - .max(Comparator.comparing(ParquetFileConfig::getModifTime)) + .max(Comparator.comparing(ParquetFileConfig::getModificationTime)) .orElseThrow(() -> new IllegalStateException("No files found")); } - public ParquetFileConfig getParquetFileAt( - Stream parquetConfigs, long targetTime) { + public LocatedFileStatus getParquetDataFileAt( + List parquetFiles, long targetTime) { - return parquetConfigs - .filter(config -> config.getModifTime() >= targetTime) + return parquetFiles.stream() + .filter(file -> file.getModificationTime() >= targetTime) .findFirst() .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime)); } - public Stream getParquetFiles(Configuration hadoopConf, String basePath) { + public ParquetFileConfig getConfigFromFile(LocatedFileStatus file, Configuration conf) { + + Path path = file.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(file.getLen()) + .modificationTime(file.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + } + + public List getParquetFiles(Configuration hadoopConf, String basePath) { try { FileSystem fs = FileSystem.get(hadoopConf); URI uriBasePath = new URI(basePath); String parentPath = Paths.get(uriBasePath).toString(); RemoteIterator iterator = fs.listFiles(new Path(parentPath), true); return RemoteIterators.toList(iterator).stream() - .filter(file -> file.getPath().getName().endsWith("parquet")); + .filter(file -> file.getPath().getName().endsWith("parquet")) + .collect(Collectors.toList()); } catch (IOException | URISyntaxException e) { throw new ReadException("Unable to read files from file system", e); } } public Stream getConfigsFromStream( - Stream fileStream, Configuration conf) { - - return fileStream.map( - fileStatus -> { - Path path = fileStatus.getPath(); - - ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); - - return ParquetFileConfig.builder() - .schema(metadata.getFileMetaData().getSchema()) - .metadata(metadata) - .path(path) - .size(fileStatus.getLen()) - .modifTime(fileStatus.getModificationTime()) - .rowGroupIndex(metadata.getBlocks().size()) - .codec( - metadata.getBlocks().isEmpty() - ? null - : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) - .build(); - }); + List fileStream, Configuration conf) { + + return fileStream.stream() + .map( + fileStatus -> { + Path path = fileStatus.getPath(); + + ParquetMetadata metadata = + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path); + + return ParquetFileConfig.builder() + .schema(metadata.getFileMetaData().getSchema()) + .metadata(metadata) + .path(path) + .size(fileStatus.getLen()) + .modificationTime(fileStatus.getModificationTime()) + .rowGroupIndex(metadata.getBlocks().size()) + .codec( + metadata.getBlocks().isEmpty() + ? null + : metadata.getBlocks().get(0).getColumns().get(0).getCodec()) + .build(); + }); } public List getParquetFilesMetadataInRange( @@ -109,16 +141,16 @@ public List getParquetFilesMetadataInRange( .filter( file -> file.getModificationTime() >= startTime && file.getModificationTime() <= endTime) - .map(file -> new ParquetFileConfig(conf, file.getPath())) + .map(file -> new ParquetFileConfig(conf, file)) .collect(Collectors.toList()); } public List getParquetFilesMetadataAfterTime( - Configuration conf, Stream parquetFiles, long syncTime) { + Configuration conf, List parquetFiles, long syncTime) { - return parquetFiles + return parquetFiles.stream() .filter(file -> file.getModificationTime() >= syncTime) - .map(file -> new ParquetFileConfig(conf, file.getPath())) + .map(file -> new ParquetFileConfig(conf, file)) .collect(Collectors.toList()); } } diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java index 7a038124a..ec40a79f5 100644 --- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java +++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java @@ -18,21 +18,24 @@ package org.apache.xtable.parquet; -import java.io.IOException; - import lombok.AccessLevel; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.Getter; import lombok.experimental.FieldDefaults; +import lombok.extern.log4j.Log4j2; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.metadata.BlockMetaData; import org.apache.parquet.hadoop.metadata.CompressionCodecName; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.schema.MessageType; +import org.apache.xtable.exception.ReadException; + +@Log4j2 @Getter @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE) @Builder @@ -41,26 +44,26 @@ class ParquetFileConfig { MessageType schema; ParquetMetadata metadata; long rowGroupIndex; - long modifTime; + long modificationTime; long size; CompressionCodecName codec; Path path; - public ParquetFileConfig(Configuration conf, Path file) { - long modifTime = -1L; + public ParquetFileConfig(Configuration conf, FileStatus file) { + long modificationTime = -1L; ParquetMetadata metadata = - ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file); + ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file.getPath()); if (metadata.getBlocks().isEmpty()) { throw new IllegalStateException("Parquet file contains no row groups."); } try { - modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime(); - } catch (IOException e) { - e.printStackTrace(); + modificationTime = file.getModificationTime(); + } catch (ReadException e) { + log.warn("File reading error: " + e.getMessage()); } - this.path = file; - this.modifTime = modifTime; + this.path = file.getPath(); + this.modificationTime = modificationTime; this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum(); this.metadata = metadata; this.schema = metadata.getFileMetaData().getSchema(); diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java index bb32a5024..08e11ed93 100644 --- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java +++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java @@ -183,7 +183,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException { assertNotNull(changes); Instant instantBeforeFirstSnapshot = Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli()); - assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime); + assertEquals(instantBeforeFirstSnapshot.toEpochMilli(), newModifTime); assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime))); }