From e541a7174c4ce187960f7db4f7c3755e5710170e Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Wed, 10 Dec 2025 20:55:47 +0100
Subject: [PATCH 01/54] given a parquet file return data from a certain
 modification time

---
 .../xtable/parquet/ParquetDataManager.java    | 153 ++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
new file mode 100644
index 000000000..7cf950950
--- /dev/null
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+package org.apache.xtable.parquet;
+
+import java.io.IOException;
+import java.time.Instant;
+
+import lombok.Builder;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.hadoop.ParquetReader;
+import org.apache.parquet.hadoop.ParquetWriter;
+import org.apache.parquet.hadoop.example.GroupReadSupport;
+import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+
+@Builder
+public class ParquetDataManager {
+  private static final String TIME_FIELD = "modified_at";
+  public Path outputFile;
+  private ParquetReader<Group> reader;
+  private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
+
+  private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) {
+    ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file);
+    int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
+    try (ParquetWriter<Group> writer =
+        new ParquetWriter<Group>(
+            outputFile,
+            new GroupWriteSupport(),
+            parquetFileConfig.getCodec(),
+            (int) parquetFileConfig.getRowGroupSize(),
+            pageSize,
+            pageSize, // dictionaryPageSize
+            true, // enableDictionary
+            false, // enableValidation
+            ParquetWriter.DEFAULT_WRITER_VERSION,
+            conf)) {
+      Group currentGroup = null;
+      while ((currentGroup = (Group) reader.read()) != null) {
+        writer.write(currentGroup);
+      }
+    } catch (Exception e) {
+
+      e.printStackTrace();
+    }
+    return outputFile;
+  }
+
+  public long convertTimeField(Type timeFieldType, int timeIndex, Group record) {
+    PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType();
+    long timestampValue = record.getLong(timeIndex, 0);
+
+    // Check the logical type to determine the unit:
+    LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation();
+    // check the cases of Nanos and millis as well
+    if (logicalType != null
+        && logicalType.equals(
+            LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) {
+      // convert microseconds to milliseconds
+      timestampValue = timestampValue / 1000;
+    }
+    // continue case INT96 and BINARY (time as string)
+    return timestampValue;
+  }
+
+  public ParquetReader readParquetDataGroupsFromTime(
+      String filePath, Configuration conf, Instant modifTime) {
+    Path file = new Path(filePath);
+    try {
+      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
+      MessageType schema =
+          metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema();
+      Group record = null;
+
+      // get the index of the modified_at field
+      int timeIndex = schema.getFieldIndex(TIME_FIELD);
+      Type timeFieldType = null;
+      long recordTime = 0L;
+      int isAfterModificationTime = 0;
+      while ((reader.read()) != null) {
+        // TODO check if TIME_FIELD needs to be extracted or can use default one
+        timeFieldType = schema.getType(TIME_FIELD);
+        recordTime = convertTimeField(timeFieldType, timeIndex, record);
+        isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime);
+        if (isAfterModificationTime >= 0) {
+          break;
+        }
+      }
+      // write the remaining reader using a writer
+
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+
+    return reader;
+  }
+}
+
+class ParquetFileConfig {
+  private final MessageType schema;
+  private final long rowGroupSize;
+  private final CompressionCodecName codec;
+
+  public ParquetFileConfig(Configuration conf, Path file) {
+    ParquetMetadata metadata =
+        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
+    MessageType schema = metadata.getFileMetaData().getSchema();
+    if (metadata.getBlocks().isEmpty()) {
+      throw new IllegalStateException("Baseline Parquet file has no row groups.");
+    }
+    long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
+    CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
+    this.schema = schema;
+    this.rowGroupSize = rowGroupSize;
+    this.codec = codec;
+  }
+
+  public MessageType getSchema() {
+    return schema;
+  }
+
+  public long getRowGroupSize() {
+    return rowGroupSize;
+  }
+
+  public CompressionCodecName getCodec() {
+    return codec;
+  }
+}

From 15e282a04a56d6d28cbfb6f3d3fb9b9afe040417 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 13 Dec 2025 18:20:00 +0100
Subject: [PATCH 02/54] create the path based on the partition then inject the
 file to append into the parquet table

---
 .../xtable/parquet/ParquetDataManager.java    | 135 ++++++------------
 .../xtable/parquet/ParquetFileConfig.java     |  57 ++++++++
 2 files changed, 100 insertions(+), 92 deletions(-)
 create mode 100644 xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 7cf950950..a80ab8a61 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -20,6 +20,9 @@
 
 import java.io.IOException;
 import java.time.Instant;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.util.UUID;
 
 import lombok.Builder;
 
@@ -30,22 +33,51 @@
 import org.apache.parquet.hadoop.ParquetWriter;
 import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
-import org.apache.parquet.hadoop.metadata.CompressionCodecName;
-import org.apache.parquet.hadoop.metadata.ParquetMetadata;
-import org.apache.parquet.schema.LogicalTypeAnnotation;
-import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.schema.PrimitiveType;
-import org.apache.parquet.schema.Type;
 
 @Builder
 public class ParquetDataManager {
-  private static final String TIME_FIELD = "modified_at";
-  public Path outputFile;
-  private ParquetReader<Group> reader;
   private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
 
-  private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path file) {
-    ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, file);
+  public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) {
+    ParquetReader reader = null;
+    Path file = new Path(filePath);
+    try {
+      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    return reader;
+  }
+
+  private Path appendNewParquetFile(
+      Configuration conf,
+      String rootPath,
+      Path fileToAppend,
+      String partitionColumn,
+      Instant modifTime) {
+    Path finalFile = null;
+    // construct the file path to inject into the existing partitioned file
+    String partitionValue =
+        DateTimeFormatter.ISO_LOCAL_DATE.format(
+            modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
+    String partitionDir = partitionColumn + partitionValue;
+    String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
+    Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
+    // return its reader for convenience of writing
+    ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf);
+    // append/write it in the right partition
+    finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile);
+    return finalFile;
+  }
+
+  private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) {
+    ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
+    return parquetFileConfig;
+  }
+
+  private Path writeNewParquetFile(
+      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) {
+    ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend);
     int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
     try (ParquetWriter<Group> writer =
         new ParquetWriter<Group>(
@@ -69,85 +101,4 @@ private Path writeNewParquetFile(Configuration conf, ParquetReader reader, Path
     }
     return outputFile;
   }
-
-  public long convertTimeField(Type timeFieldType, int timeIndex, Group record) {
-    PrimitiveType timeFieldTypePrimitive = timeFieldType.asPrimitiveType();
-    long timestampValue = record.getLong(timeIndex, 0);
-
-    // Check the logical type to determine the unit:
-    LogicalTypeAnnotation logicalType = timeFieldTypePrimitive.getLogicalTypeAnnotation();
-    // check the cases of Nanos and millis as well
-    if (logicalType != null
-        && logicalType.equals(
-            LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS))) {
-      // convert microseconds to milliseconds
-      timestampValue = timestampValue / 1000;
-    }
-    // continue case INT96 and BINARY (time as string)
-    return timestampValue;
-  }
-
-  public ParquetReader readParquetDataGroupsFromTime(
-      String filePath, Configuration conf, Instant modifTime) {
-    Path file = new Path(filePath);
-    try {
-      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
-      MessageType schema =
-          metadataExtractor.readParquetMetadata(conf, file).getFileMetaData().getSchema();
-      Group record = null;
-
-      // get the index of the modified_at field
-      int timeIndex = schema.getFieldIndex(TIME_FIELD);
-      Type timeFieldType = null;
-      long recordTime = 0L;
-      int isAfterModificationTime = 0;
-      while ((reader.read()) != null) {
-        // TODO check if TIME_FIELD needs to be extracted or can use default one
-        timeFieldType = schema.getType(TIME_FIELD);
-        recordTime = convertTimeField(timeFieldType, timeIndex, record);
-        isAfterModificationTime = Instant.ofEpochMilli(recordTime).compareTo(modifTime);
-        if (isAfterModificationTime >= 0) {
-          break;
-        }
-      }
-      // write the remaining reader using a writer
-
-    } catch (IOException e) {
-      e.printStackTrace();
-    }
-
-    return reader;
-  }
-}
-
-class ParquetFileConfig {
-  private final MessageType schema;
-  private final long rowGroupSize;
-  private final CompressionCodecName codec;
-
-  public ParquetFileConfig(Configuration conf, Path file) {
-    ParquetMetadata metadata =
-        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
-    MessageType schema = metadata.getFileMetaData().getSchema();
-    if (metadata.getBlocks().isEmpty()) {
-      throw new IllegalStateException("Baseline Parquet file has no row groups.");
-    }
-    long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
-    CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
-    this.schema = schema;
-    this.rowGroupSize = rowGroupSize;
-    this.codec = codec;
-  }
-
-  public MessageType getSchema() {
-    return schema;
-  }
-
-  public long getRowGroupSize() {
-    return rowGroupSize;
-  }
-
-  public CompressionCodecName getCodec() {
-    return codec;
-  }
 }
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
new file mode 100644
index 000000000..6cdf1d4f3
--- /dev/null
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+package org.apache.xtable.parquet;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.CompressionCodecName;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.schema.MessageType;
+
+class ParquetFileConfig {
+  private final MessageType schema;
+  private final long rowGroupSize;
+  private final CompressionCodecName codec;
+
+  public ParquetFileConfig(Configuration conf, Path file) {
+    ParquetMetadata metadata =
+        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
+    MessageType schema = metadata.getFileMetaData().getSchema();
+    if (metadata.getBlocks().isEmpty()) {
+      throw new IllegalStateException("Baseline Parquet file has no row groups.");
+    }
+    long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
+    CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
+    this.schema = schema;
+    this.rowGroupSize = rowGroupSize;
+    this.codec = codec;
+  }
+
+  public MessageType getSchema() {
+    return schema;
+  }
+
+  public long getRowGroupSize() {
+    return rowGroupSize;
+  }
+
+  public CompressionCodecName getCodec() {
+    return codec;
+  }
+}

From 2ee71c938d8f1eed47d83a2f00cbd47bfabee448 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 14 Dec 2025 21:19:42 +0100
Subject: [PATCH 03/54] Handle case of path construction with file partitioned
 over many fields, interfacing with ConversionSource

---
 .../xtable/parquet/ParquetDataManager.java    | 40 +++++++++++++++----
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index a80ab8a61..9a38c775a 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -22,7 +22,9 @@
 import java.time.Instant;
 import java.time.ZoneId;
 import java.time.format.DateTimeFormatter;
+import java.util.List;
 import java.util.UUID;
+import java.util.stream.Collectors;
 
 import lombok.Builder;
 
@@ -34,6 +36,11 @@
 import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
 
+import org.apache.xtable.model.schema.InternalPartitionField;
+import org.apache.xtable.model.stat.PartitionValue;
+import org.apache.xtable.model.stat.Range;
+import org.apache.xtable.model.storage.InternalDataFile;
+
 @Builder
 public class ParquetDataManager {
   private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
@@ -49,18 +56,37 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf
     return reader;
   }
 
+  // partition fields are already computed, given a parquet file InternalDataFile must be derived
+  // (e.g., using createInternalDataFileFromParquetFile())
   private Path appendNewParquetFile(
       Configuration conf,
       String rootPath,
-      Path fileToAppend,
-      String partitionColumn,
-      Instant modifTime) {
+      InternalDataFile internalParquetFile,
+      List<InternalPartitionField> partitionFields) {
     Path finalFile = null;
+    String partitionDir = "";
+    List<PartitionValue> partitionValues = internalParquetFile.getPartitionValues();
+    Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified());
+    Path fileToAppend = new Path(internalParquetFile.toString());
     // construct the file path to inject into the existing partitioned file
-    String partitionValue =
-        DateTimeFormatter.ISO_LOCAL_DATE.format(
-            modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
-    String partitionDir = partitionColumn + partitionValue;
+    if (partitionValues == null || partitionValues.isEmpty()) {
+      String partitionValue =
+          DateTimeFormatter.ISO_LOCAL_DATE.format(
+              modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
+      partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue;
+    } else {
+      // handle multiple partitioning case (year and month etc.)
+      partitionDir =
+          partitionValues.stream()
+              .map(
+                  pv -> {
+                    Range epochValueObject = pv.getRange();
+                    // epochValueObject is always sure to be long
+                    String valueStr = String.valueOf(epochValueObject.getMaxValue());
+                    return pv.getPartitionField() + "=" + valueStr;
+                  })
+              .collect(Collectors.joining("/"));
+    }
     String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
     Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
     // return its reader for convenience of writing

From 6032e5f62aa74347a3403ed544320d4fff918d07 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 14 Dec 2025 22:03:44 +0100
Subject: [PATCH 04/54] test append Parquet file into table init

---
 .../parquet/TestParquetDataManager.java       | 75 +++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
new file mode 100644
index 000000000..2365b5fef
--- /dev/null
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+package org.apache.xtable.parquet;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+public class TestParquetDataManager {
+  @TempDir static java.nio.file.Path tempDir = Paths.get("./");
+
+  @Test
+  public void testAppendParquetFile() throws IOException {
+    SparkSession spark =
+        SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
+
+    StructType schema =
+        DataTypes.createStructType(
+            new StructField[] {
+              DataTypes.createStructField("id", DataTypes.IntegerType, false),
+              DataTypes.createStructField("value", DataTypes.StringType, false),
+              DataTypes.createStructField("year", DataTypes.IntegerType, false),
+              DataTypes.createStructField("month", DataTypes.IntegerType, false)
+            });
+
+    List<Row> data =
+        Arrays.asList(
+            RowFactory.create(101, "A", 2025, 12),
+            RowFactory.create(102, "B", 2025, 12),
+            RowFactory.create(201, "C", 2025, 11),
+            RowFactory.create(301, "D", 2024, 7));
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test");
+    File file = path.toFile();
+    String outputPath = file.getPath();
+
+    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
+
+    // TODO create an InternalDataFile and Partition Fields for testing purposes
+
+    // TODO test appendNewParquetFile()
+
+    // TODO validate the final table
+    spark.stop();
+  }
+}

From f6fdc72be71ea937de7a06e0d95aa6fa893879c2 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 15 Dec 2025 21:05:06 +0100
Subject: [PATCH 05/54] add function to test schema equivalence before
 appending

---
 .../java/org/apache/xtable/parquet/ParquetDataManager.java  | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 9a38c775a..f0e8a22cd 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -56,6 +56,12 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf
     return reader;
   }
 
+  // check required before appending the file
+  private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
+    ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
+    ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
+    return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
+  }
   // partition fields are already computed, given a parquet file InternalDataFile must be derived
   // (e.g., using createInternalDataFileFromParquetFile())
   private Path appendNewParquetFile(

From a94c3f36c9587cfe62ff7330ef5537e03aff8751 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 16 Dec 2025 12:59:02 +0100
Subject: [PATCH 06/54] construct path to inject to based on partitions

---
 .../xtable/parquet/ParquetDataManager.java    | 175 +++++++++++++++---
 .../parquet/TestParquetDataManager.java       |   9 +-
 2 files changed, 151 insertions(+), 33 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index f0e8a22cd..a19d8f585 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -19,16 +19,18 @@
 package org.apache.xtable.parquet;
 
 import java.io.IOException;
-import java.time.Instant;
-import java.time.ZoneId;
+import java.time.*;
 import java.time.format.DateTimeFormatter;
-import java.util.List;
-import java.util.UUID;
+import java.time.temporal.ChronoUnit;
+import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import lombok.Builder;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.example.data.Group;
 import org.apache.parquet.hadoop.ParquetReader;
@@ -36,10 +38,7 @@
 import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
 
-import org.apache.xtable.model.schema.InternalPartitionField;
-import org.apache.xtable.model.stat.PartitionValue;
-import org.apache.xtable.model.stat.Range;
-import org.apache.xtable.model.storage.InternalDataFile;
+import org.apache.iceberg.PartitionField;
 
 @Builder
 public class ParquetDataManager {
@@ -56,48 +55,170 @@ public ParquetReader readParquetDataAsReader(String filePath, Configuration conf
     return reader;
   }
 
+  // TODO check if footer of added file can cause problems (catalog) when reading the full table
+  // after appending
   // check required before appending the file
   private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
     ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
     ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
     return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
   }
+
+  Instant parsePartitionDirToStartTime(String partitionDir, List<PartitionField> partitionFields) {
+    Map<String, String> partitionValues = new HashMap<>();
+    String[] parts = partitionDir.split("/");
+    for (String part : parts) {
+      String[] keyValue = part.split("=");
+      if (keyValue.length == 2) {
+        partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]);
+      }
+    }
+    int year = 1970;
+    int month = 1;
+    int day = 1;
+    int hour = 0;
+    int minute = 0;
+    int second = 0;
+
+    for (PartitionField field : partitionFields) {
+      String fieldName = field.name().toLowerCase();
+      String value = partitionValues.get(fieldName);
+
+      if (value != null) {
+        try {
+          int intValue = Integer.parseInt(value);
+
+          switch (fieldName) {
+            case "year":
+              year = intValue;
+              break;
+            case "month":
+              month = intValue;
+              break;
+            case "day":
+              day = intValue;
+              break;
+            case "hour":
+              hour = intValue;
+              break;
+            case "minute":
+              minute = intValue;
+              break;
+            case "second":
+              second = intValue;
+              break;
+            default:
+              break;
+          }
+        } catch (NumberFormatException e) {
+          System.err.println(
+              "Warning: Invalid number format for partition field '" + fieldName + "': " + value);
+        }
+      }
+    }
+
+    try {
+      LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second);
+      return localDateTime.toInstant(ZoneOffset.UTC);
+    } catch (java.time.DateTimeException e) {
+      throw new IllegalArgumentException(
+          "Invalid partition directory date components: " + partitionDir, e);
+    }
+  }
+
+  private ChronoUnit getGranularityUnit(List<PartitionField> partitionFields) {
+    if (partitionFields == null || partitionFields.isEmpty()) {
+      return ChronoUnit.DAYS;
+    }
+    // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month')
+    String lastFieldName = partitionFields.get(partitionFields.size() - 1).name();
+
+    if (lastFieldName.equalsIgnoreCase("year")) {
+      return ChronoUnit.YEARS;
+    } else if (lastFieldName.equalsIgnoreCase("month")) {
+      return ChronoUnit.MONTHS;
+    } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) {
+      return ChronoUnit.DAYS;
+    } else if (lastFieldName.equalsIgnoreCase("hour")) {
+      return ChronoUnit.HOURS;
+    } else if (lastFieldName.equalsIgnoreCase("minute")) {
+      return ChronoUnit.MINUTES;
+    } else {
+      return ChronoUnit.DAYS;
+    }
+  }
+
+  private String findTargetPartitionFolder(
+      Instant modifTime,
+      List<PartitionField> partitionFields,
+      // could be retrieved from getParquetFiles() of ParquetConversionSource
+      Stream<LocatedFileStatus> parquetFiles)
+      throws IOException {
+
+    long modifTimeMillis = modifTime.toEpochMilli();
+    final ZoneId ZONE = ZoneId.systemDefault();
+    ChronoUnit partitionUnit = getGranularityUnit(partitionFields);
+    List<String> allFolders =
+        parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList());
+
+    // sort the folders by their start time
+    allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields)));
+
+    for (int i = 0; i < allFolders.size(); i++) {
+      String currentFolder = allFolders.get(i);
+
+      // start time of the current folder
+      Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields);
+      long currentStartMillis = currentStartTime.toEpochMilli();
+
+      // determine the end time (which is the start time of the next logical partition)
+      long nextStartMillis;
+      if (i + 1 < allFolders.size()) {
+        nextStartMillis =
+            parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli();
+      } else {
+        ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE);
+        ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit);
+        nextStartMillis = nextStartZDT.toInstant().toEpochMilli();
+      }
+      if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) {
+        return currentFolder;
+      }
+    }
+    return "";
+  }
   // partition fields are already computed, given a parquet file InternalDataFile must be derived
   // (e.g., using createInternalDataFileFromParquetFile())
   private Path appendNewParquetFile(
       Configuration conf,
       String rootPath,
-      InternalDataFile internalParquetFile,
-      List<InternalPartitionField> partitionFields) {
+      FileStatus parquetFile,
+      Stream<LocatedFileStatus> parquetFiles,
+      List<PartitionField> partitionFields) {
     Path finalFile = null;
     String partitionDir = "";
-    List<PartitionValue> partitionValues = internalParquetFile.getPartitionValues();
-    Instant modifTime = Instant.ofEpochMilli(internalParquetFile.getLastModified());
-    Path fileToAppend = new Path(internalParquetFile.toString());
+    Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime());
+    Path fileToAppend = parquetFile.getPath();
     // construct the file path to inject into the existing partitioned file
-    if (partitionValues == null || partitionValues.isEmpty()) {
+    if (partitionFields == null || partitionFields.isEmpty()) {
       String partitionValue =
           DateTimeFormatter.ISO_LOCAL_DATE.format(
               modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
-      partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue;
+      partitionDir = partitionFields.get(0).name() + partitionValue;
     } else {
-      // handle multiple partitioning case (year and month etc.)
-      partitionDir =
-          partitionValues.stream()
-              .map(
-                  pv -> {
-                    Range epochValueObject = pv.getRange();
-                    // epochValueObject is always sure to be long
-                    String valueStr = String.valueOf(epochValueObject.getMaxValue());
-                    return pv.getPartitionField() + "=" + valueStr;
-                  })
-              .collect(Collectors.joining("/"));
+      // handle multiple partitioning case (year and month etc.), find the target partition dir
+      try {
+        partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles);
+      } catch (IOException e) {
+        e.printStackTrace();
+      }
     }
+    // construct the path
     String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
     Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
     // return its reader for convenience of writing
     ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf);
-    // append/write it in the right partition
+    // then inject/append/write it in the right partition
     finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile);
     return finalFile;
   }
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
index 2365b5fef..9db52a70d 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
@@ -18,8 +18,8 @@
  
 package org.apache.xtable.parquet;
 
-import java.io.File;
 import java.io.IOException;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.List;
@@ -32,10 +32,8 @@
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
 
 public class TestParquetDataManager {
-  @TempDir static java.nio.file.Path tempDir = Paths.get("./");
 
   @Test
   public void testAppendParquetFile() throws IOException {
@@ -59,9 +57,8 @@ public void testAppendParquetFile() throws IOException {
             RowFactory.create(301, "D", 2024, 7));
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
-    java.nio.file.Path path = tempDir.resolve("parquet-partitioned_table_test");
-    File file = path.toFile();
-    String outputPath = file.getPath();
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
+    String outputPath = fixedPath.toFile().getName();
 
     df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 

From f8bdbfe2ff26cb816e8d57f566f3de5aaf6b6831 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Wed, 17 Dec 2025 00:25:01 +0100
Subject: [PATCH 07/54] fix imports

---
 .../xtable/parquet/ParquetDataManager.java    | 32 ++++++++++++-------
 ...Manager.java => ITParquetDataManager.java} |  2 +-
 2 files changed, 22 insertions(+), 12 deletions(-)
 rename xtable-core/src/test/java/org/apache/xtable/parquet/{TestParquetDataManager.java => ITParquetDataManager.java} (98%)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index a19d8f585..23c590642 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -19,10 +19,18 @@
 package org.apache.xtable.parquet;
 
 import java.io.IOException;
-import java.time.*;
+import java.time.Instant;
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
 import java.time.format.DateTimeFormatter;
 import java.time.temporal.ChronoUnit;
-import java.util.*;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -38,7 +46,7 @@
 import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
 
-import org.apache.iceberg.PartitionField;
+import org.apache.xtable.model.schema.InternalPartitionField;
 
 @Builder
 public class ParquetDataManager {
@@ -64,7 +72,8 @@ private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fi
     return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
   }
 
-  Instant parsePartitionDirToStartTime(String partitionDir, List<PartitionField> partitionFields) {
+  Instant parsePartitionDirToStartTime(
+      String partitionDir, List<InternalPartitionField> partitionFields) {
     Map<String, String> partitionValues = new HashMap<>();
     String[] parts = partitionDir.split("/");
     for (String part : parts) {
@@ -80,8 +89,8 @@ Instant parsePartitionDirToStartTime(String partitionDir, List<PartitionField> p
     int minute = 0;
     int second = 0;
 
-    for (PartitionField field : partitionFields) {
-      String fieldName = field.name().toLowerCase();
+    for (InternalPartitionField field : partitionFields) {
+      String fieldName = field.getSourceField().getName().toLowerCase();
       String value = partitionValues.get(fieldName);
 
       if (value != null) {
@@ -126,12 +135,13 @@ Instant parsePartitionDirToStartTime(String partitionDir, List<PartitionField> p
     }
   }
 
-  private ChronoUnit getGranularityUnit(List<PartitionField> partitionFields) {
+  private ChronoUnit getGranularityUnit(List<InternalPartitionField> partitionFields) {
     if (partitionFields == null || partitionFields.isEmpty()) {
       return ChronoUnit.DAYS;
     }
     // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month')
-    String lastFieldName = partitionFields.get(partitionFields.size() - 1).name();
+    String lastFieldName =
+        partitionFields.get(partitionFields.size() - 1).getSourceField().getName();
 
     if (lastFieldName.equalsIgnoreCase("year")) {
       return ChronoUnit.YEARS;
@@ -150,7 +160,7 @@ private ChronoUnit getGranularityUnit(List<PartitionField> partitionFields) {
 
   private String findTargetPartitionFolder(
       Instant modifTime,
-      List<PartitionField> partitionFields,
+      List<InternalPartitionField> partitionFields,
       // could be retrieved from getParquetFiles() of ParquetConversionSource
       Stream<LocatedFileStatus> parquetFiles)
       throws IOException {
@@ -194,7 +204,7 @@ private Path appendNewParquetFile(
       String rootPath,
       FileStatus parquetFile,
       Stream<LocatedFileStatus> parquetFiles,
-      List<PartitionField> partitionFields) {
+      List<InternalPartitionField> partitionFields) {
     Path finalFile = null;
     String partitionDir = "";
     Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime());
@@ -204,7 +214,7 @@ private Path appendNewParquetFile(
       String partitionValue =
           DateTimeFormatter.ISO_LOCAL_DATE.format(
               modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
-      partitionDir = partitionFields.get(0).name() + partitionValue;
+      partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue;
     } else {
       // handle multiple partitioning case (year and month etc.), find the target partition dir
       try {
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
similarity index 98%
rename from xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
rename to xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 9db52a70d..30d044578 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/TestParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -33,7 +33,7 @@
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.Test;
 
-public class TestParquetDataManager {
+public class ITParquetDataManager {
 
   @Test
   public void testAppendParquetFile() throws IOException {

From c04a983733bc67614c224caca696333ba96867eb Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Wed, 17 Dec 2025 20:57:24 +0100
Subject: [PATCH 08/54] refactoring (lombok, logs, javadocs and function and
 approach commenting)

---
 .../xtable/parquet/ParquetDataManager.java    | 32 +++++++++++++----
 .../xtable/parquet/ParquetFileConfig.java     | 35 ++++++++-----------
 2 files changed, 39 insertions(+), 28 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 23c590642..ca0496684 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -35,6 +35,7 @@
 import java.util.stream.Stream;
 
 import lombok.Builder;
+import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -48,17 +49,27 @@
 
 import org.apache.xtable.model.schema.InternalPartitionField;
 
+/**
+ * Manages Parquet file operations including reading, writing, and partition discovery and path
+ * construction.
+ *
+ * <p>This class provides functions to handle Parquet metadata, validate schemas during appends, and
+ * calculate target partition directories based on file modification times and defined partition
+ * fields.
+ */
+@Log4j2
 @Builder
 public class ParquetDataManager {
   private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
 
-  public ParquetReader readParquetDataAsReader(String filePath, Configuration conf) {
+  public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException {
     ParquetReader reader = null;
     Path file = new Path(filePath);
     try {
       reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
     } catch (IOException e) {
-      e.printStackTrace();
+      log.error("Unexpected error during Parquet read: {}", filePath, e);
+      throw new IOException("Unexpected error reading Parquet file", e);
     }
     return reader;
   }
@@ -158,6 +169,9 @@ private ChronoUnit getGranularityUnit(List<InternalPartitionField> partitionFiel
     }
   }
 
+  // find the target partition by comparing the modficationTime of the file to add with the current
+  // tables
+  // if the modifTime falls between two partitions then we know it belongs to the first
   private String findTargetPartitionFolder(
       Instant modifTime,
       List<InternalPartitionField> partitionFields,
@@ -167,6 +181,7 @@ private String findTargetPartitionFolder(
 
     long modifTimeMillis = modifTime.toEpochMilli();
     final ZoneId ZONE = ZoneId.systemDefault();
+    // find the partition granularity (days, hours...)
     ChronoUnit partitionUnit = getGranularityUnit(partitionFields);
     List<String> allFolders =
         parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList());
@@ -189,6 +204,7 @@ private String findTargetPartitionFolder(
       } else {
         ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE);
         ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit);
+        // to evaluate the partition date value and make comparable with the modifTime in Instant
         nextStartMillis = nextStartZDT.toInstant().toEpochMilli();
       }
       if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) {
@@ -204,7 +220,8 @@ private Path appendNewParquetFile(
       String rootPath,
       FileStatus parquetFile,
       Stream<LocatedFileStatus> parquetFiles,
-      List<InternalPartitionField> partitionFields) {
+      List<InternalPartitionField> partitionFields)
+      throws IOException {
     Path finalFile = null;
     String partitionDir = "";
     Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime());
@@ -227,7 +244,7 @@ private Path appendNewParquetFile(
     String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
     Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
     // return its reader for convenience of writing
-    ParquetReader reader = readParquetDataAsReader(fileToAppend.getName(), conf);
+    ParquetReader reader = getParquetReader(fileToAppend.getName(), conf);
     // then inject/append/write it in the right partition
     finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile);
     return finalFile;
@@ -239,7 +256,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
   }
 
   private Path writeNewParquetFile(
-      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile) {
+      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile)
+      throws IOException {
     ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend);
     int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
     try (ParquetWriter<Group> writer =
@@ -259,8 +277,8 @@ private Path writeNewParquetFile(
         writer.write(currentGroup);
       }
     } catch (Exception e) {
-
-      e.printStackTrace();
+      log.error("Unexpected error during Parquet write: {}", outputFile, e);
+      throw new IOException("Failed to complete Parquet write operation", e);
     }
     return outputFile;
   }
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index 6cdf1d4f3..e4b645f5d 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -18,40 +18,33 @@
  
 package org.apache.xtable.parquet;
 
+import lombok.AccessLevel;
+import lombok.Getter;
+import lombok.experimental.FieldDefaults;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 
+@Getter
+@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
 class ParquetFileConfig {
-  private final MessageType schema;
-  private final long rowGroupSize;
-  private final CompressionCodecName codec;
+  MessageType schema;
+  long rowGroupSize;
+  CompressionCodecName codec;
 
   public ParquetFileConfig(Configuration conf, Path file) {
     ParquetMetadata metadata =
         ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
-    MessageType schema = metadata.getFileMetaData().getSchema();
+
     if (metadata.getBlocks().isEmpty()) {
-      throw new IllegalStateException("Baseline Parquet file has no row groups.");
+      throw new IllegalStateException("Parquet file contains no row groups.");
     }
-    long rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
-    CompressionCodecName codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
-    this.schema = schema;
-    this.rowGroupSize = rowGroupSize;
-    this.codec = codec;
-  }
-
-  public MessageType getSchema() {
-    return schema;
-  }
-
-  public long getRowGroupSize() {
-    return rowGroupSize;
-  }
 
-  public CompressionCodecName getCodec() {
-    return codec;
+    this.schema = metadata.getFileMetaData().getSchema();
+    this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
+    this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
   }
 }

From 5f2541e9c81d91b8b2a1cc3fe986457a7e3bb96c Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Thu, 1 Jan 2026 18:03:56 +0100
Subject: [PATCH 09/54] use appendFile to append a file into a table while
 tracking the appends using the FileStatus' modifTime attribute

---
 .../xtable/parquet/ParquetDataManager.java    | 139 ++++++++++++------
 1 file changed, 93 insertions(+), 46 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index ca0496684..19777be81 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -39,13 +39,20 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.example.data.Group;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.parquet.hadoop.ParquetWriter;
 import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.hadoop.util.HadoopOutputFile;
+import org.apache.parquet.io.InputFile;
+import org.apache.parquet.schema.MessageType;
 
 import org.apache.xtable.model.schema.InternalPartitionField;
 
@@ -61,27 +68,8 @@
 @Builder
 public class ParquetDataManager {
   private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
-
-  public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException {
-    ParquetReader reader = null;
-    Path file = new Path(filePath);
-    try {
-      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
-    } catch (IOException e) {
-      log.error("Unexpected error during Parquet read: {}", filePath, e);
-      throw new IOException("Unexpected error reading Parquet file", e);
-    }
-    return reader;
-  }
-
-  // TODO check if footer of added file can cause problems (catalog) when reading the full table
-  // after appending
-  // check required before appending the file
-  private boolean checkSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
-    ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
-    ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
-    return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
-  }
+  private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB
+  private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB
 
   Instant parsePartitionDirToStartTime(
       String partitionDir, List<InternalPartitionField> partitionFields) {
@@ -213,6 +201,34 @@ private String findTargetPartitionFolder(
     }
     return "";
   }
+
+  private Path writeNewParquetFile(
+      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile)
+      throws IOException {
+    ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend);
+    int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
+    try (ParquetWriter<Group> writer =
+        new ParquetWriter<Group>(
+            outputFile,
+            new GroupWriteSupport(),
+            parquetFileConfig.getCodec(),
+            (int) parquetFileConfig.getRowGroupSize(),
+            pageSize,
+            pageSize, // dictionaryPageSize
+            true, // enableDictionary
+            false, // enableValidation
+            ParquetWriter.DEFAULT_WRITER_VERSION,
+            conf)) {
+      Group currentGroup = null;
+      while ((currentGroup = (Group) reader.read()) != null) {
+        writer.write(currentGroup);
+      }
+    } catch (Exception e) {
+      log.error("Unexpected error during Parquet write: {}", outputFile, e);
+      throw new IOException("Failed to complete Parquet write operation", e);
+    }
+    return outputFile;
+  }
   // partition fields are already computed, given a parquet file InternalDataFile must be derived
   // (e.g., using createInternalDataFileFromParquetFile())
   private Path appendNewParquetFile(
@@ -250,36 +266,67 @@ private Path appendNewParquetFile(
     return finalFile;
   }
 
+  public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException {
+    ParquetReader reader = null;
+    Path file = new Path(filePath);
+    try {
+      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
+    } catch (IOException e) {
+      log.error("Unexpected error during Parquet read: {}", filePath, e);
+      throw new IOException("Unexpected error reading Parquet file", e);
+    }
+    return reader;
+  }
+
+  /* Alternative Approach (without path construction) using Parquet API to append a file */
+
+  // after appending check required before appending the file
+  private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
+    ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
+    ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
+    return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
+  }
+
   private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) {
     ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
     return parquetFileConfig;
   }
 
-  private Path writeNewParquetFile(
-      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile)
-      throws IOException {
-    ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend);
-    int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
-    try (ParquetWriter<Group> writer =
-        new ParquetWriter<Group>(
-            outputFile,
-            new GroupWriteSupport(),
-            parquetFileConfig.getCodec(),
-            (int) parquetFileConfig.getRowGroupSize(),
-            pageSize,
-            pageSize, // dictionaryPageSize
-            true, // enableDictionary
-            false, // enableValidation
-            ParquetWriter.DEFAULT_WRITER_VERSION,
-            conf)) {
-      Group currentGroup = null;
-      while ((currentGroup = (Group) reader.read()) != null) {
-        writer.write(currentGroup);
-      }
-    } catch (Exception e) {
-      log.error("Unexpected error during Parquet write: {}", outputFile, e);
-      throw new IOException("Failed to complete Parquet write operation", e);
+  // Method2 to append a file into a table
+  public void mergeParquetFiles(
+      Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException {
+    Configuration conf = new Configuration();
+    ParquetFileWriter writer =
+        new ParquetFileWriter(
+            HadoopOutputFile.fromPath(outputPath, conf),
+            schema,
+            ParquetFileWriter.Mode.CREATE,
+            DEFAULT_BLOCK_SIZE,
+            0);
+    // write the initial table with the appended file to add into the outputPath
+    writer.start();
+    HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
+    InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf);
+    if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
+      writer.appendFile(inputFile);
+      writer.appendFile(inputFileToAppend);
     }
-    return outputFile;
+    Map<String, String> combinedMeta = new HashMap<>();
+    // track the append date and save it in the footer
+    int appendCount =
+        Integer.parseInt(
+            ParquetFileReader.readFooter(conf, filePath)
+                .getFileMetaData()
+                .getKeyValueMetaData()
+                .getOrDefault("total_appends", "0"));
+    String newKey = "append_date_" + (appendCount + 1);
+    // get the equivalent fileStatus from the file-to-append Path
+    FileSystem fs = FileSystem.get(conf);
+    FileStatus fileStatus = fs.getFileStatus(fileToAppend);
+    // save its modification time (for later sync related retrieval) in the metadata of the output
+    // table
+    combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime()));
+    combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
+    writer.end(combinedMeta);
   }
 }

From 47e707688d3ced9ee34c477c988224dc5c663ff4 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Thu, 1 Jan 2026 19:01:06 +0100
Subject: [PATCH 10/54] find the files that satisfy to the time condition

---
 .../xtable/parquet/ParquetDataManager.java    | 65 +++++++++++--------
 1 file changed, 39 insertions(+), 26 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 19777be81..777b829be 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -26,11 +26,7 @@
 import java.time.ZonedDateTime;
 import java.time.format.DateTimeFormatter;
 import java.time.temporal.ChronoUnit;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.UUID;
+import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
@@ -47,8 +43,8 @@
 import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.parquet.hadoop.ParquetWriter;
-import org.apache.parquet.hadoop.example.GroupReadSupport;
 import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
 import org.apache.parquet.hadoop.util.HadoopOutputFile;
 import org.apache.parquet.io.InputFile;
@@ -231,7 +227,7 @@ private Path writeNewParquetFile(
   }
   // partition fields are already computed, given a parquet file InternalDataFile must be derived
   // (e.g., using createInternalDataFileFromParquetFile())
-  private Path appendNewParquetFile(
+  private void appendNewParquetFile(
       Configuration conf,
       String rootPath,
       FileStatus parquetFile,
@@ -259,26 +255,12 @@ private Path appendNewParquetFile(
     // construct the path
     String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
     Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
-    // return its reader for convenience of writing
-    ParquetReader reader = getParquetReader(fileToAppend.getName(), conf);
     // then inject/append/write it in the right partition
-    finalFile = writeNewParquetFile(conf, reader, fileToAppend, outputFile);
-    return finalFile;
+    MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema();
+    AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema);
   }
 
-  public ParquetReader getParquetReader(String filePath, Configuration conf) throws IOException {
-    ParquetReader reader = null;
-    Path file = new Path(filePath);
-    try {
-      reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
-    } catch (IOException e) {
-      log.error("Unexpected error during Parquet read: {}", filePath, e);
-      throw new IOException("Unexpected error reading Parquet file", e);
-    }
-    return reader;
-  }
-
-  /* Alternative Approach (without path construction) using Parquet API to append a file */
+  /* Use Parquet API to append to a file */
 
   // after appending check required before appending the file
   private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
@@ -292,8 +274,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
     return parquetFileConfig;
   }
 
-  // Method2 to append a file into a table
-  public void mergeParquetFiles(
+  // append a file into a table
+  public void AppendNewParquetFiles(
       Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException {
     Configuration conf = new Configuration();
     ParquetFileWriter writer =
@@ -329,4 +311,35 @@ public void mergeParquetFiles(
     combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
     writer.end(combinedMeta);
   }
+  // Find and retrieve the file paths that satisfy the time condition
+  // Each partition folder contains one merged_file in turn containing many appends
+  public List<Path> findFilesAfterModifTime(
+      Configuration conf, Path directoryPath, long targetModifTime) throws IOException {
+    List<Path> results = new ArrayList<>();
+    FileSystem fs = directoryPath.getFileSystem(conf);
+
+    FileStatus[] statuses =
+        fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet"));
+
+    for (FileStatus status : statuses) {
+      Path filePath = status.getPath();
+
+      try {
+        ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath);
+        Map<String, String> meta = footer.getFileMetaData().getKeyValueMetaData();
+
+        int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0"));
+        if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) {
+          results.add(filePath);
+          break;
+        } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) {
+          continue;
+        }
+      } catch (Exception e) {
+        log.error("Could not read metadata for: {}", filePath, e);
+        throw new IOException("Could not read metadata for", e);
+      }
+    }
+    return results;
+  }
 }

From fbb09ecbc284c7c1eaa96a22934b91134f0e9750 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Thu, 1 Jan 2026 19:16:18 +0100
Subject: [PATCH 11/54] treat appends as separate files to add in the target
 partition folder

---
 .../xtable/parquet/ParquetDataManager.java    | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 777b829be..cc1f473d0 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -34,10 +34,7 @@
 import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.*;
 import org.apache.parquet.example.data.Group;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.ParquetFileWriter;
@@ -257,7 +254,15 @@ private void appendNewParquetFile(
     Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
     // then inject/append/write it in the right partition
     MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema();
-    AppendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema);
+    appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema);
+
+    // OR
+    /*FileSystem fs = outputFile.getFileSystem(conf);
+    if (!fs.exists(outputFile.getParent())) {
+      fs.mkdirs(outputFile.getParent());
+    }
+    // copy a seperate file into the target partition folder
+    FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/
   }
 
   /* Use Parquet API to append to a file */
@@ -275,7 +280,7 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
   }
 
   // append a file into a table
-  public void AppendNewParquetFiles(
+  public void appendNewParquetFiles(
       Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException {
     Configuration conf = new Configuration();
     ParquetFileWriter writer =
@@ -335,11 +340,17 @@ public List<Path> findFilesAfterModifTime(
         } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) {
           continue;
         }
+        // OR
+        /*if (status.getPath().getName().endsWith(".parquet") &&
+                status.getModificationTime() > targetModifTime) {
+          results.add(status.getPath());
+        }*/
       } catch (Exception e) {
         log.error("Could not read metadata for: {}", filePath, e);
         throw new IOException("Could not read metadata for", e);
       }
     }
+
     return results;
   }
 }

From fe19a6055c92079a938ff2feb83d105e1b6c720e Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 20:10:35 +0100
Subject: [PATCH 12/54] update approach: selective block compaction

---
 .../xtable/parquet/ParquetDataManager.java    | 295 +++++-------------
 .../xtable/parquet/ParquetFileConfig.java     |   2 +-
 2 files changed, 73 insertions(+), 224 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index cc1f473d0..563c6ca3d 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -19,36 +19,23 @@
 package org.apache.xtable.parquet;
 
 import java.io.IOException;
-import java.time.Instant;
-import java.time.LocalDateTime;
-import java.time.ZoneId;
-import java.time.ZoneOffset;
-import java.time.ZonedDateTime;
-import java.time.format.DateTimeFormatter;
-import java.time.temporal.ChronoUnit;
 import java.util.*;
-import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
 import lombok.Builder;
 import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
-import org.apache.parquet.example.data.Group;
+import org.apache.parquet.format.converter.ParquetMetadataConverter;
 import org.apache.parquet.hadoop.ParquetFileReader;
 import org.apache.parquet.hadoop.ParquetFileWriter;
-import org.apache.parquet.hadoop.ParquetReader;
-import org.apache.parquet.hadoop.ParquetWriter;
-import org.apache.parquet.hadoop.example.GroupWriteSupport;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.hadoop.util.HadoopInputFile;
 import org.apache.parquet.hadoop.util.HadoopOutputFile;
 import org.apache.parquet.io.InputFile;
 import org.apache.parquet.schema.MessageType;
 
-import org.apache.xtable.model.schema.InternalPartitionField;
-
 /**
  * Manages Parquet file operations including reading, writing, and partition discovery and path
  * construction.
@@ -64,207 +51,6 @@ public class ParquetDataManager {
   private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB
   private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB
 
-  Instant parsePartitionDirToStartTime(
-      String partitionDir, List<InternalPartitionField> partitionFields) {
-    Map<String, String> partitionValues = new HashMap<>();
-    String[] parts = partitionDir.split("/");
-    for (String part : parts) {
-      String[] keyValue = part.split("=");
-      if (keyValue.length == 2) {
-        partitionValues.put(keyValue[0].toLowerCase(), keyValue[1]);
-      }
-    }
-    int year = 1970;
-    int month = 1;
-    int day = 1;
-    int hour = 0;
-    int minute = 0;
-    int second = 0;
-
-    for (InternalPartitionField field : partitionFields) {
-      String fieldName = field.getSourceField().getName().toLowerCase();
-      String value = partitionValues.get(fieldName);
-
-      if (value != null) {
-        try {
-          int intValue = Integer.parseInt(value);
-
-          switch (fieldName) {
-            case "year":
-              year = intValue;
-              break;
-            case "month":
-              month = intValue;
-              break;
-            case "day":
-              day = intValue;
-              break;
-            case "hour":
-              hour = intValue;
-              break;
-            case "minute":
-              minute = intValue;
-              break;
-            case "second":
-              second = intValue;
-              break;
-            default:
-              break;
-          }
-        } catch (NumberFormatException e) {
-          System.err.println(
-              "Warning: Invalid number format for partition field '" + fieldName + "': " + value);
-        }
-      }
-    }
-
-    try {
-      LocalDateTime localDateTime = LocalDateTime.of(year, month, day, hour, minute, second);
-      return localDateTime.toInstant(ZoneOffset.UTC);
-    } catch (java.time.DateTimeException e) {
-      throw new IllegalArgumentException(
-          "Invalid partition directory date components: " + partitionDir, e);
-    }
-  }
-
-  private ChronoUnit getGranularityUnit(List<InternalPartitionField> partitionFields) {
-    if (partitionFields == null || partitionFields.isEmpty()) {
-      return ChronoUnit.DAYS;
-    }
-    // get the most granular field (the last one in the list, e.g., 'hour' after 'year' and 'month')
-    String lastFieldName =
-        partitionFields.get(partitionFields.size() - 1).getSourceField().getName();
-
-    if (lastFieldName.equalsIgnoreCase("year")) {
-      return ChronoUnit.YEARS;
-    } else if (lastFieldName.equalsIgnoreCase("month")) {
-      return ChronoUnit.MONTHS;
-    } else if (lastFieldName.equalsIgnoreCase("day") || lastFieldName.equalsIgnoreCase("date")) {
-      return ChronoUnit.DAYS;
-    } else if (lastFieldName.equalsIgnoreCase("hour")) {
-      return ChronoUnit.HOURS;
-    } else if (lastFieldName.equalsIgnoreCase("minute")) {
-      return ChronoUnit.MINUTES;
-    } else {
-      return ChronoUnit.DAYS;
-    }
-  }
-
-  // find the target partition by comparing the modficationTime of the file to add with the current
-  // tables
-  // if the modifTime falls between two partitions then we know it belongs to the first
-  private String findTargetPartitionFolder(
-      Instant modifTime,
-      List<InternalPartitionField> partitionFields,
-      // could be retrieved from getParquetFiles() of ParquetConversionSource
-      Stream<LocatedFileStatus> parquetFiles)
-      throws IOException {
-
-    long modifTimeMillis = modifTime.toEpochMilli();
-    final ZoneId ZONE = ZoneId.systemDefault();
-    // find the partition granularity (days, hours...)
-    ChronoUnit partitionUnit = getGranularityUnit(partitionFields);
-    List<String> allFolders =
-        parquetFiles.map(status -> status.getPath().toString()).collect(Collectors.toList());
-
-    // sort the folders by their start time
-    allFolders.sort(Comparator.comparing(f -> parsePartitionDirToStartTime(f, partitionFields)));
-
-    for (int i = 0; i < allFolders.size(); i++) {
-      String currentFolder = allFolders.get(i);
-
-      // start time of the current folder
-      Instant currentStartTime = parsePartitionDirToStartTime(currentFolder, partitionFields);
-      long currentStartMillis = currentStartTime.toEpochMilli();
-
-      // determine the end time (which is the start time of the next logical partition)
-      long nextStartMillis;
-      if (i + 1 < allFolders.size()) {
-        nextStartMillis =
-            parsePartitionDirToStartTime(allFolders.get(i + 1), partitionFields).toEpochMilli();
-      } else {
-        ZonedDateTime currentStartZDT = currentStartTime.atZone(ZONE);
-        ZonedDateTime nextStartZDT = currentStartZDT.plus(1, partitionUnit);
-        // to evaluate the partition date value and make comparable with the modifTime in Instant
-        nextStartMillis = nextStartZDT.toInstant().toEpochMilli();
-      }
-      if (modifTimeMillis >= currentStartMillis && modifTimeMillis < nextStartMillis) {
-        return currentFolder;
-      }
-    }
-    return "";
-  }
-
-  private Path writeNewParquetFile(
-      Configuration conf, ParquetReader reader, Path fileToAppend, Path outputFile)
-      throws IOException {
-    ParquetFileConfig parquetFileConfig = getParquetFileConfig(conf, fileToAppend);
-    int pageSize = ParquetWriter.DEFAULT_PAGE_SIZE;
-    try (ParquetWriter<Group> writer =
-        new ParquetWriter<Group>(
-            outputFile,
-            new GroupWriteSupport(),
-            parquetFileConfig.getCodec(),
-            (int) parquetFileConfig.getRowGroupSize(),
-            pageSize,
-            pageSize, // dictionaryPageSize
-            true, // enableDictionary
-            false, // enableValidation
-            ParquetWriter.DEFAULT_WRITER_VERSION,
-            conf)) {
-      Group currentGroup = null;
-      while ((currentGroup = (Group) reader.read()) != null) {
-        writer.write(currentGroup);
-      }
-    } catch (Exception e) {
-      log.error("Unexpected error during Parquet write: {}", outputFile, e);
-      throw new IOException("Failed to complete Parquet write operation", e);
-    }
-    return outputFile;
-  }
-  // partition fields are already computed, given a parquet file InternalDataFile must be derived
-  // (e.g., using createInternalDataFileFromParquetFile())
-  private void appendNewParquetFile(
-      Configuration conf,
-      String rootPath,
-      FileStatus parquetFile,
-      Stream<LocatedFileStatus> parquetFiles,
-      List<InternalPartitionField> partitionFields)
-      throws IOException {
-    Path finalFile = null;
-    String partitionDir = "";
-    Instant modifTime = Instant.ofEpochMilli(parquetFile.getModificationTime());
-    Path fileToAppend = parquetFile.getPath();
-    // construct the file path to inject into the existing partitioned file
-    if (partitionFields == null || partitionFields.isEmpty()) {
-      String partitionValue =
-          DateTimeFormatter.ISO_LOCAL_DATE.format(
-              modifTime.atZone(ZoneId.systemDefault()).toLocalDate());
-      partitionDir = partitionFields.get(0).getSourceField().getName() + partitionValue;
-    } else {
-      // handle multiple partitioning case (year and month etc.), find the target partition dir
-      try {
-        partitionDir = findTargetPartitionFolder(modifTime, partitionFields, parquetFiles);
-      } catch (IOException e) {
-        e.printStackTrace();
-      }
-    }
-    // construct the path
-    String fileName = "part-" + System.currentTimeMillis() + "-" + UUID.randomUUID() + ".parquet";
-    Path outputFile = new Path(new Path(rootPath, partitionDir), fileName);
-    // then inject/append/write it in the right partition
-    MessageType schema = getParquetFileConfig(conf, new Path(rootPath)).getSchema();
-    appendNewParquetFiles(outputFile, new Path(rootPath), fileToAppend, schema);
-
-    // OR
-    /*FileSystem fs = outputFile.getFileSystem(conf);
-    if (!fs.exists(outputFile.getParent())) {
-      fs.mkdirs(outputFile.getParent());
-    }
-    // copy a seperate file into the target partition folder
-    FileUtil.copy(fs, fileToAppend, fs, outputFile, false, conf);*/
-  }
-
   /* Use Parquet API to append to a file */
 
   // after appending check required before appending the file
@@ -279,23 +65,25 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
     return parquetFileConfig;
   }
 
-  // append a file into a table
-  public void appendNewParquetFiles(
-      Path outputPath, Path filePath, Path fileToAppend, MessageType schema) throws IOException {
+  // append a file into a table (merges two files into one .parquet under a partition folder)
+  public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
+      throws IOException {
     Configuration conf = new Configuration();
+    long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize();
     ParquetFileWriter writer =
         new ParquetFileWriter(
-            HadoopOutputFile.fromPath(outputPath, conf),
+            HadoopOutputFile.fromPath(filePath, conf),
+            // HadoopOutputFile.fromPath(outputPath, conf),
             schema,
             ParquetFileWriter.Mode.CREATE,
             DEFAULT_BLOCK_SIZE,
             0);
     // write the initial table with the appended file to add into the outputPath
     writer.start();
-    HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
+    // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
     InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf);
     if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
-      writer.appendFile(inputFile);
+      // writer.appendFile(inputFile);
       writer.appendFile(inputFileToAppend);
     }
     Map<String, String> combinedMeta = new HashMap<>();
@@ -310,12 +98,73 @@ public void appendNewParquetFiles(
     // get the equivalent fileStatus from the file-to-append Path
     FileSystem fs = FileSystem.get(conf);
     FileStatus fileStatus = fs.getFileStatus(fileToAppend);
-    // save its modification time (for later sync related retrieval) in the metadata of the output
+    // save block indexes and modification time (for later sync related retrieval) in the metadata
+    // of the output
     // table
     combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime()));
     combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
+    combinedMeta.put(
+        "index_start_block_of_append_" + String.valueOf(appendCount + 1),
+        String.valueOf(firstBlockIndex));
     writer.end(combinedMeta);
   }
+  // selective compaction of parquet blocks
+  public List<Path> formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime)
+      throws IOException {
+    List<Path> finalPaths = new ArrayList<>();
+    FileSystem fs = directoryPath.getFileSystem(conf);
+
+    FileStatus[] statuses =
+        fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet"));
+
+    for (FileStatus status : statuses) {
+
+      Path filePath = status.getPath();
+      ParquetMetadata bigFileFooter =
+          ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER);
+      MessageType schema = bigFileFooter.getFileMetaData().getSchema();
+      int totalAppends =
+          Integer.parseInt(
+              bigFileFooter
+                  .getFileMetaData()
+                  .getKeyValueMetaData()
+                  .getOrDefault("total_appends", "0"));
+      for (int i = 1; i < totalAppends; i++) {
+        int startBlock =
+            Integer.valueOf(
+                bigFileFooter
+                    .getFileMetaData()
+                    .getKeyValueMetaData()
+                    .get("index_start_block_of_append_" + totalAppends));
+        int endBlock =
+            Integer.valueOf(
+                bigFileFooter
+                    .getFileMetaData()
+                    .getKeyValueMetaData()
+                    .get("index_end_block_of_append_" + (totalAppends + 1)));
+        Path targetSyncFilePath =
+            new Path(
+                status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock));
+        try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) {
+          writer.start();
+          for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {
+            BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j);
+            if (j >= startBlock
+                && j <= endBlock
+                && status.getModificationTime() >= targetModifTime) {
+              List<BlockMetaData> blockList =
+                  Collections.<BlockMetaData>singletonList(blockMetadata);
+              FSDataInputStream targetInputStream = fs.open(status.getPath());
+              writer.appendRowGroups(targetInputStream, blockList, false);
+            }
+          }
+          writer.end(new HashMap<>());
+          finalPaths.add(targetSyncFilePath);
+        }
+      }
+    }
+    return finalPaths;
+  }
   // Find and retrieve the file paths that satisfy the time condition
   // Each partition folder contains one merged_file in turn containing many appends
   public List<Path> findFilesAfterModifTime(
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index e4b645f5d..64c9e9cfc 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) {
     }
 
     this.schema = metadata.getFileMetaData().getSchema();
-    this.rowGroupSize = metadata.getBlocks().get(0).getTotalByteSize();
+    this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize()
     this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
   }
 }

From da7f300b18172b21f968ba4583d2fbe98dc0488c Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 20:50:01 +0100
Subject: [PATCH 13/54] update approach: added a basic test to check data
 selection using modificationTime selector

---
 .../xtable/parquet/ParquetDataManager.java    | 41 +------------
 .../xtable/parquet/ITParquetDataManager.java  | 58 +++++++++++++++++--
 2 files changed, 56 insertions(+), 43 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 563c6ca3d..75f3c4f80 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -109,8 +109,8 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType
     writer.end(combinedMeta);
   }
   // selective compaction of parquet blocks
-  public List<Path> formNewTargetFiles(Configuration conf, Path directoryPath, long targetModifTime)
-      throws IOException {
+  public static List<Path> formNewTargetFiles(
+      Configuration conf, Path directoryPath, long targetModifTime) throws IOException {
     List<Path> finalPaths = new ArrayList<>();
     FileSystem fs = directoryPath.getFileSystem(conf);
 
@@ -165,41 +165,4 @@ public List<Path> formNewTargetFiles(Configuration conf, Path directoryPath, lon
     }
     return finalPaths;
   }
-  // Find and retrieve the file paths that satisfy the time condition
-  // Each partition folder contains one merged_file in turn containing many appends
-  public List<Path> findFilesAfterModifTime(
-      Configuration conf, Path directoryPath, long targetModifTime) throws IOException {
-    List<Path> results = new ArrayList<>();
-    FileSystem fs = directoryPath.getFileSystem(conf);
-
-    FileStatus[] statuses =
-        fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet"));
-
-    for (FileStatus status : statuses) {
-      Path filePath = status.getPath();
-
-      try {
-        ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath);
-        Map<String, String> meta = footer.getFileMetaData().getKeyValueMetaData();
-
-        int totalAppends = Integer.parseInt(meta.getOrDefault("total_appends", "0"));
-        if (Long.parseLong(meta.get("append_date_0")) > targetModifTime) {
-          results.add(filePath);
-          break;
-        } else if (Long.parseLong(meta.get("append_date_" + totalAppends)) < targetModifTime) {
-          continue;
-        }
-        // OR
-        /*if (status.getPath().getName().endsWith(".parquet") &&
-                status.getModificationTime() > targetModifTime) {
-          results.add(status.getPath());
-        }*/
-      } catch (Exception e) {
-        log.error("Could not read metadata for: {}", filePath, e);
-        throw new IOException("Could not read metadata for", e);
-      }
-    }
-
-    return results;
-  }
 }
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 30d044578..c084eb6df 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,12 +18,17 @@
  
 package org.apache.xtable.parquet;
 
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;
@@ -39,7 +44,7 @@ public class ITParquetDataManager {
   public void testAppendParquetFile() throws IOException {
     SparkSession spark =
         SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
-
+    Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
         DataTypes.createStructType(
             new StructField[] {
@@ -62,11 +67,56 @@ public void testAppendParquetFile() throws IOException {
 
     df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
-    // TODO create an InternalDataFile and Partition Fields for testing purposes
+    // test find files to sync
+    long targetModifTime = System.currentTimeMillis() - 360000;
+    org.apache.hadoop.fs.Path hdfsPath =
+        new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test");
+    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
+    // set the modification time to the file
+    updateModificationTimeRecursive(fs, hdfsPath, targetModifTime);
+    // create new file to append using Spark
+    List<Row> futureDataToSync =
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7));
+    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
+    dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath);
+    long newModifTime = System.currentTimeMillis() - 5000;
+    List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7");
+
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+      }
+    }
+    List<org.apache.hadoop.fs.Path> resultingFiles =
+        ParquetDataManager.formNewTargetFiles(
+            conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime);
+    // check if resultingFiles contains the append data only (through the partition names)
+    for (org.apache.hadoop.fs.Path p : resultingFiles) {
+      String pathString = p.toString();
+      //  should be TRUE
+      boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027");
 
-    // TODO test appendNewParquetFile()
+      // should be FALSE
+      boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
+
+      assertTrue(isNewData, "Path should belong to appended data: " + pathString);
+      assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
+    }
+    // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file)
 
-    // TODO validate the final table
     spark.stop();
   }
+
+  private void updateModificationTimeRecursive(
+      FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException {
+    org.apache.hadoop.fs.RemoteIterator<org.apache.hadoop.fs.LocatedFileStatus> it =
+        fs.listFiles(path, true);
+    while (it.hasNext()) {
+      org.apache.hadoop.fs.LocatedFileStatus status = it.next();
+      if (status.getPath().getName().endsWith(".parquet")) {
+        fs.setTimes(status.getPath(), time, -1);
+      }
+    }
+  }
 }

From a8730b7c33fd4aabe9066a855b630bfb0e08bdda Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 21:18:28 +0100
Subject: [PATCH 14/54] fix append based on partition value

---
 .../xtable/parquet/ParquetDataManager.java    | 30 +++++++++++++++----
 .../xtable/parquet/ParquetFileConfig.java     |  4 +--
 .../xtable/parquet/ITParquetDataManager.java  |  2 ++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 75f3c4f80..6585fda41 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -65,25 +65,45 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
     return parquetFileConfig;
   }
 
+  public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException {
+    Configuration conf = new Configuration();
+
+    // e.g., "year=2024/month=12"
+    String targetPartition = getPartitionPath(targetPath);
+    String sourcePartition = getPartitionPath(sourcePath);
+
+    if (!targetPartition.equals(sourcePartition)) {
+      throw new IllegalArgumentException("Partition Mismatch! Cannot merge " +
+              sourcePartition + " into " + targetPartition);
+    }
+
+    // append files within the same partition foldr
+    appendNewParquetFiles(targetPath, sourcePath, schema);
+  }
+
+  private String getPartitionPath(Path path) {
+    // Returns the parent directory name (e.g., country=US)
+    // For nested partitions, you might need path.getParent().getParent()...
+    return path.getParent().getName();
+  }
   // append a file into a table (merges two files into one .parquet under a partition folder)
   public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
       throws IOException {
     Configuration conf = new Configuration();
-    long firstBlockIndex = getParquetFileConfig(conf, fileToAppend).getRowGroupSize();
+    long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex();
     ParquetFileWriter writer =
         new ParquetFileWriter(
             HadoopOutputFile.fromPath(filePath, conf),
-            // HadoopOutputFile.fromPath(outputPath, conf),
             schema,
             ParquetFileWriter.Mode.CREATE,
             DEFAULT_BLOCK_SIZE,
             0);
     // write the initial table with the appended file to add into the outputPath
     writer.start();
-    // HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
+    HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
     InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf);
     if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
-      // writer.appendFile(inputFile);
+      writer.appendFile(inputFile);
       writer.appendFile(inputFileToAppend);
     }
     Map<String, String> combinedMeta = new HashMap<>();
@@ -105,7 +125,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType
     combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
     combinedMeta.put(
         "index_start_block_of_append_" + String.valueOf(appendCount + 1),
-        String.valueOf(firstBlockIndex));
+        String.valueOf(firstBlockIndex+1));
     writer.end(combinedMeta);
   }
   // selective compaction of parquet blocks
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index 64c9e9cfc..33ee08d38 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -32,7 +32,7 @@
 @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
 class ParquetFileConfig {
   MessageType schema;
-  long rowGroupSize;
+  long rowGroupIndex;
   CompressionCodecName codec;
 
   public ParquetFileConfig(Configuration conf, Path file) {
@@ -44,7 +44,7 @@ public ParquetFileConfig(Configuration conf, Path file) {
     }
 
     this.schema = metadata.getFileMetaData().getSchema();
-    this.rowGroupSize = metadata.getBlocks().size(); // .get(0).getTotalByteSize()
+    this.rowGroupIndex = metadata.getBlocks().size();
     this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
   }
 }
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index c084eb6df..3aff654a1 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -79,6 +79,8 @@ public void testAppendParquetFile() throws IOException {
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7));
     Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
     dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath);
+    // TODO create the folders manually for a new partition value as appendFile works only within the same partition value
+
     long newModifTime = System.currentTimeMillis() - 5000;
     List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7");
 

From d19ccbf2ebe776f69dbf65e560c8bcf4433f8599 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 22:46:38 +0100
Subject: [PATCH 15/54] fix test with basic example where partitions are not
 considered

---
 .../xtable/parquet/ParquetDataManager.java    |  9 ++++---
 .../xtable/parquet/ITParquetDataManager.java  | 26 ++++++++-----------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 6585fda41..01fb6b75f 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -65,7 +65,8 @@ private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAp
     return parquetFileConfig;
   }
 
-  public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema) throws IOException {
+  public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema)
+      throws IOException {
     Configuration conf = new Configuration();
 
     // e.g., "year=2024/month=12"
@@ -73,8 +74,8 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy
     String sourcePartition = getPartitionPath(sourcePath);
 
     if (!targetPartition.equals(sourcePartition)) {
-      throw new IllegalArgumentException("Partition Mismatch! Cannot merge " +
-              sourcePartition + " into " + targetPartition);
+      throw new IllegalArgumentException(
+          "Partition Mismatch! Cannot merge " + sourcePartition + " into " + targetPartition);
     }
 
     // append files within the same partition foldr
@@ -125,7 +126,7 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType
     combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
     combinedMeta.put(
         "index_start_block_of_append_" + String.valueOf(appendCount + 1),
-        String.valueOf(firstBlockIndex+1));
+        String.valueOf(firstBlockIndex + 1));
     writer.end(combinedMeta);
   }
   // selective compaction of parquet blocks
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 3aff654a1..61a9a9cf5 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,7 +18,6 @@
  
 package org.apache.xtable.parquet;
 
-import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
@@ -55,11 +54,9 @@ public void testAppendParquetFile() throws IOException {
             });
 
     List<Row> data =
-        Arrays.asList(
-            RowFactory.create(101, "A", 2025, 12),
-            RowFactory.create(102, "B", 2025, 12),
-            RowFactory.create(201, "C", 2025, 11),
-            RowFactory.create(301, "D", 2024, 7));
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
+    /* RowFactory.create(201, "C", 2025, 11),
+    RowFactory.create(301, "D", 2024, 7));*/
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
@@ -69,20 +66,20 @@ public void testAppendParquetFile() throws IOException {
 
     // test find files to sync
     long targetModifTime = System.currentTimeMillis() - 360000;
-    org.apache.hadoop.fs.Path hdfsPath =
-        new org.apache.hadoop.fs.Path("target/fixed-parquet-data/parquet-partitioned_table_test");
+    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
     FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
     // set the modification time to the file
     updateModificationTimeRecursive(fs, hdfsPath, targetModifTime);
     // create new file to append using Spark
     List<Row> futureDataToSync =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 7));
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
     Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
     dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath);
-    // TODO create the folders manually for a new partition value as appendFile works only within the same partition value
+    // TODO create the folders manually for a new partition value as appendFile works only within
+    // the same partition value
 
     long newModifTime = System.currentTimeMillis() - 5000;
-    List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=7");
+    List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
 
     for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
@@ -91,19 +88,18 @@ public void testAppendParquetFile() throws IOException {
       }
     }
     List<org.apache.hadoop.fs.Path> resultingFiles =
-        ParquetDataManager.formNewTargetFiles(
-            conf, new org.apache.hadoop.fs.Path(fixedPath.toUri()), newModifTime);
+        ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime);
     // check if resultingFiles contains the append data only (through the partition names)
     for (org.apache.hadoop.fs.Path p : resultingFiles) {
       String pathString = p.toString();
       //  should be TRUE
-      boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027");
+      boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027");
 
       // should be FALSE
       boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
 
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-      assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
+      //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
     // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file)
 

From aecb204ede4ad15851e1c403866e5cb88366be2a Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 23:06:36 +0100
Subject: [PATCH 16/54] fix test with basic example where partitions are not
 considered2

---
 .../xtable/parquet/ITParquetDataManager.java  | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 61a9a9cf5..ad5a41f2a 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -60,7 +60,8 @@ public void testAppendParquetFile() throws IOException {
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
-    String outputPath = fixedPath.toFile().getName();
+    String outputPath = fixedPath.toString();
+    ;
 
     df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
@@ -79,25 +80,30 @@ public void testAppendParquetFile() throws IOException {
     // the same partition value
 
     long newModifTime = System.currentTimeMillis() - 5000;
-    List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
-
-    for (String partition : newPartitions) {
+    // TODO many partitions case
+    // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
+    /* for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
       if (fs.exists(partitionPath)) {
         updateModificationTimeRecursive(fs, partitionPath, newModifTime);
       }
-    }
+    }*/
+    org.apache.hadoop.fs.Path partitionPath =
+        new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12");
+    updateModificationTimeRecursive(fs, partitionPath, newModifTime);
     List<org.apache.hadoop.fs.Path> resultingFiles =
         ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime);
     // check if resultingFiles contains the append data only (through the partition names)
     for (org.apache.hadoop.fs.Path p : resultingFiles) {
       String pathString = p.toString();
-      //  should be TRUE
+      //  should be TRUE (test for many partitions)
       boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027");
 
-      // should be FALSE
+      // should be FALSE (test for many partitions)
       boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
-
+      long modTime = fs.getFileStatus(p).getModificationTime();
+      // test for one partition value
+      assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }

From 0ec8cbb7ed740cbe72687d02878742167bca9ac8 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Fri, 2 Jan 2026 23:22:34 +0100
Subject: [PATCH 17/54] fix test with basic example where partitions are not
 considered3

---
 .../xtable/parquet/ParquetDataManager.java      | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 01fb6b75f..e342cdacb 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -122,11 +122,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType
     // save block indexes and modification time (for later sync related retrieval) in the metadata
     // of the output
     // table
-    combinedMeta.put(newKey, String.valueOf(fileStatus.getModificationTime()));
-    combinedMeta.put("total_appends", String.valueOf(appendCount + 1));
+    int currentAppendIdx = appendCount + 1;
+    long startBlock = firstBlockIndex;
+    long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size();
+    long endBlock = startBlock + blocksAdded;
+
+    combinedMeta.put("total_appends", String.valueOf(currentAppendIdx));
+    combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock));
+    combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock));
     combinedMeta.put(
-        "index_start_block_of_append_" + String.valueOf(appendCount + 1),
-        String.valueOf(firstBlockIndex + 1));
+        "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
     writer.end(combinedMeta);
   }
   // selective compaction of parquet blocks
@@ -156,13 +161,13 @@ public static List<Path> formNewTargetFiles(
                 bigFileFooter
                     .getFileMetaData()
                     .getKeyValueMetaData()
-                    .get("index_start_block_of_append_" + totalAppends));
+                    .get("index_start_block_of_append_" + i));
         int endBlock =
             Integer.valueOf(
                 bigFileFooter
                     .getFileMetaData()
                     .getKeyValueMetaData()
-                    .get("index_end_block_of_append_" + (totalAppends + 1)));
+                    .get("index_end_block_of_append_" + i));
         Path targetSyncFilePath =
             new Path(
                 status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock));

From 9cb75dfd13c1180b38095e6d3ab5b0f80d97f2c8 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 11:29:34 +0100
Subject: [PATCH 18/54] test with time of last append is now

---
 .../java/org/apache/xtable/parquet/ITParquetDataManager.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index ad5a41f2a..ac2a94887 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -79,7 +79,7 @@ public void testAppendParquetFile() throws IOException {
     // TODO create the folders manually for a new partition value as appendFile works only within
     // the same partition value
 
-    long newModifTime = System.currentTimeMillis() - 5000;
+    long newModifTime = System.currentTimeMillis() - 50000;
     // TODO many partitions case
     // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
     /* for (String partition : newPartitions) {
@@ -90,7 +90,7 @@ public void testAppendParquetFile() throws IOException {
     }*/
     org.apache.hadoop.fs.Path partitionPath =
         new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12");
-    updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+    // updateModificationTimeRecursive(fs, partitionPath, newModifTime);
     List<org.apache.hadoop.fs.Path> resultingFiles =
         ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime);
     // check if resultingFiles contains the append data only (through the partition names)

From 9e125f242ccd56fa1770ae27dbd6c0e0db4bc3e1 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 12:07:27 +0100
Subject: [PATCH 19/54] test appendFile with Parquet: TODO test with multiple
 partitions 1) append and 2) filter for sync

---
 .../xtable/parquet/ParquetDataManager.java    |  14 +--
 .../xtable/parquet/ITParquetDataManager.java  | 101 +++++++++++++++++-
 2 files changed, 106 insertions(+), 9 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index e342cdacb..5e8bb25bb 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -54,13 +54,14 @@ public class ParquetDataManager {
   /* Use Parquet API to append to a file */
 
   // after appending check required before appending the file
-  private boolean checkIfSchemaIsSame(Configuration conf, Path fileToAppend, Path fileFromTable) {
+  private static boolean checkIfSchemaIsSame(
+      Configuration conf, Path fileToAppend, Path fileFromTable) {
     ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
     ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
     return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
   }
 
-  private ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) {
+  private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) {
     ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
     return parquetFileConfig;
   }
@@ -88,7 +89,7 @@ private String getPartitionPath(Path path) {
     return path.getParent().getName();
   }
   // append a file into a table (merges two files into one .parquet under a partition folder)
-  public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
+  public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
       throws IOException {
     Configuration conf = new Configuration();
     long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex();
@@ -133,15 +134,16 @@ public void appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType
     combinedMeta.put(
         "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
     writer.end(combinedMeta);
+    return filePath;
   }
   // selective compaction of parquet blocks
   public static List<Path> formNewTargetFiles(
-      Configuration conf, Path directoryPath, long targetModifTime) throws IOException {
+      Configuration conf, Path partitionPath, long targetModifTime) throws IOException {
     List<Path> finalPaths = new ArrayList<>();
-    FileSystem fs = directoryPath.getFileSystem(conf);
+    FileSystem fs = partitionPath.getFileSystem(conf);
 
     FileStatus[] statuses =
-        fs.listStatus(directoryPath, path -> path.getName().endsWith(".parquet"));
+        fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet"));
 
     for (FileStatus status : statuses) {
 
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index ac2a94887..b26460a32 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -28,6 +28,10 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Types;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;
@@ -40,9 +44,9 @@
 public class ITParquetDataManager {
 
   @Test
-  public void testAppendParquetFile() throws IOException {
+  public void testFormParquetFileSinglePartition() throws IOException {
     SparkSession spark =
-        SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
+        SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
         DataTypes.createStructType(
@@ -107,7 +111,98 @@ public void testAppendParquetFile() throws IOException {
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
-    // TODO test appendNewParquetFile() (using a non-Spark approach to append a parquet file)
+    spark.stop();
+  }
+
+  @Test
+  public void testAppendParquetFileSinglePartition() throws IOException {
+    SparkSession spark =
+        SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
+    Configuration conf = spark.sparkContext().hadoopConfiguration();
+    MessageType schemaParquet =
+        Types.buildMessage()
+            .required(PrimitiveType.PrimitiveTypeName.INT32)
+            .named("id")
+            .required(PrimitiveType.PrimitiveTypeName.BINARY)
+            .as(LogicalTypeAnnotation.stringType())
+            .named("value")
+            .required(PrimitiveType.PrimitiveTypeName.INT32)
+            .named("year")
+            .required(PrimitiveType.PrimitiveTypeName.INT32)
+            .named("month")
+            .named("parquet_schema");
+    StructType schema =
+        DataTypes.createStructType(
+            new StructField[] {
+              DataTypes.createStructField("id", DataTypes.IntegerType, false),
+              DataTypes.createStructField("value", DataTypes.StringType, false),
+              DataTypes.createStructField("year", DataTypes.IntegerType, false),
+              DataTypes.createStructField("month", DataTypes.IntegerType, false)
+            });
+    List<Row> data =
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
+    /* RowFactory.create(201, "C", 2025, 11),
+    RowFactory.create(301, "D", 2024, 7));*/
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test");
+    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test");
+    String outputPath = fixedPath.toString();
+    String finalAppendFilePath = appendFilePath.toString();
+
+    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
+
+    // test find files to sync
+    long targetModifTime = System.currentTimeMillis() - 360000;
+    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
+    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
+    // set the modification time to the table file
+    updateModificationTimeRecursive(fs, hdfsPath, targetModifTime);
+    // create new file to append using Spark
+    List<Row> futureDataToSync =
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
+    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
+    dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath);
+    long newModifTime = System.currentTimeMillis() - 50000;
+    // update modifTime for file to append
+    updateModificationTimeRecursive(
+        fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime);
+    org.apache.hadoop.fs.Path outputFile =
+        ParquetDataManager.appendNewParquetFiles(
+            new org.apache.hadoop.fs.Path(outputPath),
+            new org.apache.hadoop.fs.Path(finalAppendFilePath),
+            schemaParquet);
+
+    // TODO many partitions case
+    // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
+    /* for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+      }
+    }*/
+    org.apache.hadoop.fs.Path partitionPath =
+        new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12");
+    // updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+
+    // test whether the appended data can be selectively filtered for incr sync (e.g. get me the
+    // data to sync only)
+    List<org.apache.hadoop.fs.Path> resultingFiles =
+        ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime);
+    // check if resultingFiles contains the append data only (through the partition names)
+    for (org.apache.hadoop.fs.Path p : resultingFiles) {
+      String pathString = p.toString();
+      //  should be TRUE (test for many partitions)
+      boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027");
+
+      // should be FALSE (test for many partitions)
+      boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
+      long modTime = fs.getFileStatus(p).getModificationTime();
+      // test for one partition value
+      assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
+      assertTrue(isNewData, "Path should belong to appended data: " + pathString);
+      //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
+    }
 
     spark.stop();
   }

From 233ca770baccb5a6fd00d800bafb1029a7cb5720 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 13:10:02 +0100
Subject: [PATCH 20/54] merge recursively one partition files

---
 .../xtable/parquet/ParquetDataManager.java    | 47 ++++++++++++++++++-
 .../xtable/parquet/ITParquetDataManager.java  |  5 +-
 2 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 5e8bb25bb..6384f5c01 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -88,7 +88,7 @@ private String getPartitionPath(Path path) {
     // For nested partitions, you might need path.getParent().getParent()...
     return path.getParent().getName();
   }
-  // append a file into a table (merges two files into one .parquet under a partition folder)
+  // append a file (merges two files into one .parquet under a partition folder)
   public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
       throws IOException {
     Configuration conf = new Configuration();
@@ -193,4 +193,49 @@ public static List<Path> formNewTargetFiles(
     }
     return finalPaths;
   }
+
+  public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema)
+      throws IOException {
+    Configuration conf = new Configuration();
+    FileSystem fs = sourcePath.getFileSystem(conf);
+
+    List<Path> targetFiles = collectParquetFiles(fs, targetPath);
+
+    List<Path> sourceFiles = collectParquetFiles(fs, sourcePath);
+
+    if (targetFiles.isEmpty()) {
+      throw new IOException("Target directory contains no parquet files to append to.");
+    }
+    if (sourceFiles.isEmpty()) {
+      return targetPath;
+    }
+    Path masterTargetFile = targetFiles.get(0);
+
+    for (int i = 1; i < targetFiles.size(); i++) {
+      appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema);
+    }
+    for (Path sourceFile : sourceFiles) {
+      appendNewParquetFiles(masterTargetFile, sourceFile, schema);
+    }
+
+    return masterTargetFile;
+  }
+
+  private static List<Path> collectParquetFiles(FileSystem fs, Path root) throws IOException {
+    List<Path> parquetFiles = new ArrayList<>();
+    if (!fs.exists(root)) return parquetFiles;
+
+    if (fs.getFileStatus(root).isDirectory()) {
+      RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
+      while (it.hasNext()) {
+        Path p = it.next().getPath();
+        if (p.getName().endsWith(".parquet")) {
+          parquetFiles.add(p);
+        }
+      }
+    } else {
+      parquetFiles.add(root);
+    }
+    return parquetFiles;
+  }
 }
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index b26460a32..3d7619cf3 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -167,11 +167,14 @@ public void testAppendParquetFileSinglePartition() throws IOException {
     // update modifTime for file to append
     updateModificationTimeRecursive(
         fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime);
+    // recursively append all files under a specific partition path (e.g. "year=2026/month=12")
+    // (assuming that outputPath and finalAppendFilePath are same partitions for different files)
     org.apache.hadoop.fs.Path outputFile =
-        ParquetDataManager.appendNewParquetFiles(
+        ParquetDataManager.appendPartitionedData(
             new org.apache.hadoop.fs.Path(outputPath),
             new org.apache.hadoop.fs.Path(finalAppendFilePath),
             schemaParquet);
+    // can create big file which can be split if reaches a certain threshold size
 
     // TODO many partitions case
     // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");

From b4cba5a1c358689a8953e7b8d7916d2d2d51abc7 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 13:39:56 +0100
Subject: [PATCH 21/54] fix paths for files to append

---
 .../main/java/org/apache/xtable/parquet/ParquetDataManager.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 6384f5c01..da88a50e2 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -97,7 +97,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
         new ParquetFileWriter(
             HadoopOutputFile.fromPath(filePath, conf),
             schema,
-            ParquetFileWriter.Mode.CREATE,
+            ParquetFileWriter.Mode.OVERWRITE,
             DEFAULT_BLOCK_SIZE,
             0);
     // write the initial table with the appended file to add into the outputPath

From a564b29a7a11659b9ac647996cebfe1f7a34ca51 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 14:08:27 +0100
Subject: [PATCH 22/54] fix bug of appending file path

---
 .../java/org/apache/xtable/parquet/ParquetDataManager.java   | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index da88a50e2..fbf546d37 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -93,9 +93,10 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
       throws IOException {
     Configuration conf = new Configuration();
     long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex();
+    Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp");
     ParquetFileWriter writer =
         new ParquetFileWriter(
-            HadoopOutputFile.fromPath(filePath, conf),
+            HadoopOutputFile.fromPath(tempPath, conf),
             schema,
             ParquetFileWriter.Mode.OVERWRITE,
             DEFAULT_BLOCK_SIZE,
@@ -134,6 +135,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     combinedMeta.put(
         "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
     writer.end(combinedMeta);
+    fs.delete(filePath, false);
+    fs.rename(tempPath, filePath);
     return filePath;
   }
   // selective compaction of parquet blocks

From d1ceafba5cef0f8a916a994d44e74bbecfb45f64 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 14:38:15 +0100
Subject: [PATCH 23/54] fix bug of schema

---
 .../java/org/apache/xtable/parquet/ITParquetDataManager.java | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 3d7619cf3..6c490bcbb 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -119,6 +119,7 @@ public void testAppendParquetFileSinglePartition() throws IOException {
     SparkSession spark =
         SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
     Configuration conf = spark.sparkContext().hadoopConfiguration();
+    // In testAppendParquetFileSinglePartition
     MessageType schemaParquet =
         Types.buildMessage()
             .required(PrimitiveType.PrimitiveTypeName.INT32)
@@ -126,10 +127,6 @@ public void testAppendParquetFileSinglePartition() throws IOException {
             .required(PrimitiveType.PrimitiveTypeName.BINARY)
             .as(LogicalTypeAnnotation.stringType())
             .named("value")
-            .required(PrimitiveType.PrimitiveTypeName.INT32)
-            .named("year")
-            .required(PrimitiveType.PrimitiveTypeName.INT32)
-            .named("month")
             .named("parquet_schema");
     StructType schema =
         DataTypes.createStructType(

From 649250fbfb939367ce4108e7e6bef267911859b8 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 15:10:27 +0100
Subject: [PATCH 24/54] make sure returned files are non empty when iterating
 in the partitions

---
 .../org/apache/xtable/parquet/ParquetDataManager.java     | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index fbf546d37..2650e4791 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -226,14 +226,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa
 
   private static List<Path> collectParquetFiles(FileSystem fs, Path root) throws IOException {
     List<Path> parquetFiles = new ArrayList<>();
+    Configuration conf = fs.getConf();
     if (!fs.exists(root)) return parquetFiles;
 
     if (fs.getFileStatus(root).isDirectory()) {
       RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
       while (it.hasNext()) {
         Path p = it.next().getPath();
-        if (p.getName().endsWith(".parquet")) {
-          parquetFiles.add(p);
+        if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) {
+          ParquetMetadata footer = ParquetFileReader.readFooter(conf, p);
+          if (!footer.getBlocks().isEmpty()) {
+            parquetFiles.add(p);
+          }
         }
       }
     } else {

From 24ff828b03af7d03cdecb3556fbe71254802d682 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 15:32:58 +0100
Subject: [PATCH 25/54] make sure returned files are non empty when iterating
 in the partitions 2

---
 .../java/org/apache/xtable/parquet/ParquetDataManager.java  | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 2650e4791..991879d05 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -233,9 +233,11 @@ private static List<Path> collectParquetFiles(FileSystem fs, Path root) throws I
       RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
       while (it.hasNext()) {
         Path p = it.next().getPath();
-        if (p.getName().endsWith(".parquet") && !p.getName().startsWith(".")) {
+        if (p.getName().endsWith(".parquet")
+            && !p.toString().contains("/.")
+            && !p.toString().contains("/_")) {
           ParquetMetadata footer = ParquetFileReader.readFooter(conf, p);
-          if (!footer.getBlocks().isEmpty()) {
+          if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) {
             parquetFiles.add(p);
           }
         }

From 463d2eea37664ce1aab0414c8ac366f665abb1e6 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 16:20:21 +0100
Subject: [PATCH 26/54] discard empty files when appending

---
 .../org/apache/xtable/parquet/ParquetDataManager.java    | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 991879d05..942bf95e8 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -215,9 +215,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa
     Path masterTargetFile = targetFiles.get(0);
 
     for (int i = 1; i < targetFiles.size(); i++) {
+      long len = fs.getFileStatus(targetFiles.get(i)).getLen();
+      System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len);
       appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema);
     }
     for (Path sourceFile : sourceFiles) {
+      ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile);
+      if (sourceFooter.getBlocks().isEmpty()) {
+        System.out.println("SKIPPING: " + sourceFile + " has no data.");
+        continue;
+      }
+      long len = fs.getFileStatus(sourceFile).getLen();
+      System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len);
       appendNewParquetFiles(masterTargetFile, sourceFile, schema);
     }
 

From 71d1c34f97f12db45840277a7f7f4cfa095e8222 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 16:45:59 +0100
Subject: [PATCH 27/54] discard empty files when appending

---
 .../apache/xtable/parquet/ParquetDataManager.java   | 13 ++++++++++++-
 .../apache/xtable/parquet/ITParquetDataManager.java |  7 ++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 942bf95e8..76efa93fb 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -212,7 +212,18 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa
     if (sourceFiles.isEmpty()) {
       return targetPath;
     }
-    Path masterTargetFile = targetFiles.get(0);
+    Path masterTargetFile = null;
+    for (Path p : targetFiles) {
+      ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p);
+      if (!metadata.getBlocks().isEmpty()) {
+        masterTargetFile = p;
+        break;
+      }
+    }
+
+    if (masterTargetFile == null) {
+      throw new IOException("Target directory contains no files with valid data blocks.");
+    }
 
     for (int i = 1; i < targetFiles.size(); i++) {
       long len = fs.getFileStatus(targetFiles.get(i)).getLen();
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 6c490bcbb..bd419c885 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -159,7 +159,12 @@ public void testAppendParquetFileSinglePartition() throws IOException {
     List<Row> futureDataToSync =
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
     Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
-    dfToSync.write().partitionBy("year", "month").mode("overwrite").parquet(finalAppendFilePath);
+    dfToSync
+        .coalesce(1)
+        .write()
+        .partitionBy("year", "month")
+        .mode("overwrite")
+        .parquet(finalAppendFilePath);
     long newModifTime = System.currentTimeMillis() - 50000;
     // update modifTime for file to append
     updateModificationTimeRecursive(

From 3319a91429eec91faeaabb3dce4d0eac95b5b3a5 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 17:08:36 +0100
Subject: [PATCH 28/54] one spark session for both tests

---
 .../xtable/parquet/ITParquetDataManager.java  | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index bd419c885..d12e3016d 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -39,14 +39,22 @@
 import org.apache.spark.sql.types.DataTypes;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
 public class ITParquetDataManager {
+  private static SparkSession spark;
+
+  @BeforeAll
+  public static void setup() {
+    spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate();
+  }
 
   @Test
   public void testFormParquetFileSinglePartition() throws IOException {
-    SparkSession spark =
-        SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();
+    /*SparkSession spark =
+    SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
         DataTypes.createStructType(
@@ -111,13 +119,13 @@ public void testFormParquetFileSinglePartition() throws IOException {
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
-    spark.stop();
+    // spark.stop();
   }
 
   @Test
   public void testAppendParquetFileSinglePartition() throws IOException {
-    SparkSession spark =
-        SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();
+    /*SparkSession spark =
+    SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     // In testAppendParquetFileSinglePartition
     MessageType schemaParquet =
@@ -209,7 +217,7 @@ public void testAppendParquetFileSinglePartition() throws IOException {
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
 
-    spark.stop();
+    // spark.stop();
   }
 
   private void updateModificationTimeRecursive(
@@ -223,4 +231,11 @@ private void updateModificationTimeRecursive(
       }
     }
   }
+
+  @AfterAll
+  public static void tearDown() {
+    if (spark != null) {
+      spark.stop();
+    }
+  }
 }

From 013ffe4c000b68992460679180eb448ef63c1974 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 17:41:31 +0100
Subject: [PATCH 29/54] bug fix

---
 .../xtable/parquet/ParquetDataManager.java      | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 76efa93fb..471925c1d 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -93,6 +93,8 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
       throws IOException {
     Configuration conf = new Configuration();
     long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex();
+    ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath);
+    Map<String, String> existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData();
     Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp");
     ParquetFileWriter writer =
         new ParquetFileWriter(
@@ -110,14 +112,11 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
       writer.appendFile(inputFileToAppend);
     }
     Map<String, String> combinedMeta = new HashMap<>();
+    if (existingMeta != null) {
+      combinedMeta.putAll(existingMeta);
+    }
     // track the append date and save it in the footer
-    int appendCount =
-        Integer.parseInt(
-            ParquetFileReader.readFooter(conf, filePath)
-                .getFileMetaData()
-                .getKeyValueMetaData()
-                .getOrDefault("total_appends", "0"));
-    String newKey = "append_date_" + (appendCount + 1);
+    int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0"));
     // get the equivalent fileStatus from the file-to-append Path
     FileSystem fs = FileSystem.get(conf);
     FileStatus fileStatus = fs.getFileStatus(fileToAppend);
@@ -127,7 +126,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     int currentAppendIdx = appendCount + 1;
     long startBlock = firstBlockIndex;
     long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size();
-    long endBlock = startBlock + blocksAdded;
+    long endBlock = startBlock + blocksAdded - 1;
 
     combinedMeta.put("total_appends", String.valueOf(currentAppendIdx));
     combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock));
@@ -160,7 +159,7 @@ public static List<Path> formNewTargetFiles(
                   .getFileMetaData()
                   .getKeyValueMetaData()
                   .getOrDefault("total_appends", "0"));
-      for (int i = 1; i < totalAppends; i++) {
+      for (int i = 1; i <= totalAppends; i++) {
         int startBlock =
             Integer.valueOf(
                 bigFileFooter

From b939a0ac1229ca6d8fd5cec695d98a0f31c5d395 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 18:03:43 +0100
Subject: [PATCH 30/54] bug fix

---
 .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 471925c1d..bf0035630 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -172,9 +172,10 @@ public static List<Path> formNewTargetFiles(
                     .getFileMetaData()
                     .getKeyValueMetaData()
                     .get("index_end_block_of_append_" + i));
-        Path targetSyncFilePath =
-            new Path(
-                status.getPath().getName() + String.valueOf(startBlock) + String.valueOf(endBlock));
+        String newFileName =
+            String.format(
+                "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock);
+        Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName);
         try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) {
           writer.start();
           for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {

From b6d8ddccb49eb83e687f327fee9e3e80f356421c Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 3 Jan 2026 18:36:47 +0100
Subject: [PATCH 31/54] cleanups + TODO: partitions match and merge

---
 .../apache/xtable/parquet/ParquetDataManager.java    |  6 ++----
 .../apache/xtable/parquet/ITParquetDataManager.java  | 12 ------------
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index bf0035630..7263612e6 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -65,10 +65,10 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f
     ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
     return parquetFileConfig;
   }
-
+  // TODO use this method to match the partition folders and merge them one by one (if one exists
+  // but not in other file then create it)
   public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema)
       throws IOException {
-    Configuration conf = new Configuration();
 
     // e.g., "year=2024/month=12"
     String targetPartition = getPartitionPath(targetPath);
@@ -84,8 +84,6 @@ public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageTy
   }
 
   private String getPartitionPath(Path path) {
-    // Returns the parent directory name (e.g., country=US)
-    // For nested partitions, you might need path.getParent().getParent()...
     return path.getParent().getName();
   }
   // append a file (merges two files into one .parquet under a partition folder)
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index d12e3016d..420618b92 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -53,8 +53,6 @@ public static void setup() {
 
   @Test
   public void testFormParquetFileSinglePartition() throws IOException {
-    /*SparkSession spark =
-    SparkSession.builder().appName("TestCreateFunctionnality").master("local[*]").getOrCreate();*/
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
         DataTypes.createStructType(
@@ -67,9 +65,6 @@ public void testFormParquetFileSinglePartition() throws IOException {
 
     List<Row> data =
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
-    /* RowFactory.create(201, "C", 2025, 11),
-    RowFactory.create(301, "D", 2024, 7));*/
-
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
     String outputPath = fixedPath.toString();
@@ -119,13 +114,10 @@ public void testFormParquetFileSinglePartition() throws IOException {
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
-    // spark.stop();
   }
 
   @Test
   public void testAppendParquetFileSinglePartition() throws IOException {
-    /*SparkSession spark =
-    SparkSession.builder().appName("TestAppendFunctionnality").master("local[*]").getOrCreate();*/
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     // In testAppendParquetFileSinglePartition
     MessageType schemaParquet =
@@ -146,8 +138,6 @@ public void testAppendParquetFileSinglePartition() throws IOException {
             });
     List<Row> data =
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
-    /* RowFactory.create(201, "C", 2025, 11),
-    RowFactory.create(301, "D", 2024, 7));*/
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test");
@@ -216,8 +206,6 @@ public void testAppendParquetFileSinglePartition() throws IOException {
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
       //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
-
-    // spark.stop();
   }
 
   private void updateModificationTimeRecursive(

From c18ab1c23a16d12accb00e40842261e4931778a8 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 15:57:08 +0100
Subject: [PATCH 32/54] added test for many partitions, TODO integrate
 functions into ParquetConversionSource

---
 .../xtable/parquet/ParquetDataManager.java    |  46 ++++++
 .../xtable/parquet/ITParquetDataManager.java  | 149 +++++++++++++-----
 2 files changed, 156 insertions(+), 39 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 7263612e6..cfd3d66c3 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -265,4 +265,50 @@ private static List<Path> collectParquetFiles(FileSystem fs, Path root) throws I
     }
     return parquetFiles;
   }
+
+  public static List<Path> mergeDatasetsByPartition(
+      Path targetRoot, Path sourceRoot, MessageType schema) throws IOException {
+    List<Path> finalPartitionPaths = new ArrayList<>();
+    Configuration conf = new Configuration();
+    FileSystem fs = targetRoot.getFileSystem(conf);
+
+    Map<String, Path> sourcePartitions = mapPartitions(fs, sourceRoot);
+
+    Map<String, Path> targetPartitions = mapPartitions(fs, targetRoot);
+
+    for (Map.Entry<String, Path> entry : sourcePartitions.entrySet()) {
+      String partitionKey = entry.getKey();
+      Path sourcePartitionPath = entry.getValue();
+
+      if (targetPartitions.containsKey(partitionKey)) {
+        Path targetPartitionPath = targetPartitions.get(partitionKey);
+
+        finalPartitionPaths.add(
+            appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema));
+      } else {
+        // TODO logic for when a partition exists in source but not in target
+        // simply merge the files in the respective partitions using appendPartitionedData() that
+        // takes only one Path/dir
+
+      }
+    }
+    return finalPartitionPaths;
+  }
+
+  private static Map<String, Path> mapPartitions(FileSystem fs, Path root) throws IOException {
+    Map<String, Path> partitionMap = new HashMap<>();
+    RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
+
+    while (it.hasNext()) {
+      Path filePath = it.next().getPath();
+      if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) {
+        Path parent = filePath.getParent();
+        String relative = parent.toString().replace(root.toString(), "");
+        if (relative.startsWith("/")) relative = relative.substring(1);
+
+        partitionMap.put(relative, parent);
+      }
+    }
+    return partitionMap;
+  }
 }
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 420618b92..fe2bcc069 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,11 +18,13 @@
  
 package org.apache.xtable.parquet;
 
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
@@ -41,6 +43,7 @@
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class ITParquetDataManager {
@@ -52,6 +55,7 @@ public static void setup() {
   }
 
   @Test
+  @Disabled("This test already passed CI")
   public void testFormParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
@@ -68,8 +72,6 @@ public void testFormParquetFileSinglePartition() throws IOException {
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
     String outputPath = fixedPath.toString();
-    ;
-
     df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
     // test find files to sync
@@ -83,40 +85,26 @@ public void testFormParquetFileSinglePartition() throws IOException {
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
     Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
     dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath);
-    // TODO create the folders manually for a new partition value as appendFile works only within
-    // the same partition value
 
     long newModifTime = System.currentTimeMillis() - 50000;
-    // TODO many partitions case
-    // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
-    /* for (String partition : newPartitions) {
-      org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
-      if (fs.exists(partitionPath)) {
-        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
-      }
-    }*/
-    org.apache.hadoop.fs.Path partitionPath =
-        new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12");
-    // updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+
     List<org.apache.hadoop.fs.Path> resultingFiles =
         ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime);
     // check if resultingFiles contains the append data only (through the partition names)
     for (org.apache.hadoop.fs.Path p : resultingFiles) {
       String pathString = p.toString();
-      //  should be TRUE (test for many partitions)
-      boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027");
 
-      // should be FALSE (test for many partitions)
-      boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
+      boolean isNewData = pathString.contains("year=2026");
+
       long modTime = fs.getFileStatus(p).getModificationTime();
       // test for one partition value
       assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-      //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
     }
   }
 
   @Test
+  @Disabled("This test already passed CI")
   public void testAppendParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     // In testAppendParquetFileSinglePartition
@@ -174,19 +162,7 @@ public void testAppendParquetFileSinglePartition() throws IOException {
             new org.apache.hadoop.fs.Path(outputPath),
             new org.apache.hadoop.fs.Path(finalAppendFilePath),
             schemaParquet);
-    // can create big file which can be split if reaches a certain threshold size
-
-    // TODO many partitions case
-    // List<String> newPartitions = Arrays.asList("year=2026/month=12"); // , "year=2027/month=7");
-    /* for (String partition : newPartitions) {
-      org.apache.hadoop.fs.Path partitionPath = new org.apache.hadoop.fs.Path(hdfsPath, partition);
-      if (fs.exists(partitionPath)) {
-        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
-      }
-    }*/
-    org.apache.hadoop.fs.Path partitionPath =
-        new org.apache.hadoop.fs.Path(hdfsPath, "year=2026/month=12");
-    // updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+    // TODO can create big file which can be split if reaches a certain threshold size
 
     // test whether the appended data can be selectively filtered for incr sync (e.g. get me the
     // data to sync only)
@@ -195,16 +171,111 @@ public void testAppendParquetFileSinglePartition() throws IOException {
     // check if resultingFiles contains the append data only (through the partition names)
     for (org.apache.hadoop.fs.Path p : resultingFiles) {
       String pathString = p.toString();
-      //  should be TRUE (test for many partitions)
-      boolean isNewData = pathString.contains("year=2026"); // || pathString.contains("year=2027");
-
-      // should be FALSE (test for many partitions)
-      boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
+      boolean isNewData = pathString.contains("year=2026");
       long modTime = fs.getFileStatus(p).getModificationTime();
       // test for one partition value
       assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
       assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-      //  assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
+    }
+  }
+
+  @Test
+  public void testAppendParquetFileMultiplePartition() throws IOException {
+    Configuration conf = spark.sparkContext().hadoopConfiguration();
+
+    MessageType schemaParquet =
+        Types.buildMessage()
+            .required(PrimitiveType.PrimitiveTypeName.INT32)
+            .named("id")
+            .required(PrimitiveType.PrimitiveTypeName.BINARY)
+            .as(LogicalTypeAnnotation.stringType())
+            .named("value")
+            .named("parquet_schema");
+    StructType schema =
+        DataTypes.createStructType(
+            new StructField[] {
+              DataTypes.createStructField("id", DataTypes.IntegerType, false),
+              DataTypes.createStructField("value", DataTypes.StringType, false),
+              DataTypes.createStructField("year", DataTypes.IntegerType, false),
+              DataTypes.createStructField("month", DataTypes.IntegerType, false)
+            });
+    List<Row> data =
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11));
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test");
+    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test");
+    String outputPath = fixedPath.toString();
+    String finalAppendFilePath = appendFilePath.toString();
+
+    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
+
+    // test find files to sync
+    long targetModifTime = System.currentTimeMillis() - 360000;
+    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
+    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
+    // set the modification time to the table file
+    // update modifTime for file to append
+    // many partitions case
+    List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11");
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath =
+          new org.apache.hadoop.fs.Path(outputPath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, targetModifTime);
+      }
+    }
+    // create new file to append using Spark
+    List<Row> futureDataToSync =
+        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11));
+    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
+    dfToSync
+        .coalesce(1)
+        .write()
+        .partitionBy("year", "month")
+        .mode("overwrite")
+        .parquet(finalAppendFilePath);
+    long newModifTime = System.currentTimeMillis() - 50000;
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath =
+          new org.apache.hadoop.fs.Path(finalAppendFilePath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+      }
+    }
+    // recursively append all files under a specific partition path (e.g. "year=2026/month=12")
+    // (assuming that outputPath and finalAppendFilePath are same partitions for different files)
+    // the result contains all partition paths with the parquet files compacted
+    List<org.apache.hadoop.fs.Path> outputPaths =
+        ParquetDataManager.mergeDatasetsByPartition(
+            new org.apache.hadoop.fs.Path(outputPath),
+            new org.apache.hadoop.fs.Path(finalAppendFilePath),
+            schemaParquet);
+    // TODO can create big file which can be split if reaches a certain threshold size
+    // implement bin-packing function for large files
+
+    // test whether the appended data can be selectively filtered for incr sync (e.g. get me the
+    // data to sync only)
+    List<List<org.apache.hadoop.fs.Path>> finalPaths = new ArrayList<>();
+    for (org.apache.hadoop.fs.Path resPath : outputPaths) {
+      List<org.apache.hadoop.fs.Path> resultingFiles =
+          ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime);
+      finalPaths.add(resultingFiles);
+    }
+    // check if resultingFiles contains the append data only
+    for (List<org.apache.hadoop.fs.Path> fPaths : finalPaths) {
+      for (org.apache.hadoop.fs.Path p : fPaths) {
+        String pathString = p.toString();
+        //  should be TRUE
+        boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027");
+        // should be FALSE
+        boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
+        long modTime = fs.getFileStatus(p).getModificationTime();
+        // test for one partition value
+        assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
+        assertTrue(isNewData, "Path should belong to appended data: " + pathString);
+        assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
+      }
     }
   }
 

From b7c613e0def55e32b121e65ceef45da241b9d5e3 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 17:39:04 +0100
Subject: [PATCH 33/54] selecting data bug fix, TODO integrate functions into
 ParquetConversionSource

---
 .../java/org/apache/xtable/parquet/ParquetDataManager.java | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index cfd3d66c3..9fd6fd925 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -170,6 +170,9 @@ public static List<Path> formNewTargetFiles(
                     .getFileMetaData()
                     .getKeyValueMetaData()
                     .get("index_end_block_of_append_" + i));
+        long modifTime =
+            Long.parseLong(
+                bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i));
         String newFileName =
             String.format(
                 "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock);
@@ -178,9 +181,7 @@ public static List<Path> formNewTargetFiles(
           writer.start();
           for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {
             BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j);
-            if (j >= startBlock
-                && j <= endBlock
-                && status.getModificationTime() >= targetModifTime) {
+            if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) {
               List<BlockMetaData> blockList =
                   Collections.<BlockMetaData>singletonList(blockMetadata);
               FSDataInputStream targetInputStream = fs.open(status.getPath());

From 2a75f4925daedc63f6186fc34033478e16ba5621 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 17:59:33 +0100
Subject: [PATCH 34/54] run all tests, TODO integrate functions into
 ParquetConversionSource

---
 .../java/org/apache/xtable/parquet/ITParquetDataManager.java    | 2 --
 1 file changed, 2 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index fe2bcc069..de49a1d86 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -55,7 +55,6 @@ public static void setup() {
   }
 
   @Test
-  @Disabled("This test already passed CI")
   public void testFormParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
@@ -104,7 +103,6 @@ public void testFormParquetFileSinglePartition() throws IOException {
   }
 
   @Test
-  @Disabled("This test already passed CI")
   public void testAppendParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     // In testAppendParquetFileSinglePartition

From f1538b07e0dc79ec9e36d2cc437f4209c01a0f3d Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 18:49:33 +0100
Subject: [PATCH 35/54] run all tests, TODO integrate functions into
 ParquetConversionSource

---
 .../apache/xtable/parquet/ITParquetDataManager.java    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index de49a1d86..0049af0bb 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -69,7 +69,7 @@ public void testFormParquetFileSinglePartition() throws IOException {
     List<Row> data =
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
     Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test");
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0");
     String outputPath = fixedPath.toString();
     df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
@@ -126,8 +126,8 @@ public void testAppendParquetFileSinglePartition() throws IOException {
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test");
-    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test");
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1");
+    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1");
     String outputPath = fixedPath.toString();
     String finalAppendFilePath = appendFilePath.toString();
 
@@ -201,8 +201,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11));
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test");
-    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test");
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2");
+    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2");
     String outputPath = fixedPath.toString();
     String finalAppendFilePath = appendFilePath.toString();
 

From cdedeaecf8a467229edf758166f62e74ddb25bab Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 19:08:47 +0100
Subject: [PATCH 36/54] spotless:apply, TODO integrate functions into
 ParquetConversionSource

---
 .../java/org/apache/xtable/parquet/ITParquetDataManager.java     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 0049af0bb..e47ddcf11 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -43,7 +43,6 @@
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class ITParquetDataManager {

From e87312031363e60f20029c7e6827414ea6420fb5 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 4 Jan 2026 23:50:19 +0100
Subject: [PATCH 37/54] bug fixes, TODO integrate functions into
 ParquetConversionSource

---
 .../xtable/parquet/ParquetDataManager.java    | 48 +++++++++----------
 .../xtable/parquet/ITParquetDataManager.java  | 27 +++++++++--
 2 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 9fd6fd925..bac6a5e74 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -65,27 +65,7 @@ private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path f
     ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
     return parquetFileConfig;
   }
-  // TODO use this method to match the partition folders and merge them one by one (if one exists
-  // but not in other file then create it)
-  public void appendWithPartitionCheck(Path targetPath, Path sourcePath, MessageType schema)
-      throws IOException {
-
-    // e.g., "year=2024/month=12"
-    String targetPartition = getPartitionPath(targetPath);
-    String sourcePartition = getPartitionPath(sourcePath);
-
-    if (!targetPartition.equals(sourcePartition)) {
-      throw new IllegalArgumentException(
-          "Partition Mismatch! Cannot merge " + sourcePartition + " into " + targetPartition);
-    }
-
-    // append files within the same partition foldr
-    appendNewParquetFiles(targetPath, sourcePath, schema);
-  }
-
-  private String getPartitionPath(Path path) {
-    return path.getParent().getName();
-  }
+  // TODO add safe guards for possible empty parquet files
   // append a file (merges two files into one .parquet under a partition folder)
   public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
       throws IOException {
@@ -105,6 +85,12 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     writer.start();
     HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
     InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf);
+    // get the equivalent fileStatus from the file-to-append Path
+    FileSystem fs = FileSystem.get(conf);
+    FileStatus fileStatus = fs.getFileStatus(fileToAppend);
+    // save modif Time (not needed for real case scenario)
+    // long prevModifTime = fileStatus.getModificationTime();
+
     if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
       writer.appendFile(inputFile);
       writer.appendFile(inputFileToAppend);
@@ -115,9 +101,7 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     }
     // track the append date and save it in the footer
     int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0"));
-    // get the equivalent fileStatus from the file-to-append Path
-    FileSystem fs = FileSystem.get(conf);
-    FileStatus fileStatus = fs.getFileStatus(fileToAppend);
+
     // save block indexes and modification time (for later sync related retrieval) in the metadata
     // of the output
     // table
@@ -133,9 +117,13 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
         "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
     writer.end(combinedMeta);
     fs.delete(filePath, false);
+    // restore modifTime not needed actually (only for test purposes)
+    // the append happens here so the time must NOT updated manually
+    // fs.setTimes(tempPath, prevModifTime, -1);
     fs.rename(tempPath, filePath);
     return filePath;
   }
+  // TODO add safe guards for possible empty parquet files
   // selective compaction of parquet blocks
   public static List<Path> formNewTargetFiles(
       Configuration conf, Path partitionPath, long targetModifTime) throws IOException {
@@ -178,6 +166,7 @@ public static List<Path> formNewTargetFiles(
                 "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock);
         Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName);
         try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) {
+          boolean wasDataAppended = false;
           writer.start();
           for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {
             BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j);
@@ -186,10 +175,17 @@ public static List<Path> formNewTargetFiles(
                   Collections.<BlockMetaData>singletonList(blockMetadata);
               FSDataInputStream targetInputStream = fs.open(status.getPath());
               writer.appendRowGroups(targetInputStream, blockList, false);
+              wasDataAppended = true;
             }
           }
+          // manually restore the modifTime after using appendRowGroups()
           writer.end(new HashMap<>());
-          finalPaths.add(targetSyncFilePath);
+          if (wasDataAppended) {
+            fs.setTimes(targetSyncFilePath, modifTime, -1);
+            finalPaths.add(targetSyncFilePath);
+          } else {
+            fs.delete(targetSyncFilePath, false);
+          }
         }
       }
     }
@@ -236,7 +232,9 @@ public static Path appendPartitionedData(Path targetPath, Path sourcePath, Messa
         continue;
       }
       long len = fs.getFileStatus(sourceFile).getLen();
+      long len_2 = fs.getFileStatus(masterTargetFile).getLen();
       System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len);
+      System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2);
       appendNewParquetFiles(masterTargetFile, sourceFile, schema);
     }
 
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index e47ddcf11..83d2c0a91 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -43,6 +43,7 @@
 import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 public class ITParquetDataManager {
@@ -54,6 +55,7 @@ public static void setup() {
   }
 
   @Test
+  @Disabled("")
   public void testFormParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     StructType schema =
@@ -102,6 +104,7 @@ public void testFormParquetFileSinglePartition() throws IOException {
   }
 
   @Test
+  @Disabled("")
   public void testAppendParquetFileSinglePartition() throws IOException {
     Configuration conf = spark.sparkContext().hadoopConfiguration();
     // In testAppendParquetFileSinglePartition
@@ -197,7 +200,11 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
               DataTypes.createStructField("month", DataTypes.IntegerType, false)
             });
     List<Row> data =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2027, 11));
+        Arrays.asList(
+            RowFactory.create(100, "A", 2026, 12),
+            RowFactory.create(101, "AA", 2026, 12),
+            RowFactory.create(102, "CB", 2027, 11),
+            RowFactory.create(103, "BA", 2027, 11));
 
     Dataset<Row> df = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2");
@@ -205,7 +212,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     String outputPath = fixedPath.toString();
     String finalAppendFilePath = appendFilePath.toString();
 
-    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
+    df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
     // test find files to sync
     long targetModifTime = System.currentTimeMillis() - 360000;
@@ -224,10 +231,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     }
     // create new file to append using Spark
     List<Row> futureDataToSync =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2027, 11));
+        Arrays.asList(
+            RowFactory.create(101, "A", 2026, 12),
+            RowFactory.create(301, "D", 2027, 11),
+            RowFactory.create(302, "DA", 2027, 11));
     Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
     dfToSync
-        .coalesce(1)
+        .coalesce(1) // since we are exec this test on low data volume
         .write()
         .partitionBy("year", "month")
         .mode("overwrite")
@@ -248,6 +258,13 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
             new org.apache.hadoop.fs.Path(outputPath),
             new org.apache.hadoop.fs.Path(finalAppendFilePath),
             schemaParquet);
+    // reset time FOR TESTING PURPOSES ONLY
+    // (in real, newModifTime time is the appending time as set by the appendRow() in the
+    // DataManager)
+    // THIS IS NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED
+    for (org.apache.hadoop.fs.Path oPath : outputPaths) {
+      fs.setTimes(oPath, newModifTime, -1);
+    }
     // TODO can create big file which can be split if reaches a certain threshold size
     // implement bin-packing function for large files
 
@@ -269,7 +286,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
         long modTime = fs.getFileStatus(p).getModificationTime();
         // test for one partition value
-        assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
+        assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString);
         assertTrue(isNewData, "Path should belong to appended data: " + pathString);
         assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
       }

From 4d0f245c02512840d16c1567a40201efea7ca678 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 5 Jan 2026 09:51:17 +0100
Subject: [PATCH 38/54] bug fixes, TODO integrate functions into
 ParquetConversionSource

---
 .../xtable/parquet/ParquetDataManager.java    | 32 +++++++++++++------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index bac6a5e74..e0f8daee2 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -88,9 +88,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     // get the equivalent fileStatus from the file-to-append Path
     FileSystem fs = FileSystem.get(conf);
     FileStatus fileStatus = fs.getFileStatus(fileToAppend);
-    // save modif Time (not needed for real case scenario)
-    // long prevModifTime = fileStatus.getModificationTime();
-
     if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
       writer.appendFile(inputFile);
       writer.appendFile(inputFileToAppend);
@@ -106,9 +103,23 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
     // of the output
     // table
     int currentAppendIdx = appendCount + 1;
-    long startBlock = firstBlockIndex;
-    long blocksAdded = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks().size();
-    long endBlock = startBlock + blocksAdded - 1;
+    List<BlockMetaData> initialBlocks = existingFooter.getBlocks();
+    long totalInitialCompressedSize = 0;
+    // first the blocks start at 0
+    for (BlockMetaData block : initialBlocks) {
+      totalInitialCompressedSize += block.getCompressedSize();
+    }
+    long startBlock = 0;
+    if (appendCount > 0) {
+      startBlock = totalInitialCompressedSize;
+    }
+
+    List<BlockMetaData> addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks();
+    long totalAddedCompressedSize = 0;
+    for (BlockMetaData block : addedBlocks) {
+      totalAddedCompressedSize += block.getCompressedSize();
+    }
+    long endBlock = totalInitialCompressedSize + totalAddedCompressedSize;
 
     combinedMeta.put("total_appends", String.valueOf(currentAppendIdx));
     combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock));
@@ -117,9 +128,6 @@ public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, Messa
         "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
     writer.end(combinedMeta);
     fs.delete(filePath, false);
-    // restore modifTime not needed actually (only for test purposes)
-    // the append happens here so the time must NOT updated manually
-    // fs.setTimes(tempPath, prevModifTime, -1);
     fs.rename(tempPath, filePath);
     return filePath;
   }
@@ -170,7 +178,11 @@ public static List<Path> formNewTargetFiles(
           writer.start();
           for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {
             BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j);
-            if (j >= startBlock && j <= endBlock && modifTime >= targetModifTime) {
+            long currentBlockStart = blockMetadata.getStartingPos();
+            long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize();
+            if (currentBlockStart >= startBlock
+                && currentBlockEnd <= endBlock
+                && modifTime >= targetModifTime) {
               List<BlockMetaData> blockList =
                   Collections.<BlockMetaData>singletonList(blockMetadata);
               FSDataInputStream targetInputStream = fs.open(status.getPath());

From 219656e02194c807c93868e7ab42c96d4e6fc136 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 12 Jan 2026 22:29:29 +0100
Subject: [PATCH 39/54] handle lightweight metadata in the conversionSource

---
 .../parquet/ParquetConversionSource.java      |  70 ++--
 .../xtable/parquet/ParquetDataManager.java    | 298 ++----------------
 .../xtable/parquet/ParquetFileConfig.java     |  20 +-
 .../xtable/parquet/ITParquetDataManager.java  | 275 ----------------
 4 files changed, 90 insertions(+), 573 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index e7eff9616..26d758026 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -56,6 +56,10 @@ public class ParquetConversionSource implements ConversionSource<Long> {
 
   private static final ParquetSchemaExtractor schemaExtractor =
       ParquetSchemaExtractor.getInstance();
+  private static final ParquetMetadataExtractor metadataExtractor =
+      ParquetMetadataExtractor.getInstance();
+  private static final ParquetDataManager parquetDataManagerExtractor =
+      ParquetDataManager.getInstance();
 
   private static final ParquetMetadataExtractor parquetMetadataExtractor =
       ParquetMetadataExtractor.getInstance();
@@ -69,7 +73,7 @@ public class ParquetConversionSource implements ConversionSource<Long> {
   private final String basePath;
   @NonNull private final Configuration hadoopConf;
 
-  private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) {
+  private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) {
     ParquetMetadata parquetMetadata =
         parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath());
     MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata);
@@ -87,15 +91,41 @@ private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile)
         .layoutStrategy(dataLayoutStrategy)
         .partitioningFields(partitionFields)
         .readSchema(schema)
-        .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime()))
+        .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime()))
         .build();
   }
 
+  public Stream<ParquetFileConfig> getConfigsFromStream(
+      Stream<LocatedFileStatus> fileStream, Configuration conf) {
+
+    return fileStream.map(
+        fileStatus -> {
+          Path path = fileStatus.getPath();
+
+          ParquetMetadata metadata =
+              ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
+
+          return ParquetFileConfig.builder()
+              .schema(metadata.getFileMetaData().getSchema())
+              .metadata(metadata)
+              .path(path)
+              .size(fileStatus.getLen())
+              .modifTime(fileStatus.getModificationTime())
+              .rowGroupIndex(0L)
+              .codec(
+                  metadata.getBlocks().isEmpty()
+                      ? null
+                      : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
+              .build();
+        });
+  }
+
   @Override
   public InternalTable getTable(Long modificationTime) {
     // get parquetFile at specific time modificationTime
     Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
-    LocatedFileStatus file = getParquetFileAt(parquetFiles, modificationTime);
+    Stream<ParquetFileConfig> parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf);
+    ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime);
     return createInternalTableFromFile(file);
   }
 
@@ -156,7 +186,10 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
         parquetFiles
             .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime)
             .collect(Collectors.toList());
-    InternalTable internalTable = getMostRecentTable(parquetFiles);
+    List<ParquetFileConfig> tableChangesAfterMetadata =
+        parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
+            hadoopConf, parquetFiles, modificationTime);
+    InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream());
     for (FileStatus tableStatus : tableChangesAfter) {
       InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus);
       addedInternalDataFiles.add(currentDataFile);
@@ -168,15 +201,15 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
         .build();
   }
 
-  private InternalTable getMostRecentTable(Stream<LocatedFileStatus> parquetFiles) {
-    LocatedFileStatus latestFile = getMostRecentParquetFile(parquetFiles);
+  private InternalTable getMostRecentTable(Stream<ParquetFileConfig> parquetFiles) {
+    ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles);
     return createInternalTableFromFile(latestFile);
   }
 
   @Override
   public InternalTable getCurrentTable() {
     Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
-    return getMostRecentTable(parquetFiles);
+    return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf));
   }
 
   /**
@@ -189,29 +222,30 @@ public InternalSnapshot getCurrentSnapshot() {
     // to avoid consume the stream call the method twice to return the same stream of parquet files
     Stream<InternalDataFile> internalDataFiles =
         getInternalDataFiles(getParquetFiles(hadoopConf, basePath));
-    InternalTable table = getMostRecentTable(getParquetFiles(hadoopConf, basePath));
+    Stream<ParquetFileConfig> parquetFilesMetadata =
+        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf);
+    InternalTable table = getMostRecentTable(parquetFilesMetadata);
     return InternalSnapshot.builder()
         .table(table)
         .sourceIdentifier(
-            getCommitIdentifier(
-                getMostRecentParquetFile(getParquetFiles(hadoopConf, basePath))
-                    .getModificationTime()))
+            getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime()))
         .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles))
         .build();
   }
 
-  private LocatedFileStatus getMostRecentParquetFile(Stream<LocatedFileStatus> parquetFiles) {
+  private ParquetFileConfig getMostRecentParquetFile(Stream<ParquetFileConfig> parquetFiles) {
     return parquetFiles
-        .max(Comparator.comparing(FileStatus::getModificationTime))
+        .max(Comparator.comparing(ParquetFileConfig::getModifTime))
         .orElseThrow(() -> new IllegalStateException("No files found"));
   }
 
-  private LocatedFileStatus getParquetFileAt(
-      Stream<LocatedFileStatus> parquetFiles, long modificationTime) {
-    return parquetFiles
-        .filter(fileStatus -> fileStatus.getModificationTime() == modificationTime)
+  private ParquetFileConfig getParquetFileAt(
+      Stream<ParquetFileConfig> parquetConfigs, long targetTime) {
+
+    return parquetConfigs
+        .filter(config -> config.getModifTime() >= targetTime)
         .findFirst()
-        .orElseThrow(() -> new IllegalStateException("No file found at " + modificationTime));
+        .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime));
   }
 
   private Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index e0f8daee2..5d09352ea 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -18,23 +18,14 @@
  
 package org.apache.xtable.parquet;
 
-import java.io.IOException;
 import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
-import lombok.Builder;
 import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
-import org.apache.parquet.format.converter.ParquetMetadataConverter;
-import org.apache.parquet.hadoop.ParquetFileReader;
-import org.apache.parquet.hadoop.ParquetFileWriter;
-import org.apache.parquet.hadoop.metadata.BlockMetaData;
-import org.apache.parquet.hadoop.metadata.ParquetMetadata;
-import org.apache.parquet.hadoop.util.HadoopInputFile;
-import org.apache.parquet.hadoop.util.HadoopOutputFile;
-import org.apache.parquet.io.InputFile;
-import org.apache.parquet.schema.MessageType;
 
 /**
  * Manages Parquet file operations including reading, writing, and partition discovery and path
@@ -45,281 +36,30 @@
  * fields.
  */
 @Log4j2
-@Builder
 public class ParquetDataManager {
-  private ParquetMetadataExtractor metadataExtractor = ParquetMetadataExtractor.getInstance();
-  private static final long DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024; // 128MB
-  private static final int DEFAULT_PAGE_SIZE = 1 * 1024 * 1024; // 1MB
+  public static final ParquetDataManager INSTANCE = new ParquetDataManager();
 
-  /* Use Parquet API to append to a file */
-
-  // after appending check required before appending the file
-  private static boolean checkIfSchemaIsSame(
-      Configuration conf, Path fileToAppend, Path fileFromTable) {
-    ParquetFileConfig schemaFileAppend = getParquetFileConfig(conf, fileToAppend);
-    ParquetFileConfig schemaFileFromTable = getParquetFileConfig(conf, fileFromTable);
-    return schemaFileAppend.getSchema().equals(schemaFileFromTable.getSchema());
-  }
-
-  private static ParquetFileConfig getParquetFileConfig(Configuration conf, Path fileToAppend) {
-    ParquetFileConfig parquetFileConfig = new ParquetFileConfig(conf, fileToAppend);
-    return parquetFileConfig;
-  }
-  // TODO add safe guards for possible empty parquet files
-  // append a file (merges two files into one .parquet under a partition folder)
-  public static Path appendNewParquetFiles(Path filePath, Path fileToAppend, MessageType schema)
-      throws IOException {
-    Configuration conf = new Configuration();
-    long firstBlockIndex = getParquetFileConfig(conf, filePath).getRowGroupIndex();
-    ParquetMetadata existingFooter = ParquetFileReader.readFooter(conf, filePath);
-    Map<String, String> existingMeta = existingFooter.getFileMetaData().getKeyValueMetaData();
-    Path tempPath = new Path(filePath.getParent(), "." + filePath.getName() + ".tmp");
-    ParquetFileWriter writer =
-        new ParquetFileWriter(
-            HadoopOutputFile.fromPath(tempPath, conf),
-            schema,
-            ParquetFileWriter.Mode.OVERWRITE,
-            DEFAULT_BLOCK_SIZE,
-            0);
-    // write the initial table with the appended file to add into the outputPath
-    writer.start();
-    HadoopInputFile inputFile = HadoopInputFile.fromPath(filePath, conf);
-    InputFile inputFileToAppend = HadoopInputFile.fromPath(fileToAppend, conf);
-    // get the equivalent fileStatus from the file-to-append Path
-    FileSystem fs = FileSystem.get(conf);
-    FileStatus fileStatus = fs.getFileStatus(fileToAppend);
-    if (checkIfSchemaIsSame(conf, fileToAppend, filePath)) {
-      writer.appendFile(inputFile);
-      writer.appendFile(inputFileToAppend);
-    }
-    Map<String, String> combinedMeta = new HashMap<>();
-    if (existingMeta != null) {
-      combinedMeta.putAll(existingMeta);
-    }
-    // track the append date and save it in the footer
-    int appendCount = Integer.parseInt(combinedMeta.getOrDefault("total_appends", "0"));
-
-    // save block indexes and modification time (for later sync related retrieval) in the metadata
-    // of the output
-    // table
-    int currentAppendIdx = appendCount + 1;
-    List<BlockMetaData> initialBlocks = existingFooter.getBlocks();
-    long totalInitialCompressedSize = 0;
-    // first the blocks start at 0
-    for (BlockMetaData block : initialBlocks) {
-      totalInitialCompressedSize += block.getCompressedSize();
-    }
-    long startBlock = 0;
-    if (appendCount > 0) {
-      startBlock = totalInitialCompressedSize;
-    }
-
-    List<BlockMetaData> addedBlocks = ParquetFileReader.readFooter(conf, fileToAppend).getBlocks();
-    long totalAddedCompressedSize = 0;
-    for (BlockMetaData block : addedBlocks) {
-      totalAddedCompressedSize += block.getCompressedSize();
-    }
-    long endBlock = totalInitialCompressedSize + totalAddedCompressedSize;
-
-    combinedMeta.put("total_appends", String.valueOf(currentAppendIdx));
-    combinedMeta.put("index_start_block_of_append_" + currentAppendIdx, String.valueOf(startBlock));
-    combinedMeta.put("index_end_block_of_append_" + currentAppendIdx, String.valueOf(endBlock));
-    combinedMeta.put(
-        "append_date_" + currentAppendIdx, String.valueOf(fileStatus.getModificationTime()));
-    writer.end(combinedMeta);
-    fs.delete(filePath, false);
-    fs.rename(tempPath, filePath);
-    return filePath;
-  }
-  // TODO add safe guards for possible empty parquet files
-  // selective compaction of parquet blocks
-  public static List<Path> formNewTargetFiles(
-      Configuration conf, Path partitionPath, long targetModifTime) throws IOException {
-    List<Path> finalPaths = new ArrayList<>();
-    FileSystem fs = partitionPath.getFileSystem(conf);
-
-    FileStatus[] statuses =
-        fs.listStatus(partitionPath, path -> path.getName().endsWith(".parquet"));
-
-    for (FileStatus status : statuses) {
-
-      Path filePath = status.getPath();
-      ParquetMetadata bigFileFooter =
-          ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER);
-      MessageType schema = bigFileFooter.getFileMetaData().getSchema();
-      int totalAppends =
-          Integer.parseInt(
-              bigFileFooter
-                  .getFileMetaData()
-                  .getKeyValueMetaData()
-                  .getOrDefault("total_appends", "0"));
-      for (int i = 1; i <= totalAppends; i++) {
-        int startBlock =
-            Integer.valueOf(
-                bigFileFooter
-                    .getFileMetaData()
-                    .getKeyValueMetaData()
-                    .get("index_start_block_of_append_" + i));
-        int endBlock =
-            Integer.valueOf(
-                bigFileFooter
-                    .getFileMetaData()
-                    .getKeyValueMetaData()
-                    .get("index_end_block_of_append_" + i));
-        long modifTime =
-            Long.parseLong(
-                bigFileFooter.getFileMetaData().getKeyValueMetaData().get("append_date_" + i));
-        String newFileName =
-            String.format(
-                "%s_block%d_%d.parquet", status.getPath().getName(), startBlock, endBlock);
-        Path targetSyncFilePath = new Path(status.getPath().getParent(), newFileName);
-        try (ParquetFileWriter writer = new ParquetFileWriter(conf, schema, targetSyncFilePath)) {
-          boolean wasDataAppended = false;
-          writer.start();
-          for (int j = 0; j < bigFileFooter.getBlocks().size(); j++) {
-            BlockMetaData blockMetadata = bigFileFooter.getBlocks().get(j);
-            long currentBlockStart = blockMetadata.getStartingPos();
-            long currentBlockEnd = currentBlockStart + blockMetadata.getCompressedSize();
-            if (currentBlockStart >= startBlock
-                && currentBlockEnd <= endBlock
-                && modifTime >= targetModifTime) {
-              List<BlockMetaData> blockList =
-                  Collections.<BlockMetaData>singletonList(blockMetadata);
-              FSDataInputStream targetInputStream = fs.open(status.getPath());
-              writer.appendRowGroups(targetInputStream, blockList, false);
-              wasDataAppended = true;
-            }
-          }
-          // manually restore the modifTime after using appendRowGroups()
-          writer.end(new HashMap<>());
-          if (wasDataAppended) {
-            fs.setTimes(targetSyncFilePath, modifTime, -1);
-            finalPaths.add(targetSyncFilePath);
-          } else {
-            fs.delete(targetSyncFilePath, false);
-          }
-        }
-      }
-    }
-    return finalPaths;
-  }
-
-  public static Path appendPartitionedData(Path targetPath, Path sourcePath, MessageType schema)
-      throws IOException {
-    Configuration conf = new Configuration();
-    FileSystem fs = sourcePath.getFileSystem(conf);
-
-    List<Path> targetFiles = collectParquetFiles(fs, targetPath);
-
-    List<Path> sourceFiles = collectParquetFiles(fs, sourcePath);
-
-    if (targetFiles.isEmpty()) {
-      throw new IOException("Target directory contains no parquet files to append to.");
-    }
-    if (sourceFiles.isEmpty()) {
-      return targetPath;
-    }
-    Path masterTargetFile = null;
-    for (Path p : targetFiles) {
-      ParquetMetadata metadata = ParquetFileReader.readFooter(conf, p);
-      if (!metadata.getBlocks().isEmpty()) {
-        masterTargetFile = p;
-        break;
-      }
-    }
-
-    if (masterTargetFile == null) {
-      throw new IOException("Target directory contains no files with valid data blocks.");
-    }
-
-    for (int i = 1; i < targetFiles.size(); i++) {
-      long len = fs.getFileStatus(targetFiles.get(i)).getLen();
-      System.out.println("DEBUG: Attempting to append " + targetFiles.get(i) + " Size: " + len);
-      appendNewParquetFiles(masterTargetFile, targetFiles.get(i), schema);
-    }
-    for (Path sourceFile : sourceFiles) {
-      ParquetMetadata sourceFooter = ParquetFileReader.readFooter(conf, sourceFile);
-      if (sourceFooter.getBlocks().isEmpty()) {
-        System.out.println("SKIPPING: " + sourceFile + " has no data.");
-        continue;
-      }
-      long len = fs.getFileStatus(sourceFile).getLen();
-      long len_2 = fs.getFileStatus(masterTargetFile).getLen();
-      System.out.println("DEBUG: Attempting to append " + sourceFile + " Size: " + len);
-      System.out.println("DEBUG: Attempting to append " + masterTargetFile + " Size: " + len_2);
-      appendNewParquetFiles(masterTargetFile, sourceFile, schema);
-    }
-
-    return masterTargetFile;
-  }
-
-  private static List<Path> collectParquetFiles(FileSystem fs, Path root) throws IOException {
-    List<Path> parquetFiles = new ArrayList<>();
-    Configuration conf = fs.getConf();
-    if (!fs.exists(root)) return parquetFiles;
-
-    if (fs.getFileStatus(root).isDirectory()) {
-      RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
-      while (it.hasNext()) {
-        Path p = it.next().getPath();
-        if (p.getName().endsWith(".parquet")
-            && !p.toString().contains("/.")
-            && !p.toString().contains("/_")) {
-          ParquetMetadata footer = ParquetFileReader.readFooter(conf, p);
-          if (footer.getBlocks() != null && !footer.getBlocks().isEmpty()) {
-            parquetFiles.add(p);
-          }
-        }
-      }
-    } else {
-      parquetFiles.add(root);
-    }
-    return parquetFiles;
+  public static ParquetDataManager getInstance() {
+    return INSTANCE;
   }
 
-  public static List<Path> mergeDatasetsByPartition(
-      Path targetRoot, Path sourceRoot, MessageType schema) throws IOException {
-    List<Path> finalPartitionPaths = new ArrayList<>();
-    Configuration conf = new Configuration();
-    FileSystem fs = targetRoot.getFileSystem(conf);
-
-    Map<String, Path> sourcePartitions = mapPartitions(fs, sourceRoot);
-
-    Map<String, Path> targetPartitions = mapPartitions(fs, targetRoot);
-
-    for (Map.Entry<String, Path> entry : sourcePartitions.entrySet()) {
-      String partitionKey = entry.getKey();
-      Path sourcePartitionPath = entry.getValue();
+  public List<ParquetFileConfig> getParquetFilesMetadataInRange(
+      Configuration conf, Stream<LocatedFileStatus> parquetFiles, long startTime, long endTime) {
 
-      if (targetPartitions.containsKey(partitionKey)) {
-        Path targetPartitionPath = targetPartitions.get(partitionKey);
-
-        finalPartitionPaths.add(
-            appendPartitionedData(targetPartitionPath, sourcePartitionPath, schema));
-      } else {
-        // TODO logic for when a partition exists in source but not in target
-        // simply merge the files in the respective partitions using appendPartitionedData() that
-        // takes only one Path/dir
-
-      }
-    }
-    return finalPartitionPaths;
+    return parquetFiles
+        .filter(
+            file ->
+                file.getModificationTime() >= startTime && file.getModificationTime() <= endTime)
+        .map(file -> new ParquetFileConfig(conf, file.getPath()))
+        .collect(Collectors.toList());
   }
 
-  private static Map<String, Path> mapPartitions(FileSystem fs, Path root) throws IOException {
-    Map<String, Path> partitionMap = new HashMap<>();
-    RemoteIterator<LocatedFileStatus> it = fs.listFiles(root, true);
-
-    while (it.hasNext()) {
-      Path filePath = it.next().getPath();
-      if (filePath.getName().endsWith(".parquet") && !filePath.toString().contains("/_")) {
-        Path parent = filePath.getParent();
-        String relative = parent.toString().replace(root.toString(), "");
-        if (relative.startsWith("/")) relative = relative.substring(1);
+  public List<ParquetFileConfig> getParquetFilesMetadataAfterTime(
+      Configuration conf, Stream<LocatedFileStatus> parquetFiles, long syncTime) {
 
-        partitionMap.put(relative, parent);
-      }
-    }
-    return partitionMap;
+    return parquetFiles
+        .filter(file -> file.getModificationTime() >= syncTime)
+        .map(file -> new ParquetFileConfig(conf, file.getPath()))
+        .collect(Collectors.toList());
   }
 }
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index 33ee08d38..9e7755011 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -18,31 +18,49 @@
  
 package org.apache.xtable.parquet;
 
+import java.io.IOException;
+
 import lombok.AccessLevel;
+import lombok.Builder;
 import lombok.Getter;
 import lombok.experimental.FieldDefaults;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 
 @Getter
 @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
+@Builder
 class ParquetFileConfig {
   MessageType schema;
+  ParquetMetadata metadata;
   long rowGroupIndex;
+  long modifTime;
+  long size;
   CompressionCodecName codec;
+  Path path;
 
   public ParquetFileConfig(Configuration conf, Path file) {
+    long modifTime = -1L;
     ParquetMetadata metadata =
         ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
 
     if (metadata.getBlocks().isEmpty()) {
       throw new IllegalStateException("Parquet file contains no row groups.");
     }
-
+    try {
+      modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    this.path = file;
+    this.modifTime = modifTime;
+    this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum();
+    this.metadata = metadata;
     this.schema = metadata.getFileMetaData().getSchema();
     this.rowGroupIndex = metadata.getBlocks().size();
     this.codec = metadata.getBlocks().get(0).getColumns().get(0).getCodec();
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 83d2c0a91..8d8a16a2d 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,33 +18,9 @@
  
 package org.apache.xtable.parquet;
 
-import static org.junit.jupiter.api.Assertions.assertFalse;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-import java.io.IOException;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.parquet.schema.LogicalTypeAnnotation;
-import org.apache.parquet.schema.MessageType;
-import org.apache.parquet.schema.PrimitiveType;
-import org.apache.parquet.schema.Types;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.RowFactory;
 import org.apache.spark.sql.SparkSession;
-import org.apache.spark.sql.types.DataTypes;
-import org.apache.spark.sql.types.StructField;
-import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
 
 public class ITParquetDataManager {
   private static SparkSession spark;
@@ -54,257 +30,6 @@ public static void setup() {
     spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate();
   }
 
-  @Test
-  @Disabled("")
-  public void testFormParquetFileSinglePartition() throws IOException {
-    Configuration conf = spark.sparkContext().hadoopConfiguration();
-    StructType schema =
-        DataTypes.createStructType(
-            new StructField[] {
-              DataTypes.createStructField("id", DataTypes.IntegerType, false),
-              DataTypes.createStructField("value", DataTypes.StringType, false),
-              DataTypes.createStructField("year", DataTypes.IntegerType, false),
-              DataTypes.createStructField("month", DataTypes.IntegerType, false)
-            });
-
-    List<Row> data =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
-    Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet-partitioned_table_test_0");
-    String outputPath = fixedPath.toString();
-    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
-
-    // test find files to sync
-    long targetModifTime = System.currentTimeMillis() - 360000;
-    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
-    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
-    // set the modification time to the file
-    updateModificationTimeRecursive(fs, hdfsPath, targetModifTime);
-    // create new file to append using Spark
-    List<Row> futureDataToSync =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
-    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
-    dfToSync.write().partitionBy("year", "month").mode("append").parquet(outputPath);
-
-    long newModifTime = System.currentTimeMillis() - 50000;
-
-    List<org.apache.hadoop.fs.Path> resultingFiles =
-        ParquetDataManager.formNewTargetFiles(conf, hdfsPath, newModifTime);
-    // check if resultingFiles contains the append data only (through the partition names)
-    for (org.apache.hadoop.fs.Path p : resultingFiles) {
-      String pathString = p.toString();
-
-      boolean isNewData = pathString.contains("year=2026");
-
-      long modTime = fs.getFileStatus(p).getModificationTime();
-      // test for one partition value
-      assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
-      assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-    }
-  }
-
-  @Test
-  @Disabled("")
-  public void testAppendParquetFileSinglePartition() throws IOException {
-    Configuration conf = spark.sparkContext().hadoopConfiguration();
-    // In testAppendParquetFileSinglePartition
-    MessageType schemaParquet =
-        Types.buildMessage()
-            .required(PrimitiveType.PrimitiveTypeName.INT32)
-            .named("id")
-            .required(PrimitiveType.PrimitiveTypeName.BINARY)
-            .as(LogicalTypeAnnotation.stringType())
-            .named("value")
-            .named("parquet_schema");
-    StructType schema =
-        DataTypes.createStructType(
-            new StructField[] {
-              DataTypes.createStructField("id", DataTypes.IntegerType, false),
-              DataTypes.createStructField("value", DataTypes.StringType, false),
-              DataTypes.createStructField("year", DataTypes.IntegerType, false),
-              DataTypes.createStructField("month", DataTypes.IntegerType, false)
-            });
-    List<Row> data =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(102, "B", 2026, 12));
-
-    Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_1");
-    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_1");
-    String outputPath = fixedPath.toString();
-    String finalAppendFilePath = appendFilePath.toString();
-
-    df.write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
-
-    // test find files to sync
-    long targetModifTime = System.currentTimeMillis() - 360000;
-    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
-    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
-    // set the modification time to the table file
-    updateModificationTimeRecursive(fs, hdfsPath, targetModifTime);
-    // create new file to append using Spark
-    List<Row> futureDataToSync =
-        Arrays.asList(RowFactory.create(101, "A", 2026, 12), RowFactory.create(301, "D", 2026, 12));
-    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
-    dfToSync
-        .coalesce(1)
-        .write()
-        .partitionBy("year", "month")
-        .mode("overwrite")
-        .parquet(finalAppendFilePath);
-    long newModifTime = System.currentTimeMillis() - 50000;
-    // update modifTime for file to append
-    updateModificationTimeRecursive(
-        fs, new org.apache.hadoop.fs.Path(finalAppendFilePath), newModifTime);
-    // recursively append all files under a specific partition path (e.g. "year=2026/month=12")
-    // (assuming that outputPath and finalAppendFilePath are same partitions for different files)
-    org.apache.hadoop.fs.Path outputFile =
-        ParquetDataManager.appendPartitionedData(
-            new org.apache.hadoop.fs.Path(outputPath),
-            new org.apache.hadoop.fs.Path(finalAppendFilePath),
-            schemaParquet);
-    // TODO can create big file which can be split if reaches a certain threshold size
-
-    // test whether the appended data can be selectively filtered for incr sync (e.g. get me the
-    // data to sync only)
-    List<org.apache.hadoop.fs.Path> resultingFiles =
-        ParquetDataManager.formNewTargetFiles(conf, outputFile, newModifTime);
-    // check if resultingFiles contains the append data only (through the partition names)
-    for (org.apache.hadoop.fs.Path p : resultingFiles) {
-      String pathString = p.toString();
-      boolean isNewData = pathString.contains("year=2026");
-      long modTime = fs.getFileStatus(p).getModificationTime();
-      // test for one partition value
-      assertTrue(modTime > newModifTime, "File discovered was actually old data: " + pathString);
-      assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-    }
-  }
-
-  @Test
-  public void testAppendParquetFileMultiplePartition() throws IOException {
-    Configuration conf = spark.sparkContext().hadoopConfiguration();
-
-    MessageType schemaParquet =
-        Types.buildMessage()
-            .required(PrimitiveType.PrimitiveTypeName.INT32)
-            .named("id")
-            .required(PrimitiveType.PrimitiveTypeName.BINARY)
-            .as(LogicalTypeAnnotation.stringType())
-            .named("value")
-            .named("parquet_schema");
-    StructType schema =
-        DataTypes.createStructType(
-            new StructField[] {
-              DataTypes.createStructField("id", DataTypes.IntegerType, false),
-              DataTypes.createStructField("value", DataTypes.StringType, false),
-              DataTypes.createStructField("year", DataTypes.IntegerType, false),
-              DataTypes.createStructField("month", DataTypes.IntegerType, false)
-            });
-    List<Row> data =
-        Arrays.asList(
-            RowFactory.create(100, "A", 2026, 12),
-            RowFactory.create(101, "AA", 2026, 12),
-            RowFactory.create(102, "CB", 2027, 11),
-            RowFactory.create(103, "BA", 2027, 11));
-
-    Dataset<Row> df = spark.createDataFrame(data, schema);
-    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2");
-    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2");
-    String outputPath = fixedPath.toString();
-    String finalAppendFilePath = appendFilePath.toString();
-
-    df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
-
-    // test find files to sync
-    long targetModifTime = System.currentTimeMillis() - 360000;
-    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
-    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
-    // set the modification time to the table file
-    // update modifTime for file to append
-    // many partitions case
-    List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11");
-    for (String partition : newPartitions) {
-      org.apache.hadoop.fs.Path partitionPath =
-          new org.apache.hadoop.fs.Path(outputPath, partition);
-      if (fs.exists(partitionPath)) {
-        updateModificationTimeRecursive(fs, partitionPath, targetModifTime);
-      }
-    }
-    // create new file to append using Spark
-    List<Row> futureDataToSync =
-        Arrays.asList(
-            RowFactory.create(101, "A", 2026, 12),
-            RowFactory.create(301, "D", 2027, 11),
-            RowFactory.create(302, "DA", 2027, 11));
-    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
-    dfToSync
-        .coalesce(1) // since we are exec this test on low data volume
-        .write()
-        .partitionBy("year", "month")
-        .mode("overwrite")
-        .parquet(finalAppendFilePath);
-    long newModifTime = System.currentTimeMillis() - 50000;
-    for (String partition : newPartitions) {
-      org.apache.hadoop.fs.Path partitionPath =
-          new org.apache.hadoop.fs.Path(finalAppendFilePath, partition);
-      if (fs.exists(partitionPath)) {
-        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
-      }
-    }
-    // recursively append all files under a specific partition path (e.g. "year=2026/month=12")
-    // (assuming that outputPath and finalAppendFilePath are same partitions for different files)
-    // the result contains all partition paths with the parquet files compacted
-    List<org.apache.hadoop.fs.Path> outputPaths =
-        ParquetDataManager.mergeDatasetsByPartition(
-            new org.apache.hadoop.fs.Path(outputPath),
-            new org.apache.hadoop.fs.Path(finalAppendFilePath),
-            schemaParquet);
-    // reset time FOR TESTING PURPOSES ONLY
-    // (in real, newModifTime time is the appending time as set by the appendRow() in the
-    // DataManager)
-    // THIS IS NEED TO CHECK AGAINST THE TIME FROM WHICH THE LAST APPEND HAPPENED
-    for (org.apache.hadoop.fs.Path oPath : outputPaths) {
-      fs.setTimes(oPath, newModifTime, -1);
-    }
-    // TODO can create big file which can be split if reaches a certain threshold size
-    // implement bin-packing function for large files
-
-    // test whether the appended data can be selectively filtered for incr sync (e.g. get me the
-    // data to sync only)
-    List<List<org.apache.hadoop.fs.Path>> finalPaths = new ArrayList<>();
-    for (org.apache.hadoop.fs.Path resPath : outputPaths) {
-      List<org.apache.hadoop.fs.Path> resultingFiles =
-          ParquetDataManager.formNewTargetFiles(conf, resPath, newModifTime);
-      finalPaths.add(resultingFiles);
-    }
-    // check if resultingFiles contains the append data only
-    for (List<org.apache.hadoop.fs.Path> fPaths : finalPaths) {
-      for (org.apache.hadoop.fs.Path p : fPaths) {
-        String pathString = p.toString();
-        //  should be TRUE
-        boolean isNewData = pathString.contains("year=2026") || pathString.contains("year=2027");
-        // should be FALSE
-        boolean isOldData = pathString.contains("year=2024") || pathString.contains("year=2025");
-        long modTime = fs.getFileStatus(p).getModificationTime();
-        // test for one partition value
-        assertTrue(modTime >= newModifTime, "File discovered was actually old data: " + pathString);
-        assertTrue(isNewData, "Path should belong to appended data: " + pathString);
-        assertFalse(isOldData, "Path should NOT belong to old data: " + pathString);
-      }
-    }
-  }
-
-  private void updateModificationTimeRecursive(
-      FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException {
-    org.apache.hadoop.fs.RemoteIterator<org.apache.hadoop.fs.LocatedFileStatus> it =
-        fs.listFiles(path, true);
-    while (it.hasNext()) {
-      org.apache.hadoop.fs.LocatedFileStatus status = it.next();
-      if (status.getPath().getName().endsWith(".parquet")) {
-        fs.setTimes(status.getPath(), time, -1);
-      }
-    }
-  }
-
   @AfterAll
   public static void tearDown() {
     if (spark != null) {

From ff809d7b6aa294b234ec679f693dc8f058d25898 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 12 Jan 2026 22:36:47 +0100
Subject: [PATCH 40/54] fix CI

---
 .../main/java/org/apache/xtable/parquet/ParquetFileConfig.java  | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index 9e7755011..7a038124a 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 
 import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Getter;
 import lombok.experimental.FieldDefaults;
@@ -35,6 +36,7 @@
 @Getter
 @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
 @Builder
+@AllArgsConstructor
 class ParquetFileConfig {
   MessageType schema;
   ParquetMetadata metadata;

From e06368f9d92a3ac4553352dd01e65e1464c7cc39 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 12 Jan 2026 23:05:15 +0100
Subject: [PATCH 41/54] fix CI, refactoring and isIncrementalSyncSafeFrom
 implementation

---
 .../parquet/ParquetConversionSource.java      | 78 ++++++++++++++-----
 1 file changed, 57 insertions(+), 21 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index 26d758026..935a5c942 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -24,11 +24,11 @@
 import java.nio.file.Paths;
 import java.time.Instant;
 import java.util.*;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import lombok.Builder;
 import lombok.NonNull;
+import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
@@ -52,6 +52,7 @@
 import org.apache.xtable.spi.extractor.ConversionSource;
 
 @Builder
+@Log4j2
 public class ParquetConversionSource implements ConversionSource<Long> {
 
   private static final ParquetSchemaExtractor schemaExtractor =
@@ -129,13 +130,13 @@ public InternalTable getTable(Long modificationTime) {
     return createInternalTableFromFile(file);
   }
 
-  private Stream<InternalDataFile> getInternalDataFiles(Stream<LocatedFileStatus> parquetFiles) {
+  private Stream<InternalDataFile> getInternalDataFiles(Stream<ParquetFileConfig> parquetFiles) {
     return parquetFiles.map(
         file ->
             InternalDataFile.builder()
                 .physicalPath(file.getPath().toString())
                 .fileFormat(FileFormat.APACHE_PARQUET)
-                .fileSizeBytes(file.getLen())
+                .fileSizeBytes(file.getSize())
                 .partitionValues(
                     partitionValueExtractor.extractPartitionValues(
                         partitionSpecExtractor.spec(
@@ -144,14 +145,14 @@ private Stream<InternalDataFile> getInternalDataFiles(Stream<LocatedFileStatus>
                                     hadoopConf, file.getPath()),
                                 file.getPath().toString())),
                         HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath())))
-                .lastModified(file.getModificationTime())
+                .lastModified(file.getModifTime())
                 .columnStats(
                     parquetStatsExtractor.getColumnStatsForaFile(
                         parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath())))
                 .build());
   }
 
-  private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parquetFile) {
+  private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) {
     return InternalDataFile.builder()
         .physicalPath(parquetFile.getPath().toString())
         .partitionValues(
@@ -162,8 +163,8 @@ private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parque
                             hadoopConf, parquetFile.getPath()),
                         parquetFile.getPath().toString())),
                 basePath))
-        .lastModified(parquetFile.getModificationTime())
-        .fileSizeBytes(parquetFile.getLen())
+        .lastModified(parquetFile.getModifTime())
+        .fileSizeBytes(parquetFile.getSize())
         .columnStats(
             parquetStatsExtractor.getColumnStatsForaFile(
                 parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath())))
@@ -182,16 +183,15 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
     Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
     Set<InternalDataFile> addedInternalDataFiles = new HashSet<>();
 
-    List<FileStatus> tableChangesAfter =
-        parquetFiles
-            .filter(fileStatus -> fileStatus.getModificationTime() > modificationTime)
-            .collect(Collectors.toList());
+    List<ParquetFileConfig> filesMetadata =
+        parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
+            hadoopConf, parquetFiles, modificationTime);
     List<ParquetFileConfig> tableChangesAfterMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
             hadoopConf, parquetFiles, modificationTime);
     InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream());
-    for (FileStatus tableStatus : tableChangesAfter) {
-      InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus);
+    for (ParquetFileConfig fileMetadata : filesMetadata) {
+      InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata);
       addedInternalDataFiles.add(currentDataFile);
     }
 
@@ -219,16 +219,18 @@ public InternalTable getCurrentTable() {
    */
   @Override
   public InternalSnapshot getCurrentSnapshot() {
-    // to avoid consume the stream call the method twice to return the same stream of parquet files
     Stream<InternalDataFile> internalDataFiles =
-        getInternalDataFiles(getParquetFiles(hadoopConf, basePath));
-    Stream<ParquetFileConfig> parquetFilesMetadata =
-        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf);
-    InternalTable table = getMostRecentTable(parquetFilesMetadata);
+        getInternalDataFiles(
+            getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf));
+    InternalTable table =
+        getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf));
     return InternalSnapshot.builder()
         .table(table)
         .sourceIdentifier(
-            getCommitIdentifier(getMostRecentParquetFile(parquetFilesMetadata).getModifTime()))
+            getCommitIdentifier(
+                getMostRecentParquetFile(
+                        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf))
+                    .getModifTime()))
         .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles))
         .build();
   }
@@ -262,8 +264,42 @@ private Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, Stri
   }
 
   @Override
-  public boolean isIncrementalSyncSafeFrom(Instant instant) {
-    return false;
+  public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) {
+    Stream<ParquetFileConfig> parquetFilesMetadata =
+        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf);
+    LongSummaryStatistics stats =
+        parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics();
+
+    if (stats.getCount() == 0) {
+      log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName);
+      return false;
+    }
+
+    long earliestModTime = stats.getMin();
+    long latestModTime = stats.getMax();
+
+    if (timeInMillis.toEpochMilli() > latestModTime) {
+      log.warn(
+          "Instant {} is in the future relative to the data. Latest file time: {}",
+          timeInMillis.toEpochMilli(),
+          Instant.ofEpochMilli(latestModTime));
+      return false;
+    }
+
+    if (earliestModTime > timeInMillis.toEpochMilli()) {
+      log.warn(
+          "Incremental sync is not safe. Earliest available metadata (time={}) is newer "
+              + "than requested instant {}. Data history has been truncated.",
+          Instant.ofEpochMilli(earliestModTime),
+          timeInMillis.toEpochMilli());
+      return false;
+    }
+
+    log.info(
+        "Incremental sync is safe from instant {} for table {}",
+        timeInMillis.toEpochMilli(),
+        tableName);
+    return true;
   }
 
   @Override

From 7cdccd01e9012b6af74ce6b88d171555d784d80a Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 12 Jan 2026 23:44:36 +0100
Subject: [PATCH 42/54] added test

---
 .../xtable/parquet/ITParquetDataManager.java  | 114 ++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 8d8a16a2d..db17958ac 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,18 +18,132 @@
  
 package org.apache.xtable.parquet;
 
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
 import org.junit.jupiter.api.AfterAll;
 import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+import org.apache.xtable.model.InternalTable;
+import org.apache.xtable.model.storage.TableFormat;
 
 public class ITParquetDataManager {
   private static SparkSession spark;
+  private ParquetConversionSource conversionSource;
 
   @BeforeAll
   public static void setup() {
     spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate();
   }
 
+  @Test
+  public void testAppendParquetFileMultiplePartition() throws IOException {
+
+    Configuration conf = spark.sparkContext().hadoopConfiguration();
+
+    StructType schema =
+        DataTypes.createStructType(
+            new StructField[] {
+              DataTypes.createStructField("id", DataTypes.IntegerType, false),
+              DataTypes.createStructField("value", DataTypes.StringType, false),
+              DataTypes.createStructField("year", DataTypes.IntegerType, false),
+              DataTypes.createStructField("month", DataTypes.IntegerType, false)
+            });
+    List<Row> data =
+        Arrays.asList(
+            RowFactory.create(100, "A", 2026, 12),
+            RowFactory.create(101, "AA", 2026, 12),
+            RowFactory.create(102, "CB", 2027, 11),
+            RowFactory.create(103, "BA", 2027, 11));
+
+    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2");
+    Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2");
+    String outputPath = fixedPath.toString();
+    String finalAppendFilePath = appendFilePath.toString();
+
+    df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
+
+    // test find files to sync
+    long targetModifTime = System.currentTimeMillis() - 360000;
+    org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
+    FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
+    // set the modification time to the table file
+    // update modifTime for file to append
+    // many partitions case
+    List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11");
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath =
+          new org.apache.hadoop.fs.Path(outputPath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, targetModifTime);
+      }
+    }
+    // create new file to append using Spark
+    List<Row> futureDataToSync =
+        Arrays.asList(
+            RowFactory.create(101, "A", 2026, 12),
+            RowFactory.create(301, "D", 2027, 11),
+            RowFactory.create(302, "DA", 2027, 11));
+    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
+    dfToSync
+        .coalesce(1)
+        .write()
+        .partitionBy("year", "month")
+        .mode("overwrite")
+        .parquet(finalAppendFilePath);
+    long newModifTime = System.currentTimeMillis() - 50000;
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath =
+          new org.apache.hadoop.fs.Path(finalAppendFilePath, partition);
+      if (fs.exists(partitionPath)) {
+        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
+      }
+    }
+    Dataset<Row> dfWithNewTimes = spark.read().parquet(finalAppendFilePath);
+    dfWithNewTimes
+        .coalesce(1)
+        .write()
+        .partitionBy("year", "month")
+        .mode("append")
+        .parquet(outputPath);
+    fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true);
+    // conversionSource operations
+    InternalTable result = conversionSource.getTable(newModifTime);
+    assertEquals(newModifTime, result.getLatestCommitTime());
+    assertNotNull(result);
+    assertEquals("parquet_table_test_2", result.getName());
+    assertEquals(TableFormat.PARQUET, result.getTableFormat());
+    assertNotNull(result.getReadSchema());
+  }
+
+  private void updateModificationTimeRecursive(
+      FileSystem fs, org.apache.hadoop.fs.Path path, long time) throws IOException {
+    org.apache.hadoop.fs.RemoteIterator<org.apache.hadoop.fs.LocatedFileStatus> it =
+        fs.listFiles(path, true);
+    while (it.hasNext()) {
+      org.apache.hadoop.fs.LocatedFileStatus status = it.next();
+      if (status.getPath().getName().endsWith(".parquet")) {
+        fs.setTimes(status.getPath(), time, -1);
+      }
+    }
+  }
+
   @AfterAll
   public static void tearDown() {
     if (spark != null) {

From b919146226e12768522e27ae4c0c3e1af43cbe35 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 12 Jan 2026 23:52:08 +0100
Subject: [PATCH 43/54] added test: init conversionSource instance

---
 .../org/apache/xtable/parquet/ITParquetDataManager.java   | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index db17958ac..30aa2f6e2 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -124,6 +124,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         .parquet(outputPath);
     fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true);
     // conversionSource operations
+    conversionSource =
+        ParquetConversionSource.builder()
+            .tableName("parquet_table_test_2")
+            .basePath(fixedPath.toString())
+            .hadoopConf(new Configuration()) // Required due to @NonNull
+            .partitionValueExtractor(null)
+            .partitionSpecExtractor(null)
+            .build();
     InternalTable result = conversionSource.getTable(newModifTime);
     assertEquals(newModifTime, result.getLatestCommitTime());
     assertNotNull(result);

From 8ff5aa63d6fb7aaefe6fa8888ea78afa2defe60c Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 00:36:34 +0100
Subject: [PATCH 44/54] added test: init conversionSource instance: fix error

---
 .../java/org/apache/xtable/parquet/ITParquetDataManager.java  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 30aa2f6e2..192e7ab3b 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -127,8 +127,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     conversionSource =
         ParquetConversionSource.builder()
             .tableName("parquet_table_test_2")
-            .basePath(fixedPath.toString())
-            .hadoopConf(new Configuration()) // Required due to @NonNull
+            .basePath(fixedPath.toAbsolutePath().toUri().toString())
+            .hadoopConf(new Configuration())
             .partitionValueExtractor(null)
             .partitionSpecExtractor(null)
             .build();

From db9d7aba0816c8d472b98344818f6526f7683b85 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 02:44:29 +0100
Subject: [PATCH 45/54] added test: fix error

---
 .../parquet/ParquetConversionSource.java      | 13 ++--
 .../xtable/parquet/ITParquetDataManager.java  | 67 ++++++++++++++++---
 2 files changed, 60 insertions(+), 20 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index 935a5c942..7cc66dcea 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -159,15 +159,11 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig
             partitionValueExtractor.extractPartitionValues(
                 partitionSpecExtractor.spec(
                     partitionValueExtractor.extractSchemaForParquetPartitions(
-                        parquetMetadataExtractor.readParquetMetadata(
-                            hadoopConf, parquetFile.getPath()),
-                        parquetFile.getPath().toString())),
+                        parquetFile.getMetadata(), parquetFile.getPath().toString())),
                 basePath))
         .lastModified(parquetFile.getModifTime())
         .fileSizeBytes(parquetFile.getSize())
-        .columnStats(
-            parquetStatsExtractor.getColumnStatsForaFile(
-                parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath())))
+        .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata()))
         .build();
   }
 
@@ -180,15 +176,14 @@ public CommitsBacklog<Long> getCommitsBacklog(InstantsForIncrementalSync syncIns
 
   @Override
   public TableChange getTableChangeForCommit(Long modificationTime) {
-    Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
     Set<InternalDataFile> addedInternalDataFiles = new HashSet<>();
 
     List<ParquetFileConfig> filesMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf, parquetFiles, modificationTime);
+            hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime);
     List<ParquetFileConfig> tableChangesAfterMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf, parquetFiles, modificationTime);
+            hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime);
     InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream());
     for (ParquetFileConfig fileMetadata : filesMetadata) {
       InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata);
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 192e7ab3b..15bccd721 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -23,11 +23,15 @@
 import java.io.IOException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+import java.time.Instant;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Properties;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.RowFactory;
@@ -39,16 +43,24 @@
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
+import org.apache.xtable.conversion.SourceTable;
+import org.apache.xtable.model.InternalSnapshot;
 import org.apache.xtable.model.InternalTable;
 import org.apache.xtable.model.storage.TableFormat;
 
 public class ITParquetDataManager {
   private static SparkSession spark;
-  private ParquetConversionSource conversionSource;
+  private static ParquetConversionSourceProvider conversionSourceProvider;
+  public static final String PARTITION_FIELD_SPEC_CONFIG =
+      "xtable.parquet.source.partition_field_spec_config";
 
   @BeforeAll
   public static void setup() {
     spark = SparkSession.builder().appName("ParquetTest").master("local[*]").getOrCreate();
+    conversionSourceProvider = new ParquetConversionSourceProvider();
+    Configuration hadoopConf = new Configuration();
+    hadoopConf.set("fs.defaultFS", "file:///");
+    conversionSourceProvider.init(hadoopConf);
   }
 
   @Test
@@ -80,13 +92,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
     // test find files to sync
-    long targetModifTime = System.currentTimeMillis() - 360000;
+
     org.apache.hadoop.fs.Path hdfsPath = new org.apache.hadoop.fs.Path(outputPath);
     FileSystem fs = FileSystem.get(hdfsPath.toUri(), conf);
     // set the modification time to the table file
     // update modifTime for file to append
     // many partitions case
     List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11");
+    long targetModifTime = System.currentTimeMillis() - 360000;
     for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath =
           new org.apache.hadoop.fs.Path(outputPath, partition);
@@ -107,14 +120,14 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         .partitionBy("year", "month")
         .mode("overwrite")
         .parquet(finalAppendFilePath);
-    long newModifTime = System.currentTimeMillis() - 50000;
+    /*long newModifTime = System.currentTimeMillis() - 50000;
     for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath =
           new org.apache.hadoop.fs.Path(finalAppendFilePath, partition);
       if (fs.exists(partitionPath)) {
         updateModificationTimeRecursive(fs, partitionPath, newModifTime);
       }
-    }
+    }*/
     Dataset<Row> dfWithNewTimes = spark.read().parquet(finalAppendFilePath);
     dfWithNewTimes
         .coalesce(1)
@@ -124,20 +137,52 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         .parquet(outputPath);
     fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true);
     // conversionSource operations
-    conversionSource =
-        ParquetConversionSource.builder()
-            .tableName("parquet_table_test_2")
+    Properties sourceProperties = new Properties();
+    String partitionConfig = "id:MONTH:year=yyyy/month=MM";
+    sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig);
+    SourceTable tableConfig =
+        SourceTable.builder()
+            .name("parquet_table_test_2")
             .basePath(fixedPath.toAbsolutePath().toUri().toString())
-            .hadoopConf(new Configuration())
-            .partitionValueExtractor(null)
-            .partitionSpecExtractor(null)
+            .additionalProperties(sourceProperties)
+            .formatName(TableFormat.PARQUET)
             .build();
+
+    ParquetConversionSource conversionSource =
+        conversionSourceProvider.getConversionSourceInstance(tableConfig);
+
+    long newModifTime = System.currentTimeMillis() - 50000;
+
+    for (String partition : newPartitions) {
+      org.apache.hadoop.fs.Path partitionPath =
+          new org.apache.hadoop.fs.Path(outputPath, partition);
+
+      RemoteIterator<LocatedFileStatus> it = fs.listFiles(partitionPath, false);
+      while (it.hasNext()) {
+        LocatedFileStatus fileStatus = it.next();
+
+        if (fileStatus.getModificationTime() > newModifTime) {
+          fs.setTimes(fileStatus.getPath(), newModifTime, -1);
+        } else {
+
+          fs.setTimes(fileStatus.getPath(), targetModifTime, -1);
+        }
+      }
+
+      fs.setTimes(partitionPath, newModifTime, -1);
+    }
+
     InternalTable result = conversionSource.getTable(newModifTime);
-    assertEquals(newModifTime, result.getLatestCommitTime());
+    assertEquals(
+        Instant.ofEpochMilli(newModifTime).toString(), result.getLatestCommitTime().toString());
     assertNotNull(result);
     assertEquals("parquet_table_test_2", result.getName());
     assertEquals(TableFormat.PARQUET, result.getTableFormat());
     assertNotNull(result.getReadSchema());
+    // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime);
+    InternalSnapshot snapshot = conversionSource.getCurrentSnapshot();
+    // assertNotNull(changes);
+    assertNotNull(snapshot);
   }
 
   private void updateModificationTimeRecursive(

From 324d703207ea0c89615711ccfd92fbd85cfab39c Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 12:43:59 +0100
Subject: [PATCH 46/54] more tests + bug fixes and reformatting

---
 .../org/apache/xtable/parquet/ParquetConversionSource.java | 7 ++++++-
 .../org/apache/xtable/parquet/ITParquetDataManager.java    | 5 +++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index 7cc66dcea..24b86cac5 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -160,7 +160,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig
                 partitionSpecExtractor.spec(
                     partitionValueExtractor.extractSchemaForParquetPartitions(
                         parquetFile.getMetadata(), parquetFile.getPath().toString())),
-                basePath))
+                HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath())))
         .lastModified(parquetFile.getModifTime())
         .fileSizeBytes(parquetFile.getSize())
         .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata()))
@@ -191,6 +191,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
     }
 
     return TableChange.builder()
+        .sourceIdentifier(
+            getCommitIdentifier(
+                getMostRecentParquetFile(
+                        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf))
+                    .getModifTime()))
         .tableAsOfChange(internalTable)
         .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build())
         .build();
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 15bccd721..90f28808f 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -46,6 +46,7 @@
 import org.apache.xtable.conversion.SourceTable;
 import org.apache.xtable.model.InternalSnapshot;
 import org.apache.xtable.model.InternalTable;
+import org.apache.xtable.model.TableChange;
 import org.apache.xtable.model.storage.TableFormat;
 
 public class ITParquetDataManager {
@@ -179,10 +180,10 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     assertEquals("parquet_table_test_2", result.getName());
     assertEquals(TableFormat.PARQUET, result.getTableFormat());
     assertNotNull(result.getReadSchema());
-    // TableChange changes = conversionSource.getTableChangeForCommit(newModifTime);
     InternalSnapshot snapshot = conversionSource.getCurrentSnapshot();
-    // assertNotNull(changes);
     assertNotNull(snapshot);
+    TableChange changes = conversionSource.getTableChangeForCommit(newModifTime);
+    assertNotNull(changes);
   }
 
   private void updateModificationTimeRecursive(

From 3c453e69355f4ad0c0d2f487d45ca912fd620f54 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 13:17:28 +0100
Subject: [PATCH 47/54] CI fix

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 855894ec2..be4c56dcb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -61,7 +61,7 @@
         <project.version>0.2.0-SNAPSHOT</project.version>
         <!-- To ensure build content is re-producible -->
         <project.build.outputTimestamp>2025-01-01T00:00:00Z</project.build.outputTimestamp>
-        <maven.compiler.target>8</maven.compiler.target>
+        <maven.compiler.target>17</maven.compiler.target>
         <maven-compiler-plugin.version>3.13.0</maven-compiler-plugin.version>
         <avro.version>1.11.4</avro.version>
         <log4j.version>2.22.0</log4j.version>

From e4c0b4cc2470d5c425193a85c87d95c552e170ad Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 13:20:42 +0100
Subject: [PATCH 48/54] CI fix

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index be4c56dcb..855894ec2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -61,7 +61,7 @@
         <project.version>0.2.0-SNAPSHOT</project.version>
         <!-- To ensure build content is re-producible -->
         <project.build.outputTimestamp>2025-01-01T00:00:00Z</project.build.outputTimestamp>
-        <maven.compiler.target>17</maven.compiler.target>
+        <maven.compiler.target>8</maven.compiler.target>
         <maven-compiler-plugin.version>3.13.0</maven-compiler-plugin.version>
         <avro.version>1.11.4</avro.version>
         <log4j.version>2.22.0</log4j.version>

From 4315282bb23b0caec4aac5b508f631065c75bd47 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 15:53:08 +0100
Subject: [PATCH 49/54] CI fix

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 855894ec2..ba5205b65 100644
--- a/pom.xml
+++ b/pom.xml
@@ -104,7 +104,7 @@
         <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
         <quarkus.platform.artifact-id>quarkus-bom</quarkus.platform.artifact-id>
         <quarkus.platform.group-id>io.quarkus.platform</quarkus.platform.group-id>
-        <quarkus.platform.version>3.2.12.Final</quarkus.platform.version> <!-- compatible with Java 11 -->
+        <quarkus.platform.version>3.2.9.Final</quarkus.platform.version> <!-- compatible with Java 11 3.2.12.Final-->
         <antlr4.version>4.9.3</antlr4.version>
         <jol.core.version>0.16</jol.core.version>
         

From 14179291503de7ad51b48657f4b8c0023c054a34 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 16:18:49 +0100
Subject: [PATCH 50/54] CI fix

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index ba5205b65..739ac5cb8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -104,7 +104,7 @@
         <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
         <quarkus.platform.artifact-id>quarkus-bom</quarkus.platform.artifact-id>
         <quarkus.platform.group-id>io.quarkus.platform</quarkus.platform.group-id>
-        <quarkus.platform.version>3.2.9.Final</quarkus.platform.version> <!-- compatible with Java 11 3.2.12.Final-->
+        <quarkus.platform.version>2.16.12.Final</quarkus.platform.version> <!-- compatible with Java 11 3.2.12.Final-->
         <antlr4.version>4.9.3</antlr4.version>
         <jol.core.version>0.16</jol.core.version>
         

From 7620c013a93c00da6ac9f783eaa62c61b8bd271d Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Tue, 13 Jan 2026 16:46:16 +0100
Subject: [PATCH 51/54] CI fix

---
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index 739ac5cb8..855894ec2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -104,7 +104,7 @@
         <scala-collection-compat.version>2.12.0</scala-collection-compat.version>
         <quarkus.platform.artifact-id>quarkus-bom</quarkus.platform.artifact-id>
         <quarkus.platform.group-id>io.quarkus.platform</quarkus.platform.group-id>
-        <quarkus.platform.version>2.16.12.Final</quarkus.platform.version> <!-- compatible with Java 11 3.2.12.Final-->
+        <quarkus.platform.version>3.2.12.Final</quarkus.platform.version> <!-- compatible with Java 11 -->
         <antlr4.version>4.9.3</antlr4.version>
         <jol.core.version>0.16</jol.core.version>
         

From cd66151d9e1fa9368c385e42c747bb2f4ef75fea Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sat, 17 Jan 2026 00:35:00 +0100
Subject: [PATCH 52/54] CI fix

---
 .../parquet/ParquetConversionSource.java      | 114 ++++++------------
 .../xtable/parquet/ParquetDataManager.java    |  61 ++++++++++
 .../xtable/parquet/ITParquetDataManager.java  |  10 +-
 3 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index 24b86cac5..ad5422e0a 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -18,10 +18,6 @@
  
 package org.apache.xtable.parquet;
 
-import java.io.IOException;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.nio.file.Paths;
 import java.time.Instant;
 import java.util.*;
 import java.util.stream.Stream;
@@ -33,11 +29,9 @@
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.functional.RemoteIterators;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 
-import org.apache.xtable.exception.ReadException;
 import org.apache.xtable.hudi.*;
 import org.apache.xtable.hudi.HudiPathUtils;
 import org.apache.xtable.model.*;
@@ -96,37 +90,15 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile)
         .build();
   }
 
-  public Stream<ParquetFileConfig> getConfigsFromStream(
-      Stream<LocatedFileStatus> fileStream, Configuration conf) {
-
-    return fileStream.map(
-        fileStatus -> {
-          Path path = fileStatus.getPath();
-
-          ParquetMetadata metadata =
-              ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
-
-          return ParquetFileConfig.builder()
-              .schema(metadata.getFileMetaData().getSchema())
-              .metadata(metadata)
-              .path(path)
-              .size(fileStatus.getLen())
-              .modifTime(fileStatus.getModificationTime())
-              .rowGroupIndex(0L)
-              .codec(
-                  metadata.getBlocks().isEmpty()
-                      ? null
-                      : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
-              .build();
-        });
-  }
-
   @Override
   public InternalTable getTable(Long modificationTime) {
     // get parquetFile at specific time modificationTime
-    Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
-    Stream<ParquetFileConfig> parquetFilesMetadata = getConfigsFromStream(parquetFiles, hadoopConf);
-    ParquetFileConfig file = getParquetFileAt(parquetFilesMetadata, modificationTime);
+    Stream<LocatedFileStatus> parquetFiles =
+        parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
+    Stream<ParquetFileConfig> parquetFilesMetadata =
+        parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf);
+    ParquetFileConfig file =
+        parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime);
     return createInternalTableFromFile(file);
   }
 
@@ -180,10 +152,14 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
 
     List<ParquetFileConfig> filesMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime);
+            hadoopConf,
+            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
+            modificationTime);
     List<ParquetFileConfig> tableChangesAfterMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf, getParquetFiles(hadoopConf, basePath), modificationTime);
+            hadoopConf,
+            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
+            modificationTime);
     InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream());
     for (ParquetFileConfig fileMetadata : filesMetadata) {
       InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata);
@@ -193,8 +169,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
     return TableChange.builder()
         .sourceIdentifier(
             getCommitIdentifier(
-                getMostRecentParquetFile(
-                        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf))
+                parquetDataManagerExtractor
+                    .getMostRecentParquetFile(
+                        parquetDataManagerExtractor.getConfigsFromStream(
+                            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
+                            hadoopConf))
                     .getModifTime()))
         .tableAsOfChange(internalTable)
         .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build())
@@ -202,71 +181,48 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
   }
 
   private InternalTable getMostRecentTable(Stream<ParquetFileConfig> parquetFiles) {
-    ParquetFileConfig latestFile = getMostRecentParquetFile(parquetFiles);
+    ParquetFileConfig latestFile =
+        parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles);
     return createInternalTableFromFile(latestFile);
   }
 
   @Override
   public InternalTable getCurrentTable() {
-    Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
-    return getMostRecentTable(getConfigsFromStream(parquetFiles, hadoopConf));
+    Stream<LocatedFileStatus> parquetFiles =
+        parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
+    return getMostRecentTable(
+        parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf));
   }
 
-  /**
-   * get current snapshot
-   *
-   * @return
-   */
   @Override
   public InternalSnapshot getCurrentSnapshot() {
     Stream<InternalDataFile> internalDataFiles =
         getInternalDataFiles(
-            getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf));
+            parquetDataManagerExtractor.getConfigsFromStream(
+                parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf));
     InternalTable table =
-        getMostRecentTable(getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf));
+        getMostRecentTable(
+            parquetDataManagerExtractor.getConfigsFromStream(
+                parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf));
     return InternalSnapshot.builder()
         .table(table)
         .sourceIdentifier(
             getCommitIdentifier(
-                getMostRecentParquetFile(
-                        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf))
+                parquetDataManagerExtractor
+                    .getMostRecentParquetFile(
+                        parquetDataManagerExtractor.getConfigsFromStream(
+                            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
+                            hadoopConf))
                     .getModifTime()))
         .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles))
         .build();
   }
 
-  private ParquetFileConfig getMostRecentParquetFile(Stream<ParquetFileConfig> parquetFiles) {
-    return parquetFiles
-        .max(Comparator.comparing(ParquetFileConfig::getModifTime))
-        .orElseThrow(() -> new IllegalStateException("No files found"));
-  }
-
-  private ParquetFileConfig getParquetFileAt(
-      Stream<ParquetFileConfig> parquetConfigs, long targetTime) {
-
-    return parquetConfigs
-        .filter(config -> config.getModifTime() >= targetTime)
-        .findFirst()
-        .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime));
-  }
-
-  private Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
-    try {
-      FileSystem fs = FileSystem.get(hadoopConf);
-      URI uriBasePath = new URI(basePath);
-      String parentPath = Paths.get(uriBasePath).toString();
-      RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(parentPath), true);
-      return RemoteIterators.toList(iterator).stream()
-          .filter(file -> file.getPath().getName().endsWith("parquet"));
-    } catch (IOException | URISyntaxException e) {
-      throw new ReadException("Unable to read files from file system", e);
-    }
-  }
-
   @Override
   public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) {
     Stream<ParquetFileConfig> parquetFilesMetadata =
-        getConfigsFromStream(getParquetFiles(hadoopConf, basePath), hadoopConf);
+        parquetDataManagerExtractor.getConfigsFromStream(
+            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf);
     LongSummaryStatistics stats =
         parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics();
 
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index 5d09352ea..a018e037c 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -18,6 +18,10 @@
  
 package org.apache.xtable.parquet;
 
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Paths;
 import java.util.*;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
@@ -26,6 +30,10 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.*;
+import org.apache.hadoop.util.functional.RemoteIterators;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+
+import org.apache.xtable.exception.ReadException;
 
 /**
  * Manages Parquet file operations including reading, writing, and partition discovery and path
@@ -43,6 +51,59 @@ public static ParquetDataManager getInstance() {
     return INSTANCE;
   }
 
+  public ParquetFileConfig getMostRecentParquetFile(Stream<ParquetFileConfig> parquetFiles) {
+    return parquetFiles
+        .max(Comparator.comparing(ParquetFileConfig::getModifTime))
+        .orElseThrow(() -> new IllegalStateException("No files found"));
+  }
+
+  public ParquetFileConfig getParquetFileAt(
+      Stream<ParquetFileConfig> parquetConfigs, long targetTime) {
+
+    return parquetConfigs
+        .filter(config -> config.getModifTime() >= targetTime)
+        .findFirst()
+        .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime));
+  }
+
+  public Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
+    try {
+      FileSystem fs = FileSystem.get(hadoopConf);
+      URI uriBasePath = new URI(basePath);
+      String parentPath = Paths.get(uriBasePath).toString();
+      RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(parentPath), true);
+      return RemoteIterators.toList(iterator).stream()
+          .filter(file -> file.getPath().getName().endsWith("parquet"));
+    } catch (IOException | URISyntaxException e) {
+      throw new ReadException("Unable to read files from file system", e);
+    }
+  }
+
+  public Stream<ParquetFileConfig> getConfigsFromStream(
+      Stream<LocatedFileStatus> fileStream, Configuration conf) {
+
+    return fileStream.map(
+        fileStatus -> {
+          Path path = fileStatus.getPath();
+
+          ParquetMetadata metadata =
+              ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
+
+          return ParquetFileConfig.builder()
+              .schema(metadata.getFileMetaData().getSchema())
+              .metadata(metadata)
+              .path(path)
+              .size(fileStatus.getLen())
+              .modifTime(fileStatus.getModificationTime())
+              .rowGroupIndex(metadata.getBlocks().size())
+              .codec(
+                  metadata.getBlocks().isEmpty()
+                      ? null
+                      : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
+              .build();
+        });
+  }
+
   public List<ParquetFileConfig> getParquetFilesMetadataInRange(
       Configuration conf, Stream<LocatedFileStatus> parquetFiles, long startTime, long endTime) {
 
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 90f28808f..3aed202e3 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -121,14 +121,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
         .partitionBy("year", "month")
         .mode("overwrite")
         .parquet(finalAppendFilePath);
-    /*long newModifTime = System.currentTimeMillis() - 50000;
-    for (String partition : newPartitions) {
-      org.apache.hadoop.fs.Path partitionPath =
-          new org.apache.hadoop.fs.Path(finalAppendFilePath, partition);
-      if (fs.exists(partitionPath)) {
-        updateModificationTimeRecursive(fs, partitionPath, newModifTime);
-      }
-    }*/
+
     Dataset<Row> dfWithNewTimes = spark.read().parquet(finalAppendFilePath);
     dfWithNewTimes
         .coalesce(1)
@@ -139,6 +132,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true);
     // conversionSource operations
     Properties sourceProperties = new Properties();
+    // TODO use timestamp col instead (as done in ITParquetConversionSource)
     String partitionConfig = "id:MONTH:year=yyyy/month=MM";
     sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig);
     SourceTable tableConfig =

From 4e4c5cba21ae6396f9626f27fe60139d7313815d Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Sun, 18 Jan 2026 21:24:12 +0100
Subject: [PATCH 53/54] merge ready fixes & tests + refactoring & reformatting

---
 .../xtable/parquet/ParquetDataManager.java    |  8 +++-----
 .../xtable/parquet/ITParquetDataManager.java  | 19 +++++++++++++------
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index a018e037c..ff59f156b 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -36,12 +36,10 @@
 import org.apache.xtable.exception.ReadException;
 
 /**
- * Manages Parquet file operations including reading, writing, and partition discovery and path
- * construction.
+ * Manages Parquet File's Metadata
  *
- * <p>This class provides functions to handle Parquet metadata, validate schemas during appends, and
- * calculate target partition directories based on file modification times and defined partition
- * fields.
+ * <p>This class provides functions to handle Parquet metadata, creating metadata objects from
+ * parquet files and filtering the files based on the modification times.
  */
 @Log4j2
 public class ParquetDataManager {
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index 3aed202e3..bb32a5024 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -18,6 +18,7 @@
  
 package org.apache.xtable.parquet;
 
+import static org.apache.spark.sql.functions.expr;
 import static org.junit.jupiter.api.Assertions.*;
 
 import java.io.IOException;
@@ -84,12 +85,12 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
             RowFactory.create(102, "CB", 2027, 11),
             RowFactory.create(103, "BA", 2027, 11));
 
-    Dataset<Row> df = spark.createDataFrame(data, schema);
+    Dataset<Row> dfInit = spark.createDataFrame(data, schema);
     Path fixedPath = Paths.get("target", "fixed-parquet-data", "parquet_table_test_2");
     Path appendFilePath = Paths.get("target", "fixed-parquet-data", "parquet_file_test_2");
     String outputPath = fixedPath.toString();
     String finalAppendFilePath = appendFilePath.toString();
-
+    Dataset<Row> df = dfInit.withColumn("full_date", expr("make_date(year, month, 1)"));
     df.coalesce(1).write().partitionBy("year", "month").mode("overwrite").parquet(outputPath);
 
     // test find files to sync
@@ -101,6 +102,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     // many partitions case
     List<String> newPartitions = Arrays.asList("year=2026/month=12", "year=2027/month=11");
     long targetModifTime = System.currentTimeMillis() - 360000;
+    long newModifTime = System.currentTimeMillis() - 50000;
+    long testTime = System.currentTimeMillis() - 90000; // between two prev times
     for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath =
           new org.apache.hadoop.fs.Path(outputPath, partition);
@@ -114,7 +117,8 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
             RowFactory.create(101, "A", 2026, 12),
             RowFactory.create(301, "D", 2027, 11),
             RowFactory.create(302, "DA", 2027, 11));
-    Dataset<Row> dfToSync = spark.createDataFrame(futureDataToSync, schema);
+    Dataset<Row> dfToSyncInit = spark.createDataFrame(futureDataToSync, schema);
+    Dataset<Row> dfToSync = dfToSyncInit.withColumn("full_date", expr("make_date(year, month, 1)"));
     dfToSync
         .coalesce(1)
         .write()
@@ -132,8 +136,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     fs.delete(new org.apache.hadoop.fs.Path(finalAppendFilePath), true);
     // conversionSource operations
     Properties sourceProperties = new Properties();
-    // TODO use timestamp col instead (as done in ITParquetConversionSource)
-    String partitionConfig = "id:MONTH:year=yyyy/month=MM";
+    String partitionConfig = "full_date:MONTH:year=yyyy/month=MM";
     sourceProperties.put(PARTITION_FIELD_SPEC_CONFIG, partitionConfig);
     SourceTable tableConfig =
         SourceTable.builder()
@@ -146,7 +149,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     ParquetConversionSource conversionSource =
         conversionSourceProvider.getConversionSourceInstance(tableConfig);
 
-    long newModifTime = System.currentTimeMillis() - 50000;
+    // long newModifTime = System.currentTimeMillis() - 50000;
 
     for (String partition : newPartitions) {
       org.apache.hadoop.fs.Path partitionPath =
@@ -178,6 +181,10 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     assertNotNull(snapshot);
     TableChange changes = conversionSource.getTableChangeForCommit(newModifTime);
     assertNotNull(changes);
+    Instant instantBeforeFirstSnapshot =
+        Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli());
+    assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime);
+    assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime)));
   }
 
   private void updateModificationTimeRecursive(

From 212e50a941ae18b6cb7823d4675f1a0c79fa0c96 Mon Sep 17 00:00:00 2001
From: Selim Soufargi <ssoufargi.idealab.unical@gmail.com~>
Date: Mon, 19 Jan 2026 21:44:53 +0100
Subject: [PATCH 54/54] optimization: read metadata once for target file, read
 parquet files once, reformatting

---
 .../parquet/ParquetConversionSource.java      | 111 +++++++++---------
 .../xtable/parquet/ParquetDataManager.java    | 100 ++++++++++------
 .../xtable/parquet/ParquetFileConfig.java     |  25 ++--
 .../xtable/parquet/ITParquetDataManager.java  |   2 +-
 4 files changed, 134 insertions(+), 104 deletions(-)

diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
index ad5422e0a..10eccdf40 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetConversionSource.java
@@ -68,6 +68,16 @@ public class ParquetConversionSource implements ConversionSource<Long> {
   private final String basePath;
   @NonNull private final Configuration hadoopConf;
 
+  private List<LocatedFileStatus> parquetFiles;
+
+  // helper method to ensure files are loaded once
+  private List<LocatedFileStatus> getOrLoadParquetFiles() {
+    if (this.parquetFiles == null) {
+      this.parquetFiles = parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
+    }
+    return this.parquetFiles;
+  }
+
   private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) {
     ParquetMetadata parquetMetadata =
         parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath());
@@ -86,42 +96,38 @@ private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile)
         .layoutStrategy(dataLayoutStrategy)
         .partitioningFields(partitionFields)
         .readSchema(schema)
-        .latestCommitTime(Instant.ofEpochMilli(latestFile.getModifTime()))
+        .latestCommitTime(Instant.ofEpochMilli(latestFile.getModificationTime()))
         .build();
   }
 
   @Override
   public InternalTable getTable(Long modificationTime) {
     // get parquetFile at specific time modificationTime
-    Stream<LocatedFileStatus> parquetFiles =
-        parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
-    Stream<ParquetFileConfig> parquetFilesMetadata =
-        parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf);
-    ParquetFileConfig file =
-        parquetDataManagerExtractor.getParquetFileAt(parquetFilesMetadata, modificationTime);
+    LocatedFileStatus parquetFile =
+        parquetDataManagerExtractor.getParquetDataFileAt(getOrLoadParquetFiles(), modificationTime);
+    ParquetFileConfig file = parquetDataManagerExtractor.getConfigFromFile(parquetFile, hadoopConf);
     return createInternalTableFromFile(file);
   }
 
   private Stream<InternalDataFile> getInternalDataFiles(Stream<ParquetFileConfig> parquetFiles) {
     return parquetFiles.map(
-        file ->
-            InternalDataFile.builder()
-                .physicalPath(file.getPath().toString())
-                .fileFormat(FileFormat.APACHE_PARQUET)
-                .fileSizeBytes(file.getSize())
-                .partitionValues(
-                    partitionValueExtractor.extractPartitionValues(
-                        partitionSpecExtractor.spec(
-                            partitionValueExtractor.extractSchemaForParquetPartitions(
-                                parquetMetadataExtractor.readParquetMetadata(
-                                    hadoopConf, file.getPath()),
-                                file.getPath().toString())),
-                        HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath())))
-                .lastModified(file.getModifTime())
-                .columnStats(
-                    parquetStatsExtractor.getColumnStatsForaFile(
-                        parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath())))
-                .build());
+        file -> {
+          ParquetMetadata metadata =
+              parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath());
+          return InternalDataFile.builder()
+              .physicalPath(file.getPath().toString())
+              .fileFormat(FileFormat.APACHE_PARQUET)
+              .fileSizeBytes(file.getSize())
+              .partitionValues(
+                  partitionValueExtractor.extractPartitionValues(
+                      partitionSpecExtractor.spec(
+                          partitionValueExtractor.extractSchemaForParquetPartitions(
+                              metadata, file.getPath().toString())),
+                      HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath())))
+              .lastModified(file.getModificationTime())
+              .columnStats(parquetStatsExtractor.getColumnStatsForaFile(metadata))
+              .build();
+        });
   }
 
   private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) {
@@ -133,7 +139,7 @@ private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig
                     partitionValueExtractor.extractSchemaForParquetPartitions(
                         parquetFile.getMetadata(), parquetFile.getPath().toString())),
                 HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath())))
-        .lastModified(parquetFile.getModifTime())
+        .lastModified(parquetFile.getModificationTime())
         .fileSizeBytes(parquetFile.getSize())
         .columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata()))
         .build();
@@ -152,15 +158,11 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
 
     List<ParquetFileConfig> filesMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf,
-            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
-            modificationTime);
+            hadoopConf, getOrLoadParquetFiles(), modificationTime);
     List<ParquetFileConfig> tableChangesAfterMetadata =
         parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
-            hadoopConf,
-            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
-            modificationTime);
-    InternalTable internalTable = getMostRecentTable(tableChangesAfterMetadata.stream());
+            hadoopConf, getOrLoadParquetFiles(), modificationTime);
+    InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterMetadata.stream());
     for (ParquetFileConfig fileMetadata : filesMetadata) {
       InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata);
       addedInternalDataFiles.add(currentDataFile);
@@ -170,50 +172,44 @@ public TableChange getTableChangeForCommit(Long modificationTime) {
         .sourceIdentifier(
             getCommitIdentifier(
                 parquetDataManagerExtractor
-                    .getMostRecentParquetFile(
-                        parquetDataManagerExtractor.getConfigsFromStream(
-                            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
-                            hadoopConf))
-                    .getModifTime()))
+                    .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf)
+                    .getModificationTime()))
         .tableAsOfChange(internalTable)
         .filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build())
         .build();
   }
 
-  private InternalTable getMostRecentTable(Stream<ParquetFileConfig> parquetFiles) {
+  private InternalTable getMostRecentTable(
+      List<LocatedFileStatus> parquetFiles, Configuration conf) {
+    ParquetFileConfig latestFile =
+        parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles, conf);
+    return createInternalTableFromFile(latestFile);
+  }
+
+  private InternalTable getMostRecentTableConfig(Stream<ParquetFileConfig> parquetFiles) {
     ParquetFileConfig latestFile =
-        parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles);
+        parquetDataManagerExtractor.getMostRecentParquetFileConfig(parquetFiles);
     return createInternalTableFromFile(latestFile);
   }
 
   @Override
   public InternalTable getCurrentTable() {
-    Stream<LocatedFileStatus> parquetFiles =
-        parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
-    return getMostRecentTable(
-        parquetDataManagerExtractor.getConfigsFromStream(parquetFiles, hadoopConf));
+    return getMostRecentTable(getOrLoadParquetFiles(), hadoopConf);
   }
 
   @Override
   public InternalSnapshot getCurrentSnapshot() {
     Stream<InternalDataFile> internalDataFiles =
         getInternalDataFiles(
-            parquetDataManagerExtractor.getConfigsFromStream(
-                parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf));
-    InternalTable table =
-        getMostRecentTable(
-            parquetDataManagerExtractor.getConfigsFromStream(
-                parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf));
+            parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf));
+    InternalTable table = getMostRecentTable(getOrLoadParquetFiles(), hadoopConf);
     return InternalSnapshot.builder()
         .table(table)
         .sourceIdentifier(
             getCommitIdentifier(
                 parquetDataManagerExtractor
-                    .getMostRecentParquetFile(
-                        parquetDataManagerExtractor.getConfigsFromStream(
-                            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath),
-                            hadoopConf))
-                    .getModifTime()))
+                    .getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf)
+                    .getModificationTime()))
         .partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles))
         .build();
   }
@@ -221,10 +217,9 @@ public InternalSnapshot getCurrentSnapshot() {
   @Override
   public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) {
     Stream<ParquetFileConfig> parquetFilesMetadata =
-        parquetDataManagerExtractor.getConfigsFromStream(
-            parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath), hadoopConf);
+        parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf);
     LongSummaryStatistics stats =
-        parquetFilesMetadata.mapToLong(ParquetFileConfig::getModifTime).summaryStatistics();
+        parquetFilesMetadata.mapToLong(ParquetFileConfig::getModificationTime).summaryStatistics();
 
     if (stats.getCount() == 0) {
       log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName);
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
index ff59f156b..c6372c198 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetDataManager.java
@@ -49,57 +49,89 @@ public static ParquetDataManager getInstance() {
     return INSTANCE;
   }
 
-  public ParquetFileConfig getMostRecentParquetFile(Stream<ParquetFileConfig> parquetFiles) {
+  public ParquetFileConfig getMostRecentParquetFile(
+      List<LocatedFileStatus> parquetFiles, Configuration conf) {
+    LocatedFileStatus file =
+        parquetFiles.stream()
+            .max(Comparator.comparing(LocatedFileStatus::getModificationTime))
+            .orElseThrow(() -> new IllegalStateException("No files found"));
+    return getConfigFromFile(file, conf);
+  }
+
+  public ParquetFileConfig getMostRecentParquetFileConfig(Stream<ParquetFileConfig> parquetFiles) {
     return parquetFiles
-        .max(Comparator.comparing(ParquetFileConfig::getModifTime))
+        .max(Comparator.comparing(ParquetFileConfig::getModificationTime))
         .orElseThrow(() -> new IllegalStateException("No files found"));
   }
 
-  public ParquetFileConfig getParquetFileAt(
-      Stream<ParquetFileConfig> parquetConfigs, long targetTime) {
+  public LocatedFileStatus getParquetDataFileAt(
+      List<LocatedFileStatus> parquetFiles, long targetTime) {
 
-    return parquetConfigs
-        .filter(config -> config.getModifTime() >= targetTime)
+    return parquetFiles.stream()
+        .filter(file -> file.getModificationTime() >= targetTime)
         .findFirst()
         .orElseThrow(() -> new IllegalStateException("No file found at or after " + targetTime));
   }
 
-  public Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
+  public ParquetFileConfig getConfigFromFile(LocatedFileStatus file, Configuration conf) {
+
+    Path path = file.getPath();
+
+    ParquetMetadata metadata =
+        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
+
+    return ParquetFileConfig.builder()
+        .schema(metadata.getFileMetaData().getSchema())
+        .metadata(metadata)
+        .path(path)
+        .size(file.getLen())
+        .modificationTime(file.getModificationTime())
+        .rowGroupIndex(metadata.getBlocks().size())
+        .codec(
+            metadata.getBlocks().isEmpty()
+                ? null
+                : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
+        .build();
+  }
+
+  public List<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
     try {
       FileSystem fs = FileSystem.get(hadoopConf);
       URI uriBasePath = new URI(basePath);
       String parentPath = Paths.get(uriBasePath).toString();
       RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(parentPath), true);
       return RemoteIterators.toList(iterator).stream()
-          .filter(file -> file.getPath().getName().endsWith("parquet"));
+          .filter(file -> file.getPath().getName().endsWith("parquet"))
+          .collect(Collectors.toList());
     } catch (IOException | URISyntaxException e) {
       throw new ReadException("Unable to read files from file system", e);
     }
   }
 
   public Stream<ParquetFileConfig> getConfigsFromStream(
-      Stream<LocatedFileStatus> fileStream, Configuration conf) {
-
-    return fileStream.map(
-        fileStatus -> {
-          Path path = fileStatus.getPath();
-
-          ParquetMetadata metadata =
-              ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
-
-          return ParquetFileConfig.builder()
-              .schema(metadata.getFileMetaData().getSchema())
-              .metadata(metadata)
-              .path(path)
-              .size(fileStatus.getLen())
-              .modifTime(fileStatus.getModificationTime())
-              .rowGroupIndex(metadata.getBlocks().size())
-              .codec(
-                  metadata.getBlocks().isEmpty()
-                      ? null
-                      : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
-              .build();
-        });
+      List<LocatedFileStatus> fileStream, Configuration conf) {
+
+    return fileStream.stream()
+        .map(
+            fileStatus -> {
+              Path path = fileStatus.getPath();
+
+              ParquetMetadata metadata =
+                  ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, path);
+
+              return ParquetFileConfig.builder()
+                  .schema(metadata.getFileMetaData().getSchema())
+                  .metadata(metadata)
+                  .path(path)
+                  .size(fileStatus.getLen())
+                  .modificationTime(fileStatus.getModificationTime())
+                  .rowGroupIndex(metadata.getBlocks().size())
+                  .codec(
+                      metadata.getBlocks().isEmpty()
+                          ? null
+                          : metadata.getBlocks().get(0).getColumns().get(0).getCodec())
+                  .build();
+            });
   }
 
   public List<ParquetFileConfig> getParquetFilesMetadataInRange(
@@ -109,16 +141,16 @@ public List<ParquetFileConfig> getParquetFilesMetadataInRange(
         .filter(
             file ->
                 file.getModificationTime() >= startTime && file.getModificationTime() <= endTime)
-        .map(file -> new ParquetFileConfig(conf, file.getPath()))
+        .map(file -> new ParquetFileConfig(conf, file))
         .collect(Collectors.toList());
   }
 
   public List<ParquetFileConfig> getParquetFilesMetadataAfterTime(
-      Configuration conf, Stream<LocatedFileStatus> parquetFiles, long syncTime) {
+      Configuration conf, List<LocatedFileStatus> parquetFiles, long syncTime) {
 
-    return parquetFiles
+    return parquetFiles.stream()
         .filter(file -> file.getModificationTime() >= syncTime)
-        .map(file -> new ParquetFileConfig(conf, file.getPath()))
+        .map(file -> new ParquetFileConfig(conf, file))
         .collect(Collectors.toList());
   }
 }
diff --git a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
index 7a038124a..ec40a79f5 100644
--- a/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
+++ b/xtable-core/src/main/java/org/apache/xtable/parquet/ParquetFileConfig.java
@@ -18,21 +18,24 @@
  
 package org.apache.xtable.parquet;
 
-import java.io.IOException;
-
 import lombok.AccessLevel;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.Getter;
 import lombok.experimental.FieldDefaults;
+import lombok.extern.log4j.Log4j2;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.metadata.BlockMetaData;
 import org.apache.parquet.hadoop.metadata.CompressionCodecName;
 import org.apache.parquet.hadoop.metadata.ParquetMetadata;
 import org.apache.parquet.schema.MessageType;
 
+import org.apache.xtable.exception.ReadException;
+
+@Log4j2
 @Getter
 @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
 @Builder
@@ -41,26 +44,26 @@ class ParquetFileConfig {
   MessageType schema;
   ParquetMetadata metadata;
   long rowGroupIndex;
-  long modifTime;
+  long modificationTime;
   long size;
   CompressionCodecName codec;
   Path path;
 
-  public ParquetFileConfig(Configuration conf, Path file) {
-    long modifTime = -1L;
+  public ParquetFileConfig(Configuration conf, FileStatus file) {
+    long modificationTime = -1L;
     ParquetMetadata metadata =
-        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file);
+        ParquetMetadataExtractor.getInstance().readParquetMetadata(conf, file.getPath());
 
     if (metadata.getBlocks().isEmpty()) {
       throw new IllegalStateException("Parquet file contains no row groups.");
     }
     try {
-      modifTime = file.getFileSystem(conf).getFileStatus(file).getModificationTime();
-    } catch (IOException e) {
-      e.printStackTrace();
+      modificationTime = file.getModificationTime();
+    } catch (ReadException e) {
+      log.warn("File reading error: " + e.getMessage());
     }
-    this.path = file;
-    this.modifTime = modifTime;
+    this.path = file.getPath();
+    this.modificationTime = modificationTime;
     this.size = metadata.getBlocks().stream().mapToLong(BlockMetaData::getTotalByteSize).sum();
     this.metadata = metadata;
     this.schema = metadata.getFileMetaData().getSchema();
diff --git a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
index bb32a5024..08e11ed93 100644
--- a/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
+++ b/xtable-core/src/test/java/org/apache/xtable/parquet/ITParquetDataManager.java
@@ -183,7 +183,7 @@ public void testAppendParquetFileMultiplePartition() throws IOException {
     assertNotNull(changes);
     Instant instantBeforeFirstSnapshot =
         Instant.ofEpochMilli(snapshot.getTable().getLatestCommitTime().toEpochMilli());
-    assertTrue(instantBeforeFirstSnapshot.toEpochMilli() == newModifTime);
+    assertEquals(instantBeforeFirstSnapshot.toEpochMilli(), newModifTime);
     assertTrue(conversionSource.isIncrementalSyncSafeFrom(Instant.ofEpochMilli(testTime)));
   }