Changes from all commits
56 commits
e541a71
given a parquet file return data from a certain modification time
Dec 10, 2025
15e282a
create the path based on the partition then inject the file to append…
Dec 13, 2025
2ee71c9
Handle case of path construction with file partitioned over many fiel…
Dec 14, 2025
6032e5f
test append Parquet file into table init
Dec 14, 2025
f6fdc72
add function to test schema equivalence before appending
Dec 15, 2025
a94c3f3
construct path to inject to based on partitions
Dec 16, 2025
f8bdbfe
fix imports
Dec 16, 2025
c04a983
refactoring (lombok, logs, javadocs and function and approach comment…
Dec 17, 2025
5f2541e
use appendFile to append a file into a table while tracking the appen…
Jan 1, 2026
47e7076
find the files that satisfy to the time condition
Jan 1, 2026
fbb09ec
treat appends as separate files to add in the target partition folder
Jan 1, 2026
fe19a60
update approach: selective block compaction
Jan 2, 2026
da7f300
update approach: added a basic test to check data selection using mod…
Jan 2, 2026
a8730b7
fix append based on partition value
Jan 2, 2026
d19ccbf
fix test with basic example where partitions are not considered
Jan 2, 2026
aecb204
fix test with basic example where partitions are not considered2
Jan 2, 2026
0ec8cbb
fix test with basic example where partitions are not considered3
Jan 2, 2026
9cb75df
test with time of last append is now
Jan 3, 2026
9e125f2
test appendFile with Parquet: TODO test with multiple partitions 1) a…
Jan 3, 2026
233ca77
merge recursively one partition files
Jan 3, 2026
b4cba5a
fix paths for files to append
Jan 3, 2026
a564b29
fix bug of appending file path
Jan 3, 2026
d1ceafb
fix bug of schema
Jan 3, 2026
649250f
make sure returned files are non empty when iterating in the partitions
Jan 3, 2026
24ff828
make sure returned files are non empty when iterating in the partitio…
Jan 3, 2026
463d2ee
discard empty files when appending
Jan 3, 2026
71d1c34
discard empty files when appending
Jan 3, 2026
3319a91
one spark session for both tests
Jan 3, 2026
013ffe4
bug fix
Jan 3, 2026
b939a0a
bug fix
Jan 3, 2026
b6d8ddc
cleanups + TODO: partitions match and merge
Jan 3, 2026
c18ab1c
added test for many partitions, TODO integrate functions into Parquet…
Jan 4, 2026
b7c613e
selecting data bug fix, TODO integrate functions into ParquetConversi…
Jan 4, 2026
2a75f49
run all tests, TODO integrate functions into ParquetConversionSource
Jan 4, 2026
f1538b0
run all tests, TODO integrate functions into ParquetConversionSource
Jan 4, 2026
cdedeae
spotless:apply, TODO integrate functions into ParquetConversionSource
Jan 4, 2026
e873120
bug fixes, TODO integrate functions into ParquetConversionSource
Jan 4, 2026
4d0f245
bug fixes, TODO integrate functions into ParquetConversionSource
Jan 5, 2026
219656e
handle lightweight metadata in the conversionSource
Jan 12, 2026
ff809d7
fix CI
Jan 12, 2026
e06368f
fix CI, refactoring and isIncrementalSyncSafeFrom implementation
Jan 12, 2026
7cdccd0
added test
Jan 12, 2026
b919146
added test: init conversionSource instance
Jan 12, 2026
8ff5aa6
added test: init conversionSource instance: fix error
Jan 12, 2026
db9d7ab
added test: fix error
Jan 13, 2026
324d703
more tests + bug fixes and reformatting
Jan 13, 2026
3c453e6
CI fix
Jan 13, 2026
e4c0b4c
CI fix
Jan 13, 2026
4315282
CI fix
Jan 13, 2026
1417929
CI fix
Jan 13, 2026
7620c01
CI fix
Jan 13, 2026
e458d72
Merge branch 'main' of https://github.com/sapienza88/incubator-xtable…
Jan 14, 2026
9947f1b
Merge branch 'main' of https://github.com/sapienza88/incubator-xtable…
Jan 16, 2026
cd66151
CI fix
Jan 16, 2026
4e4c5cb
merge ready fixes & tests + refactoring & reformatting
Jan 18, 2026
212e50a
optimization: read metadata once for target file, read parquet files …
Jan 19, 2026
ParquetConversionSource.java
@@ -18,26 +18,20 @@

package org.apache.xtable.parquet;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.time.Instant;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import lombok.Builder;
import lombok.NonNull;
import lombok.extern.log4j.Log4j2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.functional.RemoteIterators;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

import org.apache.xtable.exception.ReadException;
import org.apache.xtable.hudi.*;
import org.apache.xtable.hudi.HudiPathUtils;
import org.apache.xtable.model.*;
@@ -52,10 +46,15 @@
import org.apache.xtable.spi.extractor.ConversionSource;

@Builder
@Log4j2
public class ParquetConversionSource implements ConversionSource<Long> {

private static final ParquetSchemaExtractor schemaExtractor =
ParquetSchemaExtractor.getInstance();
private static final ParquetMetadataExtractor metadataExtractor =
ParquetMetadataExtractor.getInstance();
private static final ParquetDataManager parquetDataManagerExtractor =
ParquetDataManager.getInstance();

private static final ParquetMetadataExtractor parquetMetadataExtractor =
ParquetMetadataExtractor.getInstance();
@@ -69,7 +68,17 @@ public class ParquetConversionSource implements ConversionSource<Long> {
private final String basePath;
@NonNull private final Configuration hadoopConf;

private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile) {
private List<LocatedFileStatus> parquetFiles;

// helper method to ensure files are loaded once
private List<LocatedFileStatus> getOrLoadParquetFiles() {
if (this.parquetFiles == null) {
this.parquetFiles = parquetDataManagerExtractor.getParquetFiles(hadoopConf, basePath);
}
return this.parquetFiles;
}

private InternalTable createInternalTableFromFile(ParquetFileConfig latestFile) {
ParquetMetadata parquetMetadata =
parquetMetadataExtractor.readParquetMetadata(hadoopConf, latestFile.getPath());
MessageType parquetSchema = parquetMetadataExtractor.getSchema(parquetMetadata);
@@ -94,49 +103,45 @@ private InternalTable createInternalTableFromFile(LocatedFileStatus latestFile)
@Override
public InternalTable getTable(Long modificationTime) {
// get the parquet file at the requested modificationTime
Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
LocatedFileStatus file = getParquetFileAt(parquetFiles, modificationTime);
LocatedFileStatus parquetFile =
parquetDataManagerExtractor.getParquetDataFileAt(getOrLoadParquetFiles(), modificationTime);
ParquetFileConfig file = parquetDataManagerExtractor.getConfigFromFile(parquetFile, hadoopConf);
return createInternalTableFromFile(file);
}

private Stream<InternalDataFile> getInternalDataFiles(Stream<LocatedFileStatus> parquetFiles) {
private Stream<InternalDataFile> getInternalDataFiles(Stream<ParquetFileConfig> parquetFiles) {
return parquetFiles.map(
file ->
InternalDataFile.builder()
.physicalPath(file.getPath().toString())
.fileFormat(FileFormat.APACHE_PARQUET)
.fileSizeBytes(file.getLen())
.partitionValues(
partitionValueExtractor.extractPartitionValues(
partitionSpecExtractor.spec(
partitionValueExtractor.extractSchemaForParquetPartitions(
parquetMetadataExtractor.readParquetMetadata(
hadoopConf, file.getPath()),
file.getPath().toString())),
HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath())))
.lastModified(file.getModificationTime())
.columnStats(
parquetStatsExtractor.getColumnStatsForaFile(
parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath())))
.build());
file -> {
ParquetMetadata metadata =
parquetMetadataExtractor.readParquetMetadata(hadoopConf, file.getPath());
return InternalDataFile.builder()
.physicalPath(file.getPath().toString())
.fileFormat(FileFormat.APACHE_PARQUET)
.fileSizeBytes(file.getSize())
.partitionValues(
partitionValueExtractor.extractPartitionValues(
partitionSpecExtractor.spec(
partitionValueExtractor.extractSchemaForParquetPartitions(
metadata, file.getPath().toString())),
HudiPathUtils.getPartitionPath(new Path(basePath), file.getPath())))
.lastModified(file.getModificationTime())
.columnStats(parquetStatsExtractor.getColumnStatsForaFile(metadata))
.build();
});
}

private InternalDataFile createInternalDataFileFromParquetFile(FileStatus parquetFile) {
private InternalDataFile createInternalDataFileFromParquetFile(ParquetFileConfig parquetFile) {
return InternalDataFile.builder()
.physicalPath(parquetFile.getPath().toString())
.partitionValues(
partitionValueExtractor.extractPartitionValues(
partitionSpecExtractor.spec(
partitionValueExtractor.extractSchemaForParquetPartitions(
parquetMetadataExtractor.readParquetMetadata(
hadoopConf, parquetFile.getPath()),
parquetFile.getPath().toString())),
basePath))
parquetFile.getMetadata(), parquetFile.getPath().toString())),
HudiPathUtils.getPartitionPath(new Path(basePath), parquetFile.getPath())))
.lastModified(parquetFile.getModificationTime())
.fileSizeBytes(parquetFile.getLen())
.columnStats(
parquetStatsExtractor.getColumnStatsForaFile(
parquetMetadataExtractor.readParquetMetadata(hadoopConf, parquetFile.getPath())))
.fileSizeBytes(parquetFile.getSize())
.columnStats(parquetStatsExtractor.getColumnStatsForaFile(parquetFile.getMetadata()))
.build();
}

@@ -149,87 +154,103 @@ public CommitsBacklog<Long> getCommitsBacklog(InstantsForIncrementalSync syncIns

@Override
public TableChange getTableChangeForCommit(Long modificationTime) {
Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
Set<InternalDataFile> addedInternalDataFiles = new HashSet<>();

List<FileStatus> tableChangesAfter =
parquetFiles
.filter(fileStatus -> fileStatus.getModificationTime() > modificationTime)
.collect(Collectors.toList());
InternalTable internalTable = getMostRecentTable(parquetFiles);
for (FileStatus tableStatus : tableChangesAfter) {
InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(tableStatus);
List<ParquetFileConfig> filesMetadata =
parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
hadoopConf, getOrLoadParquetFiles(), modificationTime);
List<ParquetFileConfig> tableChangesAfterMetadata =
parquetDataManagerExtractor.getParquetFilesMetadataAfterTime(
hadoopConf, getOrLoadParquetFiles(), modificationTime);
InternalTable internalTable = getMostRecentTableConfig(tableChangesAfterMetadata.stream());
for (ParquetFileConfig fileMetadata : filesMetadata) {
InternalDataFile currentDataFile = createInternalDataFileFromParquetFile(fileMetadata);
addedInternalDataFiles.add(currentDataFile);
}

return TableChange.builder()
.sourceIdentifier(
getCommitIdentifier(
parquetDataManagerExtractor
.getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf)
.getModificationTime()))
.tableAsOfChange(internalTable)
.filesDiff(InternalFilesDiff.builder().filesAdded(addedInternalDataFiles).build())
.build();
}

private InternalTable getMostRecentTable(Stream<LocatedFileStatus> parquetFiles) {
LocatedFileStatus latestFile = getMostRecentParquetFile(parquetFiles);
private InternalTable getMostRecentTable(
List<LocatedFileStatus> parquetFiles, Configuration conf) {
ParquetFileConfig latestFile =
parquetDataManagerExtractor.getMostRecentParquetFile(parquetFiles, conf);
return createInternalTableFromFile(latestFile);
}

private InternalTable getMostRecentTableConfig(Stream<ParquetFileConfig> parquetFiles) {
ParquetFileConfig latestFile =
parquetDataManagerExtractor.getMostRecentParquetFileConfig(parquetFiles);
return createInternalTableFromFile(latestFile);
}

@Override
public InternalTable getCurrentTable() {
Stream<LocatedFileStatus> parquetFiles = getParquetFiles(hadoopConf, basePath);
return getMostRecentTable(parquetFiles);
return getMostRecentTable(getOrLoadParquetFiles(), hadoopConf);
}

/**
 * Builds the current snapshot of the table from the parquet files under the base path.
 *
 * @return the current snapshot
 */
@Override
public InternalSnapshot getCurrentSnapshot() {
// to avoid consume the stream call the method twice to return the same stream of parquet files
Stream<InternalDataFile> internalDataFiles =
getInternalDataFiles(getParquetFiles(hadoopConf, basePath));
InternalTable table = getMostRecentTable(getParquetFiles(hadoopConf, basePath));
getInternalDataFiles(
parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf));
InternalTable table = getMostRecentTable(getOrLoadParquetFiles(), hadoopConf);
return InternalSnapshot.builder()
.table(table)
.sourceIdentifier(
getCommitIdentifier(
getMostRecentParquetFile(getParquetFiles(hadoopConf, basePath))
parquetDataManagerExtractor
.getMostRecentParquetFile(getOrLoadParquetFiles(), hadoopConf)
.getModificationTime()))
.partitionedDataFiles(PartitionFileGroup.fromFiles(internalDataFiles))
.build();
}

private LocatedFileStatus getMostRecentParquetFile(Stream<LocatedFileStatus> parquetFiles) {
return parquetFiles
.max(Comparator.comparing(FileStatus::getModificationTime))
.orElseThrow(() -> new IllegalStateException("No files found"));
}
@Override
public boolean isIncrementalSyncSafeFrom(Instant timeInMillis) {
Stream<ParquetFileConfig> parquetFilesMetadata =
parquetDataManagerExtractor.getConfigsFromStream(getOrLoadParquetFiles(), hadoopConf);
LongSummaryStatistics stats =
parquetFilesMetadata.mapToLong(ParquetFileConfig::getModificationTime).summaryStatistics();

private LocatedFileStatus getParquetFileAt(
Stream<LocatedFileStatus> parquetFiles, long modificationTime) {
return parquetFiles
.filter(fileStatus -> fileStatus.getModificationTime() == modificationTime)
.findFirst()
.orElseThrow(() -> new IllegalStateException("No file found at " + modificationTime));
}
if (stats.getCount() == 0) {
log.warn("No parquet files found in table {}. Incremental sync is not possible.", tableName);
return false;
}

private Stream<LocatedFileStatus> getParquetFiles(Configuration hadoopConf, String basePath) {
try {
FileSystem fs = FileSystem.get(hadoopConf);
URI uriBasePath = new URI(basePath);
String parentPath = Paths.get(uriBasePath).toString();
RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(parentPath), true);
return RemoteIterators.toList(iterator).stream()
.filter(file -> file.getPath().getName().endsWith("parquet"));
} catch (IOException | URISyntaxException e) {
throw new ReadException("Unable to read files from file system", e);
long earliestModTime = stats.getMin();
long latestModTime = stats.getMax();

if (timeInMillis.toEpochMilli() > latestModTime) {
log.warn(
"Instant {} is in the future relative to the data. Latest file time: {}",
timeInMillis.toEpochMilli(),
Instant.ofEpochMilli(latestModTime));
return false;
}
}

@Override
public boolean isIncrementalSyncSafeFrom(Instant instant) {
return false;
if (earliestModTime > timeInMillis.toEpochMilli()) {
log.warn(
"Incremental sync is not safe. Earliest available metadata (time={}) is newer "
+ "than requested instant {}. Data history has been truncated.",
Instant.ofEpochMilli(earliestModTime),
timeInMillis.toEpochMilli());
return false;
}

log.info(
"Incremental sync is safe from instant {} for table {}",
timeInMillis.toEpochMilli(),
tableName);
return true;
}

@Override
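For context on how the new isIncrementalSyncSafeFrom check above would be exercised, here is a hedged caller-side sketch; the SyncPlanner class, the lastSyncInstant handling, and the org.apache.xtable.model import paths are assumptions for illustration, not part of this diff.

import java.time.Instant;

import org.apache.xtable.model.InternalSnapshot;
import org.apache.xtable.model.TableChange;
import org.apache.xtable.parquet.ParquetConversionSource;

// Hypothetical caller: choose between an incremental table change and a full snapshot
// based on whether the requested instant still falls inside the available file history.
public final class SyncPlanner {
  private SyncPlanner() {}

  public static void planSync(ParquetConversionSource source, Instant lastSyncInstant) {
    if (source.isIncrementalSyncSafeFrom(lastSyncInstant)) {
      // Safe: only files modified after the last sync need to be converted.
      TableChange change = source.getTableChangeForCommit(lastSyncInstant.toEpochMilli());
      // ... apply `change` to the target table formats
    } else {
      // The instant is outside the observable modification-time range: fall back to a snapshot.
      InternalSnapshot snapshot = source.getCurrentSnapshot();
      // ... sync `snapshot` to the target table formats
    }
  }
}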