diff --git a/cpp/include/legate_dataframe/parquet.hpp b/cpp/include/legate_dataframe/parquet.hpp index e3f0b1a..d8a7f9b 100644 --- a/cpp/include/legate_dataframe/parquet.hpp +++ b/cpp/include/legate_dataframe/parquet.hpp @@ -98,6 +98,10 @@ class ParquetReadArray : public Task * ├── part-2.parquet * └── ... * + * This function may create the directory but does not ensure it is empty. + * If a previous write produced more partitions, the old files will remain, + * leaving the directory in an inconsistent state. + * * @param tbl The table to write. * @param path Destination directory for data. */ diff --git a/cpp/src/parquet.cpp b/cpp/src/parquet.cpp index c12f42c..a265f5a 100644 --- a/cpp/src/parquet.cpp +++ b/cpp/src/parquet.cpp @@ -492,9 +492,7 @@ ParquetReadInfo get_parquet_info(const std::vector& file_paths, void parquet_write(LogicalTable& tbl, const std::string& dirpath) { std::filesystem::create_directories(dirpath); - if (!std::filesystem::is_empty(dirpath)) { - throw std::invalid_argument("if path exist, it must be an empty directory"); - } + auto runtime = legate::Runtime::get_runtime(); legate::AutoTask task = runtime->create_task(get_library(), task::ParquetWrite::TASK_CONFIG.task_id()); diff --git a/python/legate_dataframe/lib/parquet.pyx b/python/legate_dataframe/lib/parquet.pyx index 4906e74..9d54f3e 100644 --- a/python/legate_dataframe/lib/parquet.pyx +++ b/python/legate_dataframe/lib/parquet.pyx @@ -68,6 +68,11 @@ def parquet_write(LogicalTable tbl, path: pathlib.Path | str) -> None: ├── part.2.parquet └── ... + .. note:: + This function may create the directory but does not ensure it is empty. + If a previous write produced more partitions, the old files will remain, + leaving the directory in an inconsistent state. + See Also -------- parquet_read: Read parquet data