From 843c702bfe8070c7c68f558666a602bc2c70b469 Mon Sep 17 00:00:00 2001
From: Leader254
Date: Wed, 11 Feb 2026 15:26:28 +0300
Subject: [PATCH 1/2] feat: Implement Dataframe drop duplicates for sccpp

---
 README.md           |  5 +++++
 src/dataframe.cpp   | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 src/dataframe.h     | 21 +++++++++++++++++++++
 tests/dataframe.cpp | 25 ++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 17db104..6ac48bb 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,11 @@ ctest -R test_dataframe_writer --test-args --gtest_filter=SparkIntegrationTest.P
 # Run Test Suite directly
 # --------------------------------
 ./test_
+
+# --------------------------------
+# Run Single Test Case directly - show output
+# --------------------------------
+./test_dataframe --gtest_filter=SparkIntegrationTest.DropDuplicates
 ```
 
 ### Mem Checks (Valgrind)
diff --git a/src/dataframe.cpp b/src/dataframe.cpp
index 1e3979d..421e2a5 100644
--- a/src/dataframe.cpp
+++ b/src/dataframe.cpp
@@ -689,4 +689,50 @@ DataFrameWriter DataFrame::write()
     config.user_id = user_id_;
 
     return DataFrameWriter(stub_, plan_.root(), config);
+}
+
+// Deduplicates on every column by delegating to the subset overload with an empty list.
+DataFrame DataFrame::dropDuplicates()
+{
+    return dropDuplicates({});
+}
+
+// Builds a new plan whose root is a Deduplicate relation fed by this DataFrame's plan.
+DataFrame DataFrame::dropDuplicates(const std::vector<std::string> &subset)
+{
+    spark::connect::Plan new_plan;
+
+    auto *relation = new_plan.mutable_root()->mutable_deduplicate();
+
+    if (this->plan_.has_root())
+    {
+        relation->mutable_input()->CopyFrom(this->plan_.root());
+    }
+
+    if (subset.empty())
+    {
+        // No columns named: every column takes part in the duplicate check.
+        relation->set_all_columns_as_keys(true);
+    }
+    else
+    {
+        for (const auto &col_name : subset)
+        {
+            relation->add_column_names(col_name);
+        }
+    }
+
+    return DataFrame(stub_, new_plan, session_id_, user_id_);
+}
+
+// snake_case alias for dropDuplicates().
+DataFrame DataFrame::drop_duplicates()
+{
+    return dropDuplicates();
+}
+
+// snake_case alias for dropDuplicates(subset).
+DataFrame DataFrame::drop_duplicates(const std::vector<std::string> &subset)
+{
+    return dropDuplicates(subset);
 }
\ No newline at end of file
diff --git a/src/dataframe.h
b/src/dataframe.h
index 2f0c78f..56a28f6 100644
--- a/src/dataframe.h
+++ b/src/dataframe.h
@@ -131,6 +131,27 @@ class DataFrame
      */
     DataFrameWriter write();
 
+    /**
+     * @brief Returns a new DataFrame with duplicate rows removed, comparing all columns - equivalent to distinct()
+     */
+    DataFrame dropDuplicates();
+
+    /**
+     * @brief Returns a new DataFrame with duplicate rows removed,
+     * considering only the given subset of columns (all columns when subset is empty).
+     */
+    DataFrame dropDuplicates(const std::vector<std::string>& subset);
+
+    /**
+     * @brief Alias for dropDuplicates().
+     */
+    DataFrame drop_duplicates();
+
+    /**
+     * @brief Alias for dropDuplicates(subset).
+     */
+    DataFrame drop_duplicates(const std::vector<std::string>& subset);
+
 private:
     std::shared_ptr stub_;
     spark::connect::Plan plan_;
diff --git a/tests/dataframe.cpp b/tests/dataframe.cpp
index a4292e8..35b4457 100644
--- a/tests/dataframe.cpp
+++ b/tests/dataframe.cpp
@@ -274,4 +274,27 @@ TEST_F(SparkIntegrationTest, WhereFilter)
 
     auto filtered_df = df.where("age < 25");
     filtered_df.show();
-}
\ No newline at end of file
+}
+
+TEST_F(SparkIntegrationTest, DropDuplicates)
+{
+    // R - raw string literal.
+    auto df = spark->sql(R"(
+        SELECT *
+        FROM VALUES
+            (14, 'Tom'),
+            (14, 'Tom'),
+            (14, 'Alice'),
+            (14, 'Alice'),
+            (14, 'Bob'),
+            (14, 'Bob'),
+            (15, 'Tom')
+        AS people(age, name)
+    )");
+
+    auto deduped = df.dropDuplicates();
+    deduped.show();
+
+    auto subset_deduped = df.dropDuplicates({"age"});
+    subset_deduped.show();
+}
From 0f6530ead83783961fbc30deb79868e119f2d1b1 Mon Sep 17 00:00:00 2001
From: Leader254
Date: Wed, 11 Feb 2026 15:48:02 +0300
Subject: [PATCH 2/2] update test case(dropDuplicates)

---
 tests/dataframe.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/dataframe.cpp b/tests/dataframe.cpp
index 35b4457..7c4ecc1 100644
--- a/tests/dataframe.cpp
+++ b/tests/dataframe.cpp
@@ -288,7 +288,8 @@ TEST_F(SparkIntegrationTest, DropDuplicates)
             (14, 'Alice'),
             (14, 'Bob'),
             (14, 'Bob'),
-            (15, 'Tom')
+            (15, 'Tom'),
+            (15, 'John')
         AS people(age, name)
     )");
 