From 843c702bfe8070c7c68f558666a602bc2c70b469 Mon Sep 17 00:00:00 2001
From: Leader254 <gatheru.samuel@students.kyu.ac.ke>
Date: Wed, 11 Feb 2026 15:26:28 +0300
Subject: [PATCH 1/2] feat: Implement Dataframe drop duplicates for sccpp

---
 README.md           |  5 +++++
 src/dataframe.cpp   | 37 +++++++++++++++++++++++++++++++++++++
 src/dataframe.h     | 21 +++++++++++++++++++++
 tests/dataframe.cpp | 25 ++++++++++++++++++++++++-
 4 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 17db104..6ac48bb 100644
--- a/README.md
+++ b/README.md
@@ -193,6 +193,11 @@ ctest -R test_dataframe_writer --test-args --gtest_filter=SparkIntegrationTest.P
 # Run Test Suite directly
 # --------------------------------
 ./test_<suite_name>
+
+# --------------------------------
+# Run a single test case directly (shows its output)
+# --------------------------------
+./test_dataframe --gtest_filter=SparkIntegrationTest.DropDuplicates
 ```
 
 ### Mem Checks (Valgrind)
diff --git a/src/dataframe.cpp b/src/dataframe.cpp
index 1e3979d..421e2a5 100644
--- a/src/dataframe.cpp
+++ b/src/dataframe.cpp
@@ -689,4 +689,41 @@ DataFrameWriter DataFrame::write()
     config.user_id = user_id_;
 
     return DataFrameWriter(stub_, plan_.root(), config);
+}
+
+DataFrame DataFrame::dropDuplicates() // no-argument form: deduplicate using every column
+{
+    return dropDuplicates({}); // `{}` selects the vector overload with an empty subset (its all-columns branch)
+}
+
+DataFrame DataFrame::dropDuplicates(const std::vector<std::string> &subset) // builds a new plan; this DataFrame's plan_ is only read
+{
+    spark::connect::Plan new_plan; // plan for the returned DataFrame
+
+    auto *relation = new_plan.mutable_root()->mutable_deduplicate(); // the new root is a deduplicate relation
+
+    if (this->plan_.has_root()) // without a root there is nothing to copy, so the input is left unset
+    {
+        relation->mutable_input()->CopyFrom(this->plan_.root()); // current plan becomes the relation's input
+    }
+
+    if (subset.empty()) {
+        relation->set_all_columns_as_keys(true); // empty subset: flag all columns as keys
+    } else {
+        for (const auto &col_name : subset) {
+            relation->add_column_names(col_name); // otherwise list each requested column by name
+        }
+    }
+
+    return DataFrame(stub_, new_plan, session_id_, user_id_); // same stub, session id and user id as this DataFrame
+}
+
+DataFrame DataFrame::drop_duplicates() // snake_case alias
+{
+    return dropDuplicates(); // forwards unchanged to the camelCase no-argument form
+}
+
+DataFrame DataFrame::drop_duplicates(const std::vector<std::string> &subset) // snake_case alias
+{
+    return dropDuplicates(subset); // forwards unchanged to the camelCase subset form
 }
\ No newline at end of file
diff --git a/src/dataframe.h b/src/dataframe.h
index 2f0c78f..56a28f6 100644
--- a/src/dataframe.h
+++ b/src/dataframe.h
@@ -131,6 +131,27 @@ class DataFrame
      */
     DataFrameWriter write();
 
+    /**
+     * @brief Returns a new DataFrame with duplicate rows removed, comparing all columns (equivalent to distinct()).
+     */
+
+    DataFrame dropDuplicates();
+    /**
+     * @brief Returns a new DataFrame with duplicate rows removed, comparing only the columns
+     *          named in @p subset. An empty subset compares all columns.
+     */
+    DataFrame dropDuplicates(const std::vector<std::string>& subset);
+    
+    /**
+     * @brief snake_case alias for dropDuplicates().
+     */
+    DataFrame drop_duplicates();
+
+    /**
+     * @brief snake_case alias for dropDuplicates(subset).
+     */
+    DataFrame drop_duplicates(const std::vector<std::string>& subset);
+
 private:
     std::shared_ptr<spark::connect::SparkConnectService::Stub> stub_;
     spark::connect::Plan plan_;
diff --git a/tests/dataframe.cpp b/tests/dataframe.cpp
index a4292e8..35b4457 100644
--- a/tests/dataframe.cpp
+++ b/tests/dataframe.cpp
@@ -274,4 +274,27 @@ TEST_F(SparkIntegrationTest, WhereFilter)
 
     auto filtered_df = df.where("age < 25");
     filtered_df.show();
-}
\ No newline at end of file
+}
+
+TEST_F(SparkIntegrationTest, DropDuplicates)
+{
+    // Raw string literal keeps the multi-line SQL readable. NOTE(review): results are only printed via show(); nothing is asserted.
+    auto df = spark->sql(R"(
+        SELECT *
+                FROM VALUES
+                    (14, 'Tom'),
+                    (14, 'Tom'),
+                    (14, 'Alice'),
+                    (14, 'Alice'),
+                    (14, 'Bob'),
+                    (14, 'Bob'),
+                    (15, 'Tom')
+                AS people(age, name)
+    )");
+
+    auto deduped = df.dropDuplicates(); // no subset: exercises the all-columns path
+    deduped.show();
+
+    auto subset_deduped = df.dropDuplicates({"age"}); // subset path, keyed on the "age" column only
+    subset_deduped.show();
+}

From 0f6530ead83783961fbc30deb79868e119f2d1b1 Mon Sep 17 00:00:00 2001
From: Leader254 <gatheru.samuel@students.kyu.ac.ke>
Date: Wed, 11 Feb 2026 15:48:02 +0300
Subject: [PATCH 2/2] update test case(dropDuplicates)

---
 tests/dataframe.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/dataframe.cpp b/tests/dataframe.cpp
index 35b4457..7c4ecc1 100644
--- a/tests/dataframe.cpp
+++ b/tests/dataframe.cpp
@@ -288,7 +288,8 @@ TEST_F(SparkIntegrationTest, DropDuplicates)
                     (14, 'Alice'),
                     (14, 'Bob'),
                     (14, 'Bob'),
-                    (15, 'Tom')
+                    (15, 'Tom'),
+                    (15, 'John')
                 AS people(age, name)
     )");