From 825cbcbe310fac099739f33b4e1accfda8a9f5f6 Mon Sep 17 00:00:00 2001
From: Eugene Nikolayev
Date: Sat, 24 May 2025 02:20:52 +0300
Subject: [PATCH] Profiler: expose numRecords from the run result.

---
 docs/profiles.md         |  1 +
 pydeequ/profiles.py      | 15 +++++++++++++--
 tests/test_profiles.py   |  4 ++++
 tutorials/profiles.ipynb | 15 +++++++++++++++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/docs/profiles.md b/docs/profiles.md
index 2007cab..ee6b993 100644
--- a/docs/profiles.md
+++ b/docs/profiles.md
@@ -20,5 +20,6 @@ Here are the current supported functionalities of Profiles.
 | | useSparkSession | |
 | ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done |
 | | property: profiles | Done |
+| | property: numRecords | Done |
 | StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
 | NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |
diff --git a/pydeequ/profiles.py b/pydeequ/profiles.py
index fbbfd84..5ca3d01 100644
--- a/pydeequ/profiles.py
+++ b/pydeequ/profiles.py
@@ -2,12 +2,12 @@
 """ Profiles file for all the Profiles classes in Deequ"""
 import json
 from collections import namedtuple
+from typing import Optional
 
 from pyspark.sql import DataFrame, SparkSession
 from pydeequ.analyzers import KLLParameters
 from pydeequ.metrics import BucketDistribution
 from pydeequ.pandas_utils import ensure_pyspark_df
-from enum import Enum
 from pydeequ.scala_utils import (
     get_or_else_none,
     java_list_to_python_list,
@@ -239,6 +239,7 @@ def __init__(self, spark_session: SparkSession):
         self._sc = spark_session.sparkContext
         self._jvm = spark_session._jvm
         self._profiles = []
+        self._numRecords = None
         self.columnProfileClasses = {
             "StandardColumnProfile": StandardColumnProfile,
             "StringColumnProfile": StandardColumnProfile,
@@ -251,11 +252,12 @@ def _columnProfilesFromColumnRunBuilderRun(self, run):
         Produces a Java profile based on the designated column
 
         :param run: columnProfilerRunner result
-        :return: a setter for columnProfilerRunner result
+        :return self: a setter for columnProfilerRunner result
         """
         self._run_result = run
         profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles())  # TODO from ScalaUtils
         self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
+        self._numRecords = run.numRecords()
         return self
 
     @property
@@ -267,6 +269,15 @@ def profiles(self):
         """
         return self._profiles
 
+    @property
+    def numRecords(self) -> Optional[int]:
+        """
+        A getter for the number of records
+
+        :return Optional[int]: number of records
+        """
+        return self._numRecords
+
     def _columnProfileBuilder(self, column, java_column_profile):
         """Factory function for ColumnProfile
         Returns a Java profile based on the designated column
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index e196ffd..fd46a2c 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -72,6 +72,10 @@ def test_spark_session_type_exception(self):
         except TypeError:
             pass
 
+    def test_profile_numRecords(self):
+        result = ColumnProfilerRunner(self.spark).onData(self.df).run()
+        self.assertEqual(result.numRecords, 3)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tutorials/profiles.ipynb b/tutorials/profiles.ipynb
index b968217..60d15a3 100644
--- a/tutorials/profiles.ipynb
+++ b/tutorials/profiles.ipynb
@@ -952,6 +952,21 @@
     "    print(profile)"
    ]
   },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "print(result.numRecords)",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3010972\n"
+     ]
+    }
+   ],
+   "execution_count": 6
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
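
Usage sketch (not part of the patch): a minimal end-to-end exercise of the new property, mirroring the added test. The session setup follows pydeequ's documented pattern; the three-row DataFrame and its column names are illustrative, not taken from the patch.

    # Hypothetical end-to-end check of the numRecords property added above.
    # Assumes Spark and pydeequ are installed; the data below is made up.
    from pyspark.sql import Row, SparkSession

    import pydeequ
    from pydeequ.profiles import ColumnProfilerRunner

    spark = (
        SparkSession.builder
        .config("spark.jars.packages", pydeequ.deequ_maven_coord)
        .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
        .getOrCreate()
    )

    df = spark.createDataFrame(
        [Row(a="foo", b=1), Row(a="bar", b=2), Row(a="baz", b=3)]
    )

    result = ColumnProfilerRunner(spark).onData(df).run()

    # numRecords surfaces Deequ's row count from the profiler run, i.e. the
    # number of records the profiler actually scanned, so it equals df.count().
    print(result.numRecords)  # 3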