From 825cbcbe310fac099739f33b4e1accfda8a9f5f6 Mon Sep 17 00:00:00 2001
From: Eugene Nikolayev
Date: Sat, 24 May 2025 02:20:52 +0300
Subject: [PATCH] Profiler: expose numRecords from the run result.

---
 docs/profiles.md         |  1 +
 pydeequ/profiles.py      | 15 +++++++++++++--
 tests/test_profiles.py   |  4 ++++
 tutorials/profiles.ipynb | 15 +++++++++++++++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/docs/profiles.md b/docs/profiles.md
index 2007cab..ee6b993 100644
--- a/docs/profiles.md
+++ b/docs/profiles.md
@@ -20,5 +20,6 @@ Here are the current supported functionalities of Profiles.
 | | useSparkSession | |
 | ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done |
 | | property: profiles | Done |
+| | property: numRecords | Done |
 | StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
 | NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |
diff --git a/pydeequ/profiles.py b/pydeequ/profiles.py
index fbbfd84..5ca3d01 100644
--- a/pydeequ/profiles.py
+++ b/pydeequ/profiles.py
@@ -2,12 +2,12 @@
 """ Profiles file for all the Profiles classes in Deequ"""
 import json
 from collections import namedtuple
+from typing import Optional
 
 from pyspark.sql import DataFrame, SparkSession
 from pydeequ.analyzers import KLLParameters
 from pydeequ.metrics import BucketDistribution
 from pydeequ.pandas_utils import ensure_pyspark_df
-from enum import Enum
 from pydeequ.scala_utils import (
     get_or_else_none,
     java_list_to_python_list,
@@ -239,6 +239,7 @@ def __init__(self, spark_session: SparkSession):
         self._sc = spark_session.sparkContext
         self._jvm = spark_session._jvm
         self._profiles = []
+        self._numRecords = None
         self.columnProfileClasses = {
             "StandardColumnProfile": StandardColumnProfile,
             "StringColumnProfile": StandardColumnProfile,
@@ -251,11 +252,12 @@ def _columnProfilesFromColumnRunBuilderRun(self, run):
         Produces a Java profile based on the designated column
 
         :param run: columnProfilerRunner result
-        :return: a setter for columnProfilerRunner result
+        :return self: a setter for columnProfilerRunner result
         """
         self._run_result = run
         profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles())  # TODO from ScalaUtils
         self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
+        self._numRecords = run.numRecords()
         return self
 
     @property
@@ -267,6 +269,15 @@ def profiles(self):
         """
         return self._profiles
 
+    @property
+    def numRecords(self) -> Optional[int]:
+        """
+        A getter for the number of records
+
+        :return Optional[int]: number of records
+        """
+        return self._numRecords
+
     def _columnProfileBuilder(self, column, java_column_profile):
         """Factory function for ColumnProfile
         Returns a Java profile based on the designated column
diff --git a/tests/test_profiles.py b/tests/test_profiles.py
index e196ffd..fd46a2c 100644
--- a/tests/test_profiles.py
+++ b/tests/test_profiles.py
@@ -72,6 +72,10 @@ def test_spark_session_type_exception(self):
         except TypeError:
             pass
 
+    def test_profile_numRecords(self):
+        result = ColumnProfilerRunner(self.spark).onData(self.df).run()
+        self.assertEqual(result.numRecords, 3)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tutorials/profiles.ipynb b/tutorials/profiles.ipynb
index b968217..60d15a3 100644
--- a/tutorials/profiles.ipynb
+++ b/tutorials/profiles.ipynb
@@ -952,6 +952,21 @@
     "    print(profile)"
    ]
   },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "print(result.numRecords)",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3010972\n"
+     ]
+    }
+   ],
+   "execution_count": 6
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
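
Usage sketch (not part of the patch): a minimal end-to-end exercise of the new property, mirroring the added test. The session setup follows pydeequ's documented pattern; the three-row DataFrame and its column names are illustrative, not taken from the patch.

    # Hypothetical end-to-end check of the numRecords property added above.
    # Assumes Spark and pydeequ are installed; the data below is made up.
    from pyspark.sql import Row, SparkSession

    import pydeequ
    from pydeequ.profiles import ColumnProfilerRunner

    spark = (
        SparkSession.builder
        .config("spark.jars.packages", pydeequ.deequ_maven_coord)
        .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
        .getOrCreate()
    )

    df = spark.createDataFrame(
        [Row(a="foo", b=1), Row(a="bar", b=2), Row(a="baz", b=3)]
    )

    result = ColumnProfilerRunner(spark).onData(df).run()

    # numRecords surfaces Deequ's row count from the profiler run, i.e. the
    # number of records the profiler actually scanned, so it equals df.count().
    print(result.numRecords)  # 3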