docs/profiles.md (1 change: 1 addition & 0 deletions)
@@ -20,5 +20,6 @@ Here are the currently supported functionalities of Profiles.
| | useSparkSession | |
| ColumnProfilesBuilder | ColumnProfilesBuilder(spark_session) | Done |
| | property: profiles | Done |
| | property: numRecords | Done |
| StandardColumnProfile | StandardColumnProfile(spark_session, column, java_column_profile) | Done |
| NumericColumnProfile | NumericColumnProfile(spark_session, column, java_column_profile) | Done |
pydeequ/profiles.py (15 changes: 13 additions & 2 deletions)
@@ -2,12 +2,12 @@
""" Profiles file for all the Profiles classes in Deequ"""
import json
from collections import namedtuple
from typing import Optional

from pyspark.sql import DataFrame, SparkSession
from pydeequ.analyzers import KLLParameters
from pydeequ.metrics import BucketDistribution
from pydeequ.pandas_utils import ensure_pyspark_df
from enum import Enum
from pydeequ.scala_utils import (
get_or_else_none,
java_list_to_python_list,
@@ -239,6 +239,7 @@ def __init__(self, spark_session: SparkSession):
self._sc = spark_session.sparkContext
self._jvm = spark_session._jvm
self._profiles = []
self._numRecords = None
self.columnProfileClasses = {
"StandardColumnProfile": StandardColumnProfile,
"StringColumnProfile": StandardColumnProfile,
@@ -251,11 +252,12 @@ def _columnProfilesFromColumnRunBuilderRun(self, run):
Produces a Java profile based on the designated column

:param run: columnProfilerRunner result
:return: a setter for columnProfilerRunner result
:return self: this builder, populated from the columnProfilerRunner result
"""
self._run_result = run
profile_map = self._jvm.scala.collection.JavaConversions.mapAsJavaMap(run.profiles()) # TODO from ScalaUtils
self._profiles = {column: self._columnProfileBuilder(column, profile_map[column]) for column in profile_map}
self._numRecords = run.numRecords()
return self

@property
@@ -267,6 +269,15 @@ def profiles(self):
"""
return self._profiles

@property
def numRecords(self) -> Optional[int]:
"""
A getter for the number of records

:return Optional[int]: number of records
"""
return self._numRecords

def _columnProfileBuilder(self, column, java_column_profile):
"""Factory function for ColumnProfile
Returns a Java profile based on the designated column
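With run() complete, the record count can be read straight off the returned builder. A minimal usage sketch, assuming a Deequ-enabled SparkSession (Deequ jars on the classpath) and a throwaway three-row DataFrame:

from pyspark.sql import SparkSession
from pydeequ.profiles import ColumnProfilerRunner

spark = SparkSession.builder.getOrCreate()  # assumes Deequ is already configured
df = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["id", "name"])

result = ColumnProfilerRunner(spark).onData(df).run()

# _numRecords stays None until run() populates it, hence Optional[int].
print(result.numRecords)  # 3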
tests/test_profiles.py (4 changes: 4 additions & 0 deletions)
@@ -72,6 +72,10 @@ def test_spark_session_type_exception(self):
except TypeError:
pass

def test_profile_numRecords(self):
result = ColumnProfilerRunner(self.spark).onData(self.df).run()
self.assertEqual(result.numRecords, 3)


if __name__ == "__main__":
unittest.main()
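The expected value of 3 comes from the suite's shared DataFrame fixture. A hypothetical sketch of such a fixture (the real suite builds self.df with its own rows and schema; the assertion only depends on the row count):

from pyspark.sql import SparkSession

@classmethod
def setUpClass(cls):
    cls.spark = SparkSession.builder.getOrCreate()
    # Hypothetical three-row fixture; numRecords should come back as 3.
    cls.df = cls.spark.createDataFrame(
        [(1, "foo", 5), (2, "bar", 6), (3, "baz", None)],
        ["a", "b", "c"],
    )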
tutorials/profiles.ipynb (15 changes: 15 additions & 0 deletions)
@@ -952,6 +952,21 @@
" print(profile)"
]
},
{
"metadata": {},
"cell_type": "code",
"source": "print(result.numRecords)",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3010972\n"
]
}
],
"execution_count": 6
},
{
"cell_type": "markdown",
"metadata": {},