Skip to content

Commit 1c1ae2c

Browse files
committed
Adding tests for issue 1429
1 parent b50deed commit 1c1ae2c

File tree

1 file changed

+56
-0
lines changed

1 file changed

+56
-0
lines changed

tests/issues/test_issue1429.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Test for issue 1429:
3+
https://github.com/ydataai/ydata-profiling/issues/1429
4+
"""
5+
import numpy as np
6+
7+
from ydata_profiling.config import SparkSettings
8+
from ydata_profiling.model.spark.describe_numeric_spark import numeric_stats_spark
9+
from ydata_profiling.model.spark.describe_counts_spark import describe_counts_spark
10+
from pyspark.sql import types as T, SparkSession, DataFrame
11+
12+
13+
def create_test_df(spark: SparkSession) -> DataFrame:
    """Build the fixture DataFrame for issue 1429.

    Contents: 205 unique (category, double) rows, followed by 205
    duplicates of ("test_1", 1.0), followed by 100 all-null rows.
    Both columns are nullable.
    """
    schema = T.StructType(
        [
            T.StructField("category", T.StringType(), True),
            T.StructField("double", T.DoubleType(), True),
        ]
    )

    unique_rows = [(f"test_{i + 1}", float(i)) for i in range(205)]
    # Adding dupes
    duplicate_rows = [("test_1", float(1)) for _ in range(205)]
    # Adding nulls
    null_rows = [(None, None) for _ in range(100)]

    return spark.createDataFrame(
        unique_rows + duplicate_rows + null_rows, schema=schema
    )
40+
41+
42+
def test_describe_numeric_spark(spark_session):
    """Regression test for issue 1429.

    Computes numeric statistics over the "double" column (which contains
    null rows) and checks that every statistic was actually produced —
    i.e. no entry in the result is None.
    """
    test_df = create_test_df(spark_session)

    numeric_stats = numeric_stats_spark(df=test_df.select("double"), summary={})

    # Iterate values only — the keys were unpacked but unused before.
    for value in numeric_stats.values():
        assert value is not None
49+
50+
51+
def test_describe_counts_spark(spark_session):
    """Regression test for issue 1429: category value counts.

    "test_1" occurs once in the unique rows plus 205 duplicated rows,
    so its count excluding NaN must be 206.
    """
    df = create_test_df(spark_session)
    category_series = df.select("category")

    _, _, summary = describe_counts_spark(
        config=SparkSettings(), series=category_series, summary={}
    )

    counts = summary["value_counts_without_nan"]
    assert counts.loc["test_1"] == 206

0 commit comments

Comments
 (0)