Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/187.bugfix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed array handling and index alignment in cohorts, non-string value representation in filter rules, imputation row counting in pandas helpers, and pandas ``FutureWarning`` in ``add_unseen``.
28 changes: 18 additions & 10 deletions src/seismometer/data/cohorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,34 +151,38 @@ def get_cohort_performance_data(
return frame


def resolve_col_data(df: pd.DataFrame, feature: Union[str, SeriesOrArray]) -> pd.Series:
    """
    Resolve *feature* into a pandas Series, whether it arrives as a column name,
    a Series, or a numpy array.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; must contain the named column when feature is a string.
    feature : Union[str, SeriesOrArray]
        A column name in df, a pandas.Series, or a numpy array.

    Returns
    -------
    pd.Series
        Always a pandas Series; array inputs are wrapped with df.index so
        downstream operations align on the frame's index.
    """
    if isinstance(feature, str):
        if feature not in df.columns:
            raise KeyError(f"Feature {feature} was not found in dataframe")
        return df[feature].copy()
    if isinstance(feature, pd.Series):
        # A Series already carries its own index; hand it back untouched.
        return feature
    if hasattr(feature, "ndim"):
        # sklearn's predict_proba yields an Nx2 array; column 1 holds the
        # positive-class probabilities. Wrap with df's index for alignment.
        values = feature[:, 1] if feature.ndim > 1 else feature
        return pd.Series(values, index=df.index)
    raise TypeError(f"Feature must be a string, pandas.Series, or numpy.ndarray, was given {type(feature)}")


# endregion
Expand Down Expand Up @@ -232,7 +236,11 @@ def label_cohorts_numeric(series: SeriesOrArray, splits: Optional[List] = None)
labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins) - 1)] + [f">={bins[-1]}"]
labels[0] = f"<{bins[1]}"
cat = pd.Categorical.from_codes(bin_ixs - 1, labels)
return pd.Series(cat)
# Preserve the input series index for proper alignment in pd.concat
if isinstance(series, pd.Series):
return pd.Series(cat, index=series.index)
else:
return pd.Series(cat)


def has_good_binning(bin_ixs: List, bin_edges: List) -> None:
Expand Down Expand Up @@ -275,7 +283,7 @@ def label_cohorts_categorical(series: SeriesOrArray, cat_values: Optional[list]
List of string labels for each bin; which is the list of categories.
"""
series.name = "cohort"
series.cat._name = "cohort" # CategoricalAccessors have a different name..
series.cat._name = "cohort" # CategoricalAccessors have a different name.

# If no splits specified, restrict to observed values
if cat_values is None:
Expand Down
4 changes: 2 additions & 2 deletions src/seismometer/data/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,9 +211,9 @@ def __str__(self) -> str:
case "notna":
return f"{self.left} has a value"
case "isin":
return f"{self.left} is in: {', '.join(self.right)}"
return f"{self.left} is in: {', '.join(map(str, self.right))}"
case "notin":
return f"{self.left} not in: {', '.join(self.right)}"
return f"{self.left} not in: {', '.join(map(str, self.right))}"
case "topk":
return f"{self.left} in top {self.right} values"
case "nottopk":
Expand Down
19 changes: 11 additions & 8 deletions src/seismometer/data/pandas_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,9 +259,9 @@ def post_process_event(
# cast after imputation - supports nonnullable types
try_casting(dataframe, label_col, column_dtype)

# Log how many rows were imputed/changed
imputed_with_time = ((label_na_map & ~time_na_map) & dataframe[label_col].notna()).sum()
imputed_no_time = (dataframe[label_col].isna()).sum()
# Log how many rows were imputed
imputed_with_time = (label_na_map & ~time_na_map).sum()
imputed_no_time = (label_na_map & time_na_map).sum()
logger.debug(
f"Post-processing of events for {label_col} and {time_col} complete. "
f"Imputed {imputed_with_time} rows with time, {imputed_no_time} rows with no time."
Expand Down Expand Up @@ -443,13 +443,15 @@ def _merge_with_strategy(
if merge_strategy == "first":
logger.debug(f"Updating events to only keep the first occurrence for each {event_display}.")
one_event_filtered = one_event.groupby(pks).first().reset_index()
if merge_strategy == "last":
elif merge_strategy == "last":
logger.debug(f"Updating events to only keep the last occurrence for each {event_display}.")
one_event_filtered = one_event.groupby(pks).last().reset_index()

except ValueError as e:
logger.warning(e)
pass
# Only continue with fallback merge if one_event_filtered was set
if "one_event_filtered" not in locals():
raise

return pd.merge(predictions, one_event_filtered, on=pks, how="left")

Expand Down Expand Up @@ -778,14 +780,15 @@ def _resolve_score_col(dataframe: pd.DataFrame, score: str) -> str:


def analytics_metric_name(metric_names: list[str], existing_metric_starts: list[str], column_name: str) -> str:
"""In the analytics table, often the provided column name is not the actual
"""
In the analytics table, often the provided column name is not the actual
metric name that we want to log. Here, we extract the desired metric name.

Parameters
----------
metric_names : list[str]
What metrics already exist.
existing_metric_values : list[str]
existing_metric_starts : list[str]
What strings can start the mangled column name.
column_name : str
The name of the column we are trying to make into a metric.
Expand All @@ -800,7 +803,7 @@ def analytics_metric_name(metric_names: list[str], existing_metric_starts: list[
else:
for value in existing_metric_starts:
if column_name.startswith(f"{value}_"):
return column_name.lstrip(f"{value}_")
return column_name.removeprefix(f"{value}_")
return None


Expand Down
4 changes: 4 additions & 0 deletions src/seismometer/plot/mpl/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ def add_unseen(df: pd.DataFrame, col="cohort") -> pd.DataFrame:
obs = df[col].unique()
unseen = [k for k in keys if k not in obs]

# Only concatenate if there are unseen categories
if not unseen:
return df

rv = pd.concat([df, pd.DataFrame({col: unseen})], ignore_index=True)
rv[col] = rv[col].astype(pd.CategoricalDtype(df[col].cat.categories))
return rv
Expand Down
237 changes: 237 additions & 0 deletions tests/data/test_cohorts.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np
import pandas as pd
import pytest

import seismometer.data.cohorts as undertest
import seismometer.data.performance # NoQA - used in patching
Expand Down Expand Up @@ -99,3 +100,239 @@ def test_data_splits(self):
expected = expected_df(["<1.0", "1.0-2.0", ">=2.0"])

pd.testing.assert_frame_equal(actual, expected, check_column_type=False, check_like=True, check_dtype=False)


class TestGetCohortData:
    """Tests for get_cohort_data() function - previously untested."""

    def test_get_cohort_data_with_column_names(self):
        """Test get_cohort_data with proba and true as column names."""
        frame = input_df()

        actual = undertest.get_cohort_data(frame, "tri", proba="col1", true="TARGET")

        assert len(actual) == 6
        for expected_col in ("true", "pred", "cohort"):
            assert expected_col in actual.columns

    def test_get_cohort_data_with_array_inputs(self):
        """Test get_cohort_data with proba and true as arrays."""
        frame = input_df()
        probas = np.array([0.2, 0.3, 0.4, 0.5, 0.6, 0.7])
        truths = np.array([1, 0, 0, 1, 0, 1])

        actual = undertest.get_cohort_data(frame, "tri", proba=probas, true=truths)

        # Row count and schema survive array inputs
        assert len(actual) == 6
        for expected_col in ("pred", "true", "cohort"):
            assert expected_col in actual.columns
        # Values round-trip unchanged from the input arrays
        assert actual["pred"].tolist() == probas.tolist()
        assert actual["true"].tolist() == truths.tolist()

    def test_get_cohort_data_with_mismatched_array_lengths(self):
        """Test get_cohort_data documents edge case behavior with mismatched lengths."""
        frame = input_df()
        short_proba = pd.Series([0.2, 0.3], index=[0, 1])  # Only 2 rows

        # Pandas aligns by index, then dropna removes mismatched indices
        actual = undertest.get_cohort_data(frame, "tri", proba=short_proba, true="TARGET")

        # Documents behavior: only matching indices kept
        assert len(actual) >= 0  # May be 0-2 depending on cohort column alignment

    def test_get_cohort_data_with_nan_values(self):
        """Test get_cohort_data drops NaN values."""
        frame = pd.DataFrame({"TARGET": [1, 0, np.nan, 1], "col1": [0.2, np.nan, 0.4, 0.5], "tri": [0, 0, 1, 1]})

        actual = undertest.get_cohort_data(frame, "tri", proba="col1", true="TARGET")

        # The two rows carrying a NaN are dropped
        assert len(actual) == 2

    def test_get_cohort_data_with_splits(self):
        """Test get_cohort_data with custom splits parameter."""
        frame = input_df()

        actual = undertest.get_cohort_data(frame, "tri", proba="col1", true="TARGET", splits=[1.0, 2.0])

        # Cohort labels reflect the requested split points
        assert "cohort" in actual.columns
        assert actual["cohort"].cat.categories.tolist() == ["<1.0", "1.0-2.0", ">=2.0"]


class TestResolveColData:
    """Tests for resolve_col_data() helper function - previously untested."""

    def test_resolve_col_data_with_string_column(self):
        """Test resolve_col_data with column name as string."""
        frame = pd.DataFrame({"col1": [1, 2, 3]})

        actual = undertest.resolve_col_data(frame, "col1")

        expected = pd.Series([1, 2, 3], name="col1")
        pd.testing.assert_series_equal(actual, expected)

    def test_resolve_col_data_with_missing_column(self):
        """Test resolve_col_data raises KeyError for missing column."""
        frame = pd.DataFrame({"col1": [1, 2, 3]})

        with pytest.raises(KeyError, match="Feature missing_col was not found in dataframe"):
            undertest.resolve_col_data(frame, "missing_col")

    def test_resolve_col_data_with_2d_array(self):
        """Test resolve_col_data handles 2D array (sklearn probabilities)."""
        frame = pd.DataFrame({"col1": [1, 2, 3]})
        probas = np.array([[0.2, 0.8], [0.3, 0.7], [0.4, 0.6]])

        actual = undertest.resolve_col_data(frame, probas)

        # Only the positive-class column (index 1) is kept
        np.testing.assert_array_equal(actual, np.array([0.8, 0.7, 0.6]))

    def test_resolve_col_data_with_1d_array(self):
        """Test resolve_col_data handles 1D array."""
        frame = pd.DataFrame({"col1": [1, 2, 3]})
        flat = np.array([0.2, 0.3, 0.4])

        actual = undertest.resolve_col_data(frame, flat)

        np.testing.assert_array_equal(actual, flat)

    def test_resolve_col_data_with_invalid_type(self):
        """Test resolve_col_data raises TypeError for invalid input."""
        frame = pd.DataFrame({"col1": [1, 2, 3]})

        with pytest.raises(TypeError, match="Feature must be a string, pandas.Series, or numpy.ndarray"):
            undertest.resolve_col_data(frame, 123)  # Invalid type


class TestResolveCohorts:
    """Tests for resolve_cohorts() function - previously untested."""

    def test_resolve_cohorts_with_categorical_series(self):
        """Test resolve_cohorts auto-dispatches to categorical handler."""
        cohort_col = pd.Series(pd.Categorical(["A", "B", "A", "C"]), name="test_cohort")

        actual = undertest.resolve_cohorts(cohort_col)

        assert isinstance(actual, pd.Series)
        assert hasattr(actual, "cat")
        # Unused categories are dropped
        assert set(actual.cat.categories) == {"A", "B", "C"}

    def test_resolve_cohorts_with_numeric_series(self):
        """Test resolve_cohorts auto-dispatches to numeric handler."""
        values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

        actual = undertest.resolve_cohorts(values)

        assert isinstance(actual, pd.Series)
        assert hasattr(actual, "cat")  # Numeric input is binned into categories
        # Default behavior splits at the mean (3.0), giving two bins
        assert len(actual.cat.categories) == 2

    def test_resolve_cohorts_with_numeric_splits(self):
        """Test resolve_cohorts with custom numeric splits."""
        values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

        actual = undertest.resolve_cohorts(values, splits=[2.5, 4.0])

        assert actual.cat.categories.tolist() == ["<2.5", "2.5-4.0", ">=4.0"]


class TestHasGoodBinning:
    """Tests for has_good_binning() error checking function - previously untested."""

    def test_has_good_binning_with_valid_bins(self):
        """Test has_good_binning passes with valid binning."""
        indexes = np.array([1, 1, 2, 2, 3, 3])
        edges = [0.0, 1.0, 2.0]

        undertest.has_good_binning(indexes, edges)  # No exception expected

    def test_has_good_binning_with_empty_bins(self):
        """Test has_good_binning raises IndexError for empty bins."""
        indexes = np.array([1, 1, 3, 3])  # Nothing lands in bin 2
        edges = [0.0, 1.0, 2.0]

        with pytest.raises(IndexError, match="Splits provided contain some empty bins"):
            undertest.has_good_binning(indexes, edges)

    def test_has_good_binning_with_single_bin(self):
        """Test has_good_binning with single bin edge case."""
        indexes = np.array([1, 1, 1])
        edges = [0.0]

        undertest.has_good_binning(indexes, edges)  # No exception expected


class TestLabelCohortsCategorical:
    """Tests for label_cohorts_categorical() function - previously untested."""

    def test_label_cohorts_categorical_without_cat_values(self):
        """Test label_cohorts_categorical removes unused categories."""
        observed = pd.Series(pd.Categorical(["A", "B", "A"], categories=["A", "B", "C", "D"]))

        actual = undertest.label_cohorts_categorical(observed)

        # Unobserved categories C and D are dropped
        assert set(actual.cat.categories) == {"A", "B"}

    def test_label_cohorts_categorical_with_cat_values_matching(self):
        """Test label_cohorts_categorical with matching cat_values."""
        observed = pd.Series(pd.Categorical(["A", "B", "C"], categories=["A", "B", "C"]))

        actual = undertest.label_cohorts_categorical(observed, cat_values=["A", "B", "C"])

        # When every category is requested, the series passes through unchanged
        pd.testing.assert_series_equal(actual, observed, check_names=False)

    def test_label_cohorts_categorical_with_cat_values_filtering(self):
        """Test label_cohorts_categorical filters to specified cat_values."""
        observed = pd.Series(pd.Categorical(["A", "B", "C", "D"], categories=["A", "B", "C", "D"]))

        actual = undertest.label_cohorts_categorical(observed, cat_values=["A", "C"])

        # Values outside the requested categories become NaN
        assert actual.notna().sum() == 2
        assert set(actual.dropna()) == {"A", "C"}


class TestFindBinEdges:
    """Tests for find_bin_edges() function - previously untested."""

    def test_find_bin_edges_with_no_thresholds(self):
        """Test find_bin_edges defaults to mean split."""
        values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])  # Mean = 3.0

        edges = undertest.find_bin_edges(values)

        # Expect [min, mean]
        assert len(edges) == 2
        assert edges[0] == 1.0  # Series minimum
        assert edges[1] == 3.0  # Series mean

    def test_find_bin_edges_with_custom_thresholds(self):
        """Test find_bin_edges with custom threshold values."""
        values = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

        edges = undertest.find_bin_edges(values, thresholds=[2.0, 4.0])

        # Expect [min, threshold1, threshold2]
        assert len(edges) == 3
        assert edges[0] == 1.0  # Series minimum
        assert edges[1] == 2.0
        assert edges[2] == 4.0

    def test_find_bin_edges_with_single_value_series(self):
        """Test find_bin_edges with series containing single unique value."""
        values = pd.Series([5.0, 5.0, 5.0])

        edges = undertest.find_bin_edges(values)

        # Degenerate case: min == mean, so every edge collapses to 5.0
        assert len(edges) >= 1
        assert all(edge == 5.0 for edge in edges)
Loading