Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/model_drift/drift/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from .base import BaseDriftCalculator # noqa
from .categorical import ChiSqDriftCalculator # noqa
from .categorical import ChiSqDriftCalculator, ChiSqDriftCalculatorJackKnife, HellingerDriftCalculatorJackKnife # noqa
from .histogram import HistIntersectionCalculator, KdeHistPlotCalculator # noqa
from .collection import DriftCollectionCalculator # noqa
from .numeric import KSDriftCalculator, BasicDriftCalculator # noqa
from .numeric import KSDriftCalculator, BasicDriftCalculator, KSDriftCalculatorJackKnife, EMDDriftCalculatorJackKnife_1D, EMDDriftCalculatorJackKnife_woRef # noqa
from .performance import AUROCCalculator, ClassificationReportCalculator # noqa
from .sampler import Sampler # noqa
from .tabular import TabularDriftCalculator # noqa
from .tabular import TabularDriftCalculator # noqa
126 changes: 125 additions & 1 deletion src/model_drift/drift/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,25 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
from collections import Counter
import math

import numpy as np
from scipy.stats import chi2_contingency, chi2

from model_drift.drift.base import BaseDriftCalculator


def merge_freqs(ref_counts, sample):
    """Align reference and sample category counts on a common key set.

    Args:
        ref_counts: a Counter (or mapping) of reference category counts.
        sample: an iterable of sample category labels.

    Returns:
        (expected, keys, observed): two parallel numpy count arrays indexed
        by the union of categories, plus that category set.
    """
    sample_counts = Counter(sample)
    categories = set(ref_counts) | set(sample_counts)
    # Build both count columns in one pass so they stay aligned per category.
    pairs = [(ref_counts.get(c, 0), sample_counts.get(c, 0)) for c in categories]
    if pairs:
        ref_col, sample_col = zip(*pairs)
    else:
        ref_col, sample_col = (), ()
    return np.array(ref_col), categories, np.array(sample_col)

def hellinger_distance(p, q):
    """Hellinger distance between two discrete distributions.

    Args:
        p, q: aligned sequences of probabilities (same category order).

    Returns:
        A float in [0, 1] for proper probability vectors.
    """
    total = 0.0
    for p_i, q_i in zip(p, q):
        diff = math.sqrt(p_i) - math.sqrt(q_i)
        total += diff ** 2
    return math.sqrt(total / 2)


class ChiSqDriftCalculator(BaseDriftCalculator):
name = "chi2"
Expand Down Expand Up @@ -53,3 +58,122 @@ def _predict(self, sample):
out['critical_diff'] = out['distance'] - out['critical_value']

return out



class ChiSqDriftCalculatorJackKnife(BaseDriftCalculator):
    """Chi-square drift statistic with a jackknife-style bias correction.

    The chi-square statistic between a reference resample and the sample is
    reduced by the statistic between two independent, same-size reference
    resamples, so the reported distance is approximately zero when the
    sample is drawn from the reference distribution.
    """

    name = "chi2_jackknife"

    def __init__(self, q_val=0.1, correction=True, lambda_=None, use_freq=False,
                 include_critical_values=False, **kwargs):
        """Args:
            q_val: quantile parameter. NOTE(review): currently unused by _predict.
            correction: apply Yates' continuity correction in chi2_contingency.
            lambda_: power-divergence parameter forwarded to chi2_contingency.
            use_freq: compare relative frequencies instead of raw counts.
            include_critical_values: unsupported; _predict raises if True.
        """
        super().__init__(**kwargs)
        self.q_val = q_val
        self.correction = correction
        self.lambda_ = lambda_
        self.use_freq = use_freq
        self.include_critical_values = include_critical_values

    def convert(self, arg):
        # Treat every value as a categorical label.
        return arg.apply(str)

    def prepare(self, ref, **kwargs):
        # Reference category counts, kept for inspection; _predict recounts
        # from fresh resamples on every call.
        self._ref_counts = Counter(ref)
        super().prepare(ref)

    def _predict(self, sample):
        """Return {'distance': bias-corrected chi2 statistic, 'pval': NaN}."""
        nobs = len(sample)

        # Two independent resamples (with replacement) of the reference,
        # matched to the sample size.
        ref1 = np.random.choice(self._ref, nobs)
        ref2 = np.random.choice(self._ref, nobs)

        ref1_counts = Counter(ref1)
        ref2_counts = Counter(ref2)

        exp_ref1_sam, _, obs_ref1_sam = merge_freqs(ref1_counts, sample)
        exp_ref1_ref2, _, obs_ref1_ref2 = merge_freqs(ref1_counts, ref2_counts)

        if self.use_freq:
            exp_ref1_sam = exp_ref1_sam / exp_ref1_sam.sum()
            obs_ref1_sam = obs_ref1_sam / obs_ref1_sam.sum()
            exp_ref1_ref2 = exp_ref1_ref2 / exp_ref1_ref2.sum()
            obs_ref1_ref2 = obs_ref1_ref2 / obs_ref1_ref2.sum()

        # Raw statistic: reference resample vs. sample.
        dist1, _, _, _ = chi2_contingency(np.vstack([exp_ref1_sam, obs_ref1_sam]),
                                          correction=self.correction,
                                          lambda_=self.lambda_)
        # Bias term: reference resample vs. reference resample.
        dist2, _, _, _ = chi2_contingency(np.vstack([exp_ref1_ref2, obs_ref1_ref2]),
                                          correction=self.correction,
                                          lambda_=self.lambda_)

        out = {
            # Clamp at zero: the bias term can exceed the raw statistic.
            "distance": max(dist1 - dist2, 0.0),
            # No meaningful p-value for the corrected statistic.
            "pval": float("NaN"),
        }

        if self.include_critical_values:
            raise NotImplementedError("Critical value not implemented for jackknife")

        return out

class HellingerDriftCalculatorJackKnife(BaseDriftCalculator):
    """Hellinger-distance drift with a jackknife-style bias correction.

    The Hellinger distance between a reference resample and the sample is
    reduced by the distance between two independent, same-size reference
    resamples, so the reported distance is approximately zero when the
    sample is drawn from the reference distribution.
    """

    name = "hellinger_jackknife"

    def __init__(self, q_val=0.1, correction=True, lambda_=None, use_freq=True,
                 include_critical_values=False, **kwargs):
        """Args:
            q_val: quantile parameter. NOTE(review): currently unused by _predict.
            correction: NOTE(review): unused here (chi2-specific; kept for a
                signature parallel to ChiSqDriftCalculatorJackKnife).
            lambda_: NOTE(review): unused here (chi2-specific).
            use_freq: normalize counts to frequencies; required for a proper
                Hellinger distance between probability vectors.
            include_critical_values: unsupported; _predict raises if True.
        """
        super().__init__(**kwargs)
        self.q_val = q_val
        self.correction = correction
        self.lambda_ = lambda_
        self.use_freq = use_freq
        self.include_critical_values = include_critical_values

    def convert(self, arg):
        # Treat every value as a categorical label.
        return arg.apply(str)

    def prepare(self, ref, **kwargs):
        # Reference category counts, kept for inspection; _predict recounts
        # from fresh resamples on every call.
        self._ref_counts = Counter(ref)
        super().prepare(ref)

    def _predict(self, sample):
        """Return {'distance': bias-corrected Hellinger distance, 'pval': NaN}."""
        nobs = len(sample)

        # Two independent resamples (with replacement) of the reference,
        # matched to the sample size.
        ref1 = np.random.choice(self._ref, nobs)
        ref2 = np.random.choice(self._ref, nobs)

        ref1_counts = Counter(ref1)
        ref2_counts = Counter(ref2)

        exp_ref1_sam, _, obs_ref1_sam = merge_freqs(ref1_counts, sample)
        exp_ref1_ref2, _, obs_ref1_ref2 = merge_freqs(ref1_counts, ref2_counts)

        if self.use_freq:
            exp_ref1_sam = exp_ref1_sam / exp_ref1_sam.sum()
            obs_ref1_sam = obs_ref1_sam / obs_ref1_sam.sum()
            exp_ref1_ref2 = exp_ref1_ref2 / exp_ref1_ref2.sum()
            obs_ref1_ref2 = obs_ref1_ref2 / obs_ref1_ref2.sum()

        # Raw distance (reference vs. sample) minus the bias term
        # (reference vs. reference).
        dist1 = hellinger_distance(exp_ref1_sam, obs_ref1_sam)
        dist2 = hellinger_distance(exp_ref1_ref2, obs_ref1_ref2)

        out = {
            # Clamp at zero: the bias term can exceed the raw distance.
            "distance": max(dist1 - dist2, 0.0),
            # No p-value is defined for this statistic.
            "pval": float("NaN"),
        }

        if self.include_critical_values:
            raise NotImplementedError("Critical value not implemented for jackknife")

        return out
128 changes: 127 additions & 1 deletion src/model_drift/drift/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import numpy as np
import pandas as pd
from scipy.special import kolmogi
from scipy.stats import ks_2samp
from scipy.stats import ks_2samp, wasserstein_distance
import ot

from model_drift.drift.base import BaseDriftCalculator

Expand Down Expand Up @@ -48,6 +49,131 @@ def calc_critical_value(n1, n2, q=.01):
return kolmogi(q) * np.sqrt((n1 + n2) / (n1 * n2))


class KSDriftCalculatorJackKnife(NumericBaseDriftCalculator):
    """Kolmogorov-Smirnov drift statistic with a jackknife-style bias correction.

    The KS statistic between a reference resample and the sample is reduced
    by the statistic between two independent, same-size reference resamples,
    so the reported distance is approximately zero when the sample is drawn
    from the reference distribution.
    """

    name = "ks_jackknife"

    def __init__(self, q_val=0.1, alternative='two-sided', mode='asymp', average='macro',
                 include_critical_value=False, **kwargs):
        """Args:
            q_val: quantile parameter. NOTE(review): currently unused by _predict.
            alternative: alternative hypothesis forwarded to ks_2samp.
            mode: p-value computation mode forwarded to ks_2samp.
            average: NOTE(review): currently unused by _predict.
            include_critical_value: unsupported; _predict raises if True.
        """
        super().__init__(**kwargs)
        self.q_val = q_val
        self.alternative = alternative
        self.mode = mode
        self.average = average
        self.include_critical_value = include_critical_value

    def _predict(self, sample):
        """Return {'distance': bias-corrected KS statistic, 'pval': NaN}."""
        nobs = len(sample)

        # Two independent resamples (with replacement) of the reference,
        # matched to the sample size.
        ref1 = np.random.choice(self._ref, nobs)
        ref2 = np.random.choice(self._ref, nobs)

        out = {}
        try:
            # Raw statistic (reference vs. sample) and bias term
            # (reference vs. reference).
            dist1, _ = ks_2samp(ref1, sample, alternative=self.alternative, mode=self.mode)
            dist2, _ = ks_2samp(ref1, ref2, alternative=self.alternative, mode=self.mode)

            # Clamp at zero: the bias term can exceed the raw statistic.
            out["distance"] = max(dist1 - dist2, 0.0)
            out['pval'] = float("NaN")

        except TypeError:
            # Non-numeric data: report NaNs rather than failing the run.
            out["distance"], out['pval'] = float("NaN"), float("NaN")

        if self.include_critical_value:
            raise NotImplementedError("Critical value not implemented for jackknife")

        return out


class EMDDriftCalculatorJackKnife_woRef(NumericBaseDriftCalculator):
    # Multi-dimensional Earth Mover's Distance (via the POT `ot` package)
    # with a jackknife-style bias correction; reference resamples are drawn
    # only from reference rows whose vectors do NOT occur in the sample
    # ("woRef" = without overlapping reference rows).
    name = "emd_jackknife"

    def __init__(self, include_critical_value=False, **kwargs):
        # include_critical_value: unsupported; _predict raises if True.
        super().__init__(**kwargs)
        self.include_critical_value = include_critical_value

    def convert(self, arg):
        # Identity conversion: rows are kept as-is.
        # NOTE(review): assumes each row is a list-like feature vector
        # (see .apply(tuple) / .tolist() below) -- confirm against callers.
        return arg

    def _predict(self, sample):
        """Return {'distance': bias-corrected EMD, 'pval': NaN} for `sample`.

        NOTE(review): assumes `sample` and `self._ref` are pandas Series of
        list-like feature vectors -- TODO confirm.
        """

        def _emd_distance(tar, ref):
            # Exact EMD between two point clouds with uniform weights.
            a = np.ones((len(ref))) / len(ref) # Uniform weights for the reference set
            b = np.ones((len(tar))) / len(tar) # Uniform weights for the target set
            M = ot.dist(ref, tar)  # pairwise cost matrix between the two clouds
            G0 = ot.emd(a, b, M, numItermax=1000000)  # optimal transport plan
            em_distance = np.sum(M * G0)  # total transport cost
            return em_distance

        nref = len(self._ref)  # NOTE(review): unused
        nobs = len(sample)

        # Restrict resampling to reference rows not present in the sample;
        # tuples make the rows hashable for the isin() membership test.
        sample_tuples = sample.apply(tuple)
        ref_tuples = self._ref.apply(tuple)
        ref_tuples_exclusive = ref_tuples[~ref_tuples.isin(sample_tuples)]
        ref_lists_exclusive = ref_tuples_exclusive.apply(list)

        # Two same-size resamples (with replacement) from the exclusive pool.
        # NOTE(review): np.random.choice raises if the pool is empty (every
        # reference row also appears in the sample) -- confirm upstream.
        ref1 = np.random.choice(ref_lists_exclusive, nobs)
        ref2 = np.random.choice(ref_lists_exclusive, nobs)

        # Replace NaN/inf with 0 so the transport solver gets finite costs.
        sample_arr = np.array(sample.tolist())
        sample_arr = np.nan_to_num(sample_arr, nan=0.0, posinf=0.0, neginf=0.0)

        ref1_arr = np.array(ref1.tolist())
        ref1_arr = np.nan_to_num(ref1_arr, nan=0.0, posinf=0.0, neginf=0.0)

        ref2_arr = np.array(ref2.tolist())
        ref2_arr = np.nan_to_num(ref2_arr, nan=0.0, posinf=0.0, neginf=0.0)

        out = {}
        try:
            # Raw EMD (reference vs. sample) and bias term (ref vs. ref).
            dist1 = _emd_distance(ref1_arr, sample_arr)
            dist2 = _emd_distance(ref1_arr, ref2_arr)

            # Clamp at zero: the bias term can exceed the raw distance.
            out["distance"] = max(dist1 - dist2, 0.0)
            out['pval'] = float("NaN")  # no p-value for this statistic

        except TypeError:
            # Non-numeric data: report NaNs rather than failing the run.
            out["distance"], out['pval'] = float("NaN"), float("NaN")

        if self.include_critical_value:
            raise NotImplementedError("Critical value not implemented for jackknife")

        return out

class EMDDriftCalculatorJackKnife_1D(NumericBaseDriftCalculator):
    """One-dimensional Earth Mover's (Wasserstein) distance with a
    jackknife-style bias correction.

    The Wasserstein distance between a reference resample and the sample is
    reduced by the distance between two independent, same-size reference
    resamples, so the reported distance is approximately zero when the
    sample is drawn from the reference distribution.
    """

    name = "emd_jackknife_1d"

    def __init__(self, include_critical_value=False, **kwargs):
        """Args:
            include_critical_value: unsupported; _predict raises if True.
        """
        super().__init__(**kwargs)
        self.include_critical_value = include_critical_value

    def _predict(self, sample):
        """Return {'distance': bias-corrected 1-D EMD, 'pval': NaN}."""
        nobs = len(sample)

        # Two independent resamples (with replacement) of the reference,
        # matched to the sample size.
        ref1 = np.random.choice(self._ref, nobs)
        ref2 = np.random.choice(self._ref, nobs)

        out = {}
        # Drop NaNs from the arrays before computing the EMD.
        ref1 = ref1[~np.isnan(ref1)]
        ref2 = ref2[~np.isnan(ref2)]
        sample = sample[~np.isnan(sample)]
        try:
            # Raw distance (reference vs. sample) and bias term
            # (reference vs. reference).
            dist1 = wasserstein_distance(ref1, sample)
            dist2 = wasserstein_distance(ref1, ref2)

            # Clamp at zero: the bias term can exceed the raw distance.
            out["distance"] = max(dist1 - dist2, 0.0)
            out['pval'] = float("NaN")

        except (TypeError, ValueError):
            # TypeError: non-numeric data. ValueError: an input became empty
            # after NaN filtering (wasserstein_distance rejects empty
            # distributions); both fall back to NaNs instead of crashing.
            out["distance"], out['pval'] = float("NaN"), float("NaN")

        if self.include_critical_value:
            raise NotImplementedError("Critical value not implemented for jackknife")

        return out


class BasicDriftCalculator(NumericBaseDriftCalculator):
name = "stats"

Expand Down
53 changes: 50 additions & 3 deletions src/model_drift/drift/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,42 @@ def macro_auc(scores, labels, skip_missing=True):
def micro_auc(scores, labels):
return float(auroc(torch.tensor(scores), torch.tensor(labels).long(), average='micro').numpy())

def youden_point(df, target_names):
    """Calculate the Youden-index operating point for each disease.

    For each target, computes the ROC curve from the `label.<disease>` and
    `activation.<disease>` columns of `df` and picks the threshold that
    maximizes Youden's J statistic (tpr - fpr).

    Args:
        df: DataFrame with 'label.<disease>' and 'activation.<disease>' columns.
        target_names: iterable of disease/target names.

    Returns:
        dict mapping disease name -> threshold formatted as a ".4f" string.
        NOTE(review): callers convert back with float(); consider returning
        floats directly in a future change.
    """
    operating_points = {}
    for disease in target_names:
        fpr, tpr, thresholds = metrics.roc_curve(df[f'label.{disease}'], df[f'activation.{disease}'])

        # Youden's J statistic: pick the threshold maximizing (tpr - fpr).
        idx = np.argmax(tpr - fpr)
        best_cutoff = thresholds[idx]

        operating_points[disease] = format(best_cutoff, ".4f")

    return operating_points


def classification_report(scores, labels, target_names=None, th=None):
keeps = (labels.sum(axis=0) > 0)

if target_names is None:
target_names = [str(i) for i in range(scores.shape[1])]
target_names = np.array(target_names)
output = metrics.classification_report(labels, scores >= th, target_names=target_names, output_dict=True)

if not isinstance(th, dict):
raise ValueError("Thresholds (th) must be provided as a dictionary with target names as keys.")

#binarize scores according to their thresholds
binary_scores = np.zeros_like(scores, dtype=bool)
for i, target in enumerate(target_names):
if target in th:
binary_scores[:, i] = scores[:, i] >= float(th[target])
else:
raise KeyError(f"No threshold provided for target '{target}'.")

output = metrics.classification_report(labels, binary_scores, target_names=target_names, output_dict=True)
for i, k in enumerate(target_names):
if keeps[i] == 0:
continue
Expand Down Expand Up @@ -99,12 +127,31 @@ def _predict(self, sample):
class ClassificationReportCalculator(BaseDriftCalculator):
name = "class_report"

def __init__(self, label_col=None, score_col=None, target_names=None, th=None):
    """Args:
        label_col: column holding the ground-truth labels.
        score_col: column holding the model scores/activations.
        target_names: ordered target (disease) names for the score columns.
        th: per-target threshold dict; when None, prepare() derives the
            thresholds from the reference data via the Youden index.
    """
    super().__init__()
    self.label_col = label_col
    self.score_col = score_col
    self.target_names = target_names
    self.th = th

def prepare(self, ref):
    """Prepare the reference frame and derive per-target thresholds.

    Expands the list-valued 'activation' and 'label' columns of the
    reference into one 'activation.<target>' / 'label.<target>' column per
    target, stores the result in self._ref_df, and sets self.th to the
    Youden-index operating point for each target.

    Raises:
        ValueError: if the converted reference lacks the required
            'activation' and 'label' columns.
    """
    self._ref = self.convert(ref)
    if 'activation' not in self._ref or 'label' not in self._ref:
        # Fail fast: the previous code only printed an error and then
        # crashed later with an AttributeError when youden_point() was
        # called with self._ref_df unset.
        raise ValueError("'ref' does not contain required 'activation' and 'label' columns.")

    # Expand the list-valued columns into one column per target.
    activations_df = pd.DataFrame(self._ref['activation'].tolist(), index=self._ref.index)
    labels_df = pd.DataFrame(self._ref['label'].tolist(), index=self._ref.index)
    combined_df = pd.concat([activations_df, labels_df], axis=1)

    combined_df.columns = ['activation.' + str(k) for k in self.target_names] + \
                          ['label.' + str(k) for k in self.target_names]

    self._ref_df = combined_df
    # Derive per-target operating points from the reference data.
    self.th = youden_point(self._ref_df, self.target_names)
    self._is_prepared = True

def convert(self, arg):
if not isinstance(arg, pd.DataFrame):
Expand Down
Loading