catwalk/estimators/classifiers.py (175 additions, 1 deletion)
@@ -4,8 +4,15 @@
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

from catwalk.estimators.transformers import CutOff
from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals

import numpy as np
import random

MAX_INT = np.iinfo(np.int32).max

class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
"""
@@ -76,3 +83,170 @@ def predict(self, X):

def score(self, X, y):
return self.pipeline.score(X,y)


class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
"""
Fit a decision tree with a subset of features that respects categoricals

Args:
categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.
"""
def __init__(self,
categoricals,
max_features='sqrt',
random_state=None,
criterion="gini",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.,
max_leaf_nodes=None,
min_impurity_split=1e-07,
class_weight=None,
presort=False):

self.categoricals = categoricals
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_features = max_features
self.random_state = random_state
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort

self.subset_cols = SubsetWithCategoricals(
categoricals=categoricals, max_features=max_features, random_state=random_state
)
self.tree = DecisionTreeClassifier(
criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort
)

self.pipeline = Pipeline([
('subset_cols', self.subset_cols),
('tree', self.tree)
])

def fit(self, X, y):

# set the underlying random states before fitting
# doing this here rather than in the constructor because self.random_state might
# have been modified by an ensemble method
self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state)
self.pipeline.named_steps['tree'].set_params(random_state=self.random_state)

self.pipeline.fit(X, y)

self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices

self.classes_ = self.pipeline.named_steps['tree'].classes_
self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
self.n_features_ = self.pipeline.named_steps['tree'].n_features_
self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
self.tree_ = self.pipeline.named_steps['tree'].tree_

# feature importances need to reference full column set but underlying tree
# was trained on the subset, so fill in others with zeros
fi = self.pipeline.named_steps['tree'].feature_importances_
fi_dict = dict(zip(self.subset_indices, fi))
fi_full = []
for i in range(X.shape[1]):
fi_full.append(fi_dict.get(i, 0))
self.feature_importances_ = fi_full

return self

def apply(self, X):
    # sklearn's Pipeline exposes no apply()/decision_path(), so run X through
    # the subsetting step and call the underlying tree directly
    X_sub = self.pipeline.named_steps['subset_cols'].transform(X)
    return self.pipeline.named_steps['tree'].apply(X_sub)

def decision_path(self, X):
    X_sub = self.pipeline.named_steps['subset_cols'].transform(X)
    return self.pipeline.named_steps['tree'].decision_path(X_sub)

def predict(self, X):
return self.pipeline.predict(X)

def predict_log_proba(self, X):
return self.pipeline.predict_log_proba(X)

def predict_proba(self, X):
return self.pipeline.predict_proba(X)

def score(self, X, y):
return self.pipeline.score(X, y)
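
# A minimal usage sketch (illustrative only; the toy matrix, labels, and
# column grouping below are assumptions for demonstration, not part of this
# module):
#
#   import numpy as np
#   X = np.hstack([
#       np.random.rand(100, 1),                     # column 0: numeric
#       np.eye(3)[np.random.randint(3, size=100)],  # columns 1-3: one one-hot categorical
#       np.random.rand(100, 1),                     # column 4: numeric
#   ])
#   y = np.random.randint(2, size=100)
#   clf = CatInATreeClassifier(categoricals=[[1, 2, 3]], random_state=42)
#   clf.fit(X, y)
#   clf.subset_indices        # columns 1-3 are kept or dropped as a group
#   clf.feature_importances_  # length 5; zeros for columns outside the subset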


class CatInAForestClassifier(BaggingClassifier):
"""
Bagged classifier using CatInATreeClassifier as the base estimator.
Note that feature subsetting is handled per tree by the underlying
CatInATreeClassifier (via `max_features_tree`), while the bagging
classifier itself passes all features to each tree with no option
for feature bootstrapping.
"""
def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False,
warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best",
max_depth=None, min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07,
class_weight=None, presort=False):

# if isinstance(random_state, int):
# random.seed(random_state)
# elif isinstance(random_state, np.random.RandomState):
# random.seed(random_state.randint(MAX_INT))

# set up the base estimator as a CatInATreeClassifier()
self.base_estimator = CatInATreeClassifier(
categoricals=categoricals, max_features=max_features_tree, criterion=criterion,
splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split,
class_weight=class_weight, presort=presort
)

# Call the super-class's constructor
# Here, we force each tree to consider all features (without bootstrapping)
# as we'll handle the subsetting in the base estimator to have control over
# sampling categoricals. Also note that calling the BaggingClassifier
# constructor will set an object parameter `max_features`=1.0, so we've
# named the class parameter `max_features_tree` to avoid a collision.
BaggingClassifier.__init__(
self,
base_estimator=self.base_estimator,
n_estimators=n_estimators,
max_samples=max_samples,
max_features=1.0,
bootstrap=bootstrap,
bootstrap_features=False,
oob_score=oob_score,
warm_start=warm_start,
n_jobs=n_jobs,
random_state=random_state,
verbose=verbose
)

self.categoricals = categoricals
self.max_features_tree = max_features_tree
self.criterion = criterion
self.splitter = splitter
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.min_samples_leaf = min_samples_leaf
self.min_weight_fraction_leaf = min_weight_fraction_leaf
self.max_leaf_nodes = max_leaf_nodes
self.min_impurity_split = min_impurity_split
self.class_weight = class_weight
self.presort = presort
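
# A minimal usage sketch (illustrative only; `X`, `y`, and the settings below
# are assumptions, reusing the toy data from the CatInATreeClassifier sketch
# above). Per-tree feature subsetting is driven by `max_features_tree`; the
# bagging step itself always passes all columns through:
#
#   forest = CatInAForestClassifier(categoricals=[[1, 2, 3]],
#                                   max_features_tree='sqrt',
#                                   n_estimators=50, random_state=42)
#   forest.fit(X, y)
#   forest.predict_proba(X[:5])
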
catwalk/estimators/transformers.py (100 additions, 0 deletions)
@@ -3,11 +3,28 @@
import warnings

import numpy as np
from math import log, sqrt
import random

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES

MAX_INT = np.iinfo(np.int32).max

def flatten_list(l):
"""
Simple utility to flatten a list down to one dimension even if the list
contains elements of differing depth
"""
res = []
for i in l:
if isinstance(i, list):
res = res + flatten_list(i)
else:
res = res + [i]
return res
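
# For example (illustrative): flatten_list([[1, 2], [7, 8, 9], 3]) returns
# [1, 2, 7, 8, 9, 3]; only elements that are themselves lists are descended into.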

DEPRECATION_MSG_1D = (
"Passing 1d arrays as data is deprecated in 0.17 and will "
"raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +86,86 @@ def transform(self, X):
X[X < feature_range[0]] = feature_range[0]

return X


# feels pretty gross to have to specify the categorical columns in the constructor
# even before the object is aware of the data it's operating on, but doesn't seem
# like the fit method is flexible enough to specify it there if we're going to
# use it in a pipeline. ugh.
class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
"""
Subsets features of an array treating categoricals as a group

Args:
max_features : int, float, string or None, optional (default=None)
The number of features to subset down to:
- If int, then subset to `max_features` features.
- If float, then `max_features` is a percentage and
`int(max_features * n_features)` features are used.
- If "auto", then `max_features=sqrt(n_features)`.
- If "sqrt", then `max_features=sqrt(n_features)`.
- If "log2", then `max_features=log2(n_features)`.
- If None, then `max_features=n_features`.

categoricals : list,
List of groups of column indices to be considered associated
with one another as categoricals. For instance [[1,2], [7,8,9]]
would mean columns 1 & 2 are associated as one categorical and
7, 8, and 9 are associated as a second one.

Attributes:
subset_indices : list,
Indices of the chosen subset of columns in the original array.

max_features_ : int,
The inferred value of max_features.
"""
def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
self.max_features = max_features
self.categoricals = categoricals
self.random_state = random_state
self.copy = copy

def _infer_max_features(self, num_features):
if isinstance(self.max_features, float):
return int(self.max_features*num_features)
elif isinstance(self.max_features, int):
return self.max_features
elif self.max_features in ['auto', 'sqrt']:
return int(sqrt(num_features))
elif self.max_features == 'log2':
return int(log(num_features, 2))
elif self.max_features is None:
return num_features
else:
raise ValueError('Invalid value for max_features: %s' % self.max_features)

def fit(self, X, y=None):
if isinstance(self.random_state, int):
random.seed(self.random_state)
elif isinstance(self.random_state, np.random.RandomState):
random.seed(self.random_state.randint(MAX_INT))

features = list(range(X.shape[1]))

all_cats = set(flatten_list(self.categoricals))
non_cats = set(features) - all_cats

# this will be a mixed list of column indices for non-categoricals
# and lists of indices for categoricals
distinct_features = list(non_cats) + self.categoricals

self.max_features_ = self._infer_max_features(len(distinct_features))
if self.max_features_ > len(distinct_features):
raise ValueError('Cannot subset to more features than there are distinct features: %s vs %s' % (
self.max_features_, len(distinct_features)))

self.subset_indices = sorted(flatten_list(
random.sample(distinct_features, self.max_features_)
))

return self

def transform(self, X):
X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
return X[:, self.subset_indices]
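

# A minimal usage sketch (illustrative only; the array shape and the
# categorical grouping below are assumptions, not part of this module):
#
#   import numpy as np
#   X = np.random.rand(50, 10)
#   sub = SubsetWithCategoricals(categoricals=[[1, 2], [7, 8, 9]],
#                                max_features='sqrt', random_state=0)
#   X_sub = sub.fit_transform(X)
#   # the distinct "features" are columns {0, 3, 4, 5, 6} plus the two groups,
#   # i.e. 7 in total, so max_features_ = int(sqrt(7)) = 2 of them are sampled;
#   # sub.subset_indices expands any sampled group back to its member columns
#   # and X_sub contains only those columns.
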
catwalk/model_trainers.py (15 additions, 2 deletions)
@@ -14,7 +14,9 @@
filename_friendly_hash, \
retrieve_model_id_from_hash, \
db_retry, \
save_db_objects
save_db_objects, \
bag_of_cats, \
find_cats

from results_schema import Model, FeatureImportance

@@ -38,6 +40,7 @@ def __init__(
model_storage_engine,
db_engine,
model_group_keys,
feature_config,
replace=True
):
self.project_path = project_path
@@ -46,6 +49,7 @@
self.db_engine = db_engine
self.sessionmaker = sessionmaker(bind=self.db_engine)
self.model_group_keys = model_group_keys
self.feature_config = feature_config
self.replace = replace

def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
module_name, class_name = class_path.rsplit(".", 1)
module = importlib.import_module(module_name)
cls = getattr(module, class_name)
instance = cls(**parameters)
y = matrix_store.labels()
model_params = parameters.copy() # copy since we may modify

# if using a classifier that samples respecting categoricals, detect the
# groups of categoricals and add them to the model parameter set
if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
cats_regex = bag_of_cats(self.feature_config)
categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
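# `categoricals` is expected to be a list of groups of matrix column indices
# (e.g. [[1, 2], [7, 8, 9]]), the format documented on CatInATreeClassifier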
model_params['categoricals'] = categoricals

instance = cls(**model_params)

return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns
