class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
    """
    Fit a decision tree with a subset of features that respects categoricals

    Internally this is a two-step pipeline: a `SubsetWithCategoricals`
    transformer chooses which columns to keep, then a
    `DecisionTreeClassifier` is trained on every kept column (the tree's own
    `max_features` is fixed at 1.0, so all subsetting happens in step one).

    Args:
        categoricals : list,
            List of groups of column indices to be considered associated
            with one another as categoricals. For instance [[1,2], [7,8,9]]
            would mean columns 1 & 2 are associated as one categorical and
            7, 8, and 9 are associated as a second one.

        max_features : int, float, string or None, optional (default='sqrt')
            Forwarded to `SubsetWithCategoricals`.

        random_state : optional (default=None)
            Forwarded to both pipeline steps each time `fit` is called.

        criterion, splitter, max_depth, min_samples_split, min_samples_leaf,
        min_weight_fraction_leaf, max_leaf_nodes, min_impurity_split,
        class_weight, presort :
            Forwarded unchanged to `DecisionTreeClassifier`.

    Attributes (available after `fit`):
        max_features_, subset_indices :
            Copied from the fitted `SubsetWithCategoricals` step.

        classes_, n_classes_, n_features_, n_outputs_, tree_ :
            Copied from the fitted decision tree step.

        feature_importances_ : list,
            One entry per column of the `X` given to `fit`; columns that
            were not part of the subset get 0.
    """

    # constructor arguments that are handed to the decision tree as-is
    _TREE_PARAMS = (
        'random_state', 'criterion', 'splitter', 'max_depth',
        'min_samples_split', 'min_samples_leaf', 'min_weight_fraction_leaf',
        'max_leaf_nodes', 'min_impurity_split', 'class_weight', 'presort',
    )

    def __init__(self,
                 categoricals,
                 max_features='sqrt',
                 random_state=None,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_leaf_nodes=None,
                 min_impurity_split=1e-07,
                 class_weight=None,
                 presort=False):

        self.categoricals = categoricals
        self.criterion = criterion
        self.splitter = splitter
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_split = min_impurity_split
        self.class_weight = class_weight
        self.presort = presort

        self.subset_cols = SubsetWithCategoricals(
            categoricals=categoricals, max_features=max_features, random_state=random_state
        )
        self.tree = DecisionTreeClassifier(
            criterion=criterion, splitter=splitter, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=1.0,
            random_state=random_state, max_leaf_nodes=max_leaf_nodes,
            min_impurity_split=min_impurity_split, class_weight=class_weight,
            presort=presort
        )

        self.pipeline = Pipeline([
            ('subset_cols', self.subset_cols),
            ('tree', self.tree)
        ])

    def _sync_step_params(self):
        """Copy this estimator's current parameters onto both pipeline steps."""
        self.pipeline.named_steps['subset_cols'].set_params(
            categoricals=self.categoricals,
            max_features=self.max_features,
            random_state=self.random_state
        )
        self.pipeline.named_steps['tree'].set_params(
            **{name: getattr(self, name) for name in self._TREE_PARAMS}
        )

    def fit(self, X, y):
        """
        Fit the subsetting step and the tree, then expose the fitted attributes.

        Returns:
            self
        """
        # The steps were built in the constructor, but parameters may have
        # been changed since then (an ensemble method can overwrite
        # random_state; set_params can overwrite anything), so refresh every
        # forwarded parameter right before fitting.
        self._sync_step_params()

        self.pipeline.fit(X, y)

        subset_step = self.pipeline.named_steps['subset_cols']
        tree_step = self.pipeline.named_steps['tree']

        self.max_features_ = subset_step.max_features_
        self.subset_indices = subset_step.subset_indices

        self.classes_ = tree_step.classes_
        self.n_classes_ = tree_step.n_classes_
        self.n_features_ = tree_step.n_features_
        self.n_outputs_ = tree_step.n_outputs_
        self.tree_ = tree_step.tree_

        # feature importances need to reference the full column set, but the
        # underlying tree was trained on the subset, so fill in the columns
        # that were left out with zeros
        importance_by_col = dict(zip(self.subset_indices, tree_step.feature_importances_))
        self.feature_importances_ = [
            importance_by_col.get(i, 0) for i in range(X.shape[1])
        ]

        return self

    def apply(self, X):
        return self.pipeline.apply(X)

    def decision_path(self, X):
        return self.pipeline.decision_path(X)

    def predict(self, X):
        return self.pipeline.predict(X)

    def predict_log_proba(self, X):
        return self.pipeline.predict_log_proba(X)

    def predict_proba(self, X):
        return self.pipeline.predict_proba(X)

    def score(self, X, y):
        return self.pipeline.score(X, y)
+ """ + def __init__(self, categoricals, max_features_tree='sqrt', random_state=None, + n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, + warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", + max_depth=None, min_samples_split=2, min_samples_leaf=1, + min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07, + class_weight=None, presort=False): + + # if isinstance(random_state, int): + # random.seed(random_state) + # elif isinstance(random_state, np.random.RandomState): + # random.seed(random_state.randint(MAX_INT)) + + # set up the base estimator as a CatInATreeClassifier() + self.base_estimator = CatInATreeClassifier( + categoricals=categoricals, max_features=max_features_tree, criterion=criterion, + splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, + max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split, + class_weight=class_weight, presort=presort + ) + + # Call the super-class's constructor + # Here, we force each tree to consider all features (without bootstrapping) + # as we'll handle the subsetting in the base estimator to have control over + # sampling categoricals. Also note that calling the BaggingClassifier + # constructor will set an object parameter `max_features`=1.0, so we've + # nammed the class parameter `max_features_tree` avoid collision. 
def flatten_list(l):
    """
    Simple utility to flatten a list down to one dimension even if the list
    contains elements of differing depth

    Only `list` instances are descended into; any other element (tuples
    included) is kept as a single item. The input is not modified.

    Args:
        l : list, possibly containing nested lists

    Returns:
        list, a new flat list with the elements in their original order
    """
    res = []
    for item in l:
        if isinstance(item, list):
            # extend/append grow the accumulator in place rather than
            # rebuilding it with `res = res + ...` for every element
            res.extend(flatten_list(item))
        else:
            res.append(item)
    return res
# The categorical column groups have to be supplied to the constructor, before
# the transformer has seen any data, because `fit` only receives X and y and
# so there is nowhere else to pass them when this is used inside a pipeline.
class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
    """
    Subsets features of an array treating categoricals as a group

    Each group of categorical columns counts as ONE feature when sampling:
    a group is either kept in full or dropped in full.

    Args:
        categoricals : list,
            List of groups of column indices to be considered associated
            with one another as categoricals. For instance [[1,2], [7,8,9]]
            would mean columns 1 & 2 are associated as one categorical and
            7, 8, and 9 are associated as a second one.

        max_features : int, float, string or None, optional (default='sqrt')
            The number of features to subset down to, where `n_features`
            is the number of distinct features (non-categorical columns
            plus categorical groups):
                - If int, then subset to `max_features` features.
                - If float, then `max_features` is a fraction and
                  `int(max_features * n_features)` features are used.
                - If "auto", then `max_features=sqrt(n_features)`.
                - If "sqrt", then `max_features=sqrt(n_features)`.
                - If "log2", then `max_features=log2(n_features)`.
                - If None, then `max_features=n_features`.
            Values computed from a float or a string are never less than 1.

        random_state : int, numpy RandomState or None, optional (default=None)
            Seed for the column sampling done in `fit`. Any other value
            leaves the sampling unseeded.

        copy : bool, optional (default=True)
            Passed to `check_array` in `transform`.

    Attributes:
        subset_indices : list,
            Indices of the chosen subset of columns in the original array.

        max_features_ : int,
            The inferred value of max_features.
    """
    def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
        self.max_features = max_features
        self.categoricals = categoricals
        self.random_state = random_state
        self.copy = copy

    def _infer_max_features(self, num_features):
        """Turn `self.max_features` into a concrete count of distinct features."""
        requested = self.max_features
        # Fractions and the sqrt/log2 rules can round down to zero, which
        # would select no columns at all, so those are floored at one.
        if isinstance(requested, float):
            return max(1, int(requested * num_features))
        elif isinstance(requested, (int, np.integer)):
            return int(requested)
        elif requested in ['auto', 'sqrt']:
            return max(1, int(sqrt(num_features)))
        elif requested == 'log2':
            return max(1, int(log(num_features, 2)))
        elif requested is None:
            return num_features
        else:
            raise ValueError('Invalid value for max_features: %s' % self.max_features)

    def _make_rng(self):
        """
        Return the object whose `sample` method `fit` should draw from.

        A seeded `random_state` gets its own private generator, so fitting
        never reseeds the process-wide `random` module. A private
        `random.Random(seed)` yields the same draws that `random.seed(seed)`
        followed by `random.sample` would.
        """
        seed = self.random_state
        if isinstance(seed, np.random.RandomState):
            return random.Random(int(seed.randint(MAX_INT)))
        if isinstance(seed, (int, np.integer)):
            return random.Random(int(seed))
        # unseeded: fall back to the shared module-level generator
        return random

    def fit(self, X, y=None):
        """
        Choose the columns to keep; stores `max_features_` and `subset_indices`.

        Raises:
            ValueError: if `max_features` is not a recognised value, or asks
                for more features than there are distinct features.
        """
        rng = self._make_rng()

        features = list(range(X.shape[1]))

        all_cats = set(flatten_list(self.categoricals))
        non_cats = set(features) - all_cats

        # this will be a mixed list of column indices for non-categoricals
        # and lists of indices for categoricals
        distinct_features = list(non_cats) + list(self.categoricals)

        self.max_features_ = self._infer_max_features(len(distinct_features))
        if self.max_features_ > len(distinct_features):
            raise ValueError('Cannot subset to more than distinct features: %s vs %s' % (
                self.max_features_, len(distinct_features)))

        self.subset_indices = sorted(flatten_list(
            rng.sample(distinct_features, self.max_features_)
        ))

        return self

    def transform(self, X):
        """Return only the columns chosen during `fit`."""
        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
        return X[:, self.subset_indices]
# Two methods for identifying and grouping categorical columns
def bag_of_cats(feature_config):
    """
    Parse a feature config to create regex patterns to match
    categorical columns. Note that this assumes there's no
    column name truncation

    Args:
        feature_config : iterable of dicts, each with keys 'prefix',
            'groups' and 'intervals', and optionally 'categoricals' (a
            list of dicts with keys 'column' and 'metrics').

    Returns:
        list of regex strings, one per combination of group, interval and
        metric for every categorical column, in config order.
    """
    cats_regex = []
    for fg in feature_config:
        prefix = fg['prefix']
        groups = fg['groups']
        intervals = fg['intervals']
        for cat in fg.get('categoricals', []):
            col = cat['column']
            for group, interval, metric in product(groups, intervals, cat['metrics']):
                # the (.*) stands in for the categorical's value
                cats_regex.append(r'^%s_%s_%s_%s_(.*)_%s$' % (
                    prefix, group, interval, col, metric
                ))

    return cats_regex


# assumes no column name truncation!!
def find_cats(matrix_cols, cats_regex, exclude_cols=None):
    """
    Assign matrix columns (by their numerical indices) to groups
    of categoricals based on matching to a regex pattern

    Note that groupings of imputed columns along with their
    underlying columns will be included in the returned result
    as well.

    Args:
        matrix_cols : iterable of column names.
        cats_regex : list of regex strings for the categoricals. The list
            is left unmodified.
        exclude_cols : optional list of column names that are not features
            (default: 'entity_id', 'as_of_date' and 'outcome').

    Returns:
        list of lists of column indices, one list per pattern that matched
        at least one column. Indices are positions among the feature
        columns, i.e. counted after `exclude_cols` have been removed.
    """

    # be sure we exclude entity id, date, and label
    if exclude_cols is None:
        exclude_cols = ['entity_id', 'as_of_date', 'outcome']
    excluded = set(exclude_cols)
    feature_cols = [c for c in matrix_cols if c not in excluded]

    # add in regex to make sure imputed flags always come along with
    # their reference columns; the column name is escaped so that it is
    # matched literally
    # TODO: maybe return these as a separate list to allow models to
    # treat them differently than categoricals.
    imp_regex = [
        r'^%s(_imp)?$' % re.escape(col[:-4])
        for col in matrix_cols if col.endswith('_imp')
    ]

    # work on a fresh list so the caller's cats_regex is never extended
    all_regex = list(cats_regex) + imp_regex

    # We want the sets of numerical indices of columns that match our
    # categorical patterns, so loop through the column names then through
    # the patterns, checking each one for a match. Here, `cats_dict`
    # will act as a collector to hold the matches associated with each
    # pattern. Note that if a column matches two patterns, it will get
    # assigned to the first categorical that matches, though this
    # shouldn't happen if the regex is matching the full string...
    cats_dict = {regex: [] for regex in all_regex}
    compiled = [(regex, re.compile(regex)) for regex in all_regex]
    for i, fc in enumerate(feature_cols):
        for regex, pattern in compiled:
            if pattern.match(fc) is not None:
                cats_dict[regex].append(i)
                break

    # collapse the dict into a list of lists to return
    return [matches for matches in cats_dict.values() if matches]
'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + # random seed 0 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=0 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 0., 1., 0., 1., 0.], + [ 1., 0., 0., 1., 0.], + [ 0., 0., 1., 0., 1.], + [ 0., 0., 0., 1., 0.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [0, 1, 2, 6, 7] + + # random seed 1 + sc = SubsetWithCategoricals( + categoricals=[[0, 1, 2], [6, 7]], + random_state=1 + ) + + samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values) + + assert np.all(samp == np.array([ + [ 12., 3.], + [ 7., 1.], + [ 0., 4.], + [ 2., 1.] + ])) + assert sc.max_features_ == 2 + assert sc.subset_indices == [3,4] + +def test_cat_in_a_tree(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInATreeClassifier(categoricals=categoricals, max_features=7, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.max_features_ == 7 + assert clf.subset_indices == [0, 7, 8, 9, 10, 11, 12, 16, 19, 21, 27] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... 
+ assert round(sum([p[1] for p in pred])) == 102 + + +def test_cat_in_a_forest(data): + # just for the purposes of testing, assuming several of the columns are categoricals + categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]] + + clf = CatInAForestClassifier(categoricals=categoricals, max_features_tree=7, n_estimators=3, random_state=12345) + clf.fit(data['X_train'], data['y_train']) + + assert clf.estimators_[0].max_features_ == 7 + assert clf.estimators_[0].subset_indices == [0, 1, 6, 12, 13, 14, 18, 22, 23, 24, 25] + assert clf.estimators_[1].subset_indices == [0, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 21] + assert clf.estimators_[2].subset_indices == [0, 2, 3, 4, 12, 13, 14, 15, 27, 28] + + pred = clf.predict_proba(data['X_test']) + assert len(pred) == len(data['y_test']) + # specific to the breast cancer data... + # even with + assert round(sum([p[1] for p in pred])) == 108 diff --git a/tests/test_integration.py b/tests/test_integration.py index a64d26a..8edae4c 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -77,7 +77,8 @@ def test_integration(): experiment_hash=experiment_hash, model_storage_engine=model_storage_engine, db_engine=db_engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) predictor = Predictor( project_path, diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py index 60e61a3..0b83302 100644 --- a/tests/test_model_trainers.py +++ b/tests/test_model_trainers.py @@ -57,7 +57,8 @@ def test_model_trainer(): experiment_hash=None, model_storage_engine=model_storage_engine, db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix_store = InMemoryMatrixStore(matrix, metadata) model_ids = trainer.train_models( @@ -136,6 +137,7 @@ def test_model_trainer(): model_storage_engine=model_storage_engine, db_engine=engine, 
model_group_keys=['label_name', 'label_window'], + feature_config=[], replace=True ) new_model_ids = trainer.train_models( @@ -180,6 +182,119 @@ def test_model_trainer(): sorted([model_id for model_id in new_model_ids]) +def test_model_trainer_categoricals(): + with testing.postgresql.Postgresql() as postgresql: + engine = create_engine(postgresql.url()) + ensure_db(engine) + + grid_config = { + 'catwalk.estimators.classifiers.CatInAForestClassifier': { + 'max_features_tree': [3], + 'n_estimators': [3], + 'random_state': [2193] + } + } + + with mock_s3(): + s3_conn = boto3.resource('s3') + s3_conn.create_bucket(Bucket='econ-dev') + + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['sum']}, + {'quantity': 'a2', 'metrics': ['max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']} + ], + 'intervals': ['1y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'aggregates': [ + {'quantity': 'a3', 'metrics': ['sum']} + ], + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']} + ], + 'intervals': ['10y'], + 'groups': ['entity_id'] + } + ] + + # create training set + matrix = pandas.DataFrame.from_dict({ + 'entity_id': [1,2,3,4], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + matrix = matrix[['entity_id', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 
'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + metadata = { + 'beginning_of_time': datetime.date(2012, 12, 20), + 'end_time': datetime.date(2016, 12, 20), + 'label_name': 'outcome', + 'label_window': '1y', + 'metta-uuid': '1234', + 'feature_names': ['first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum' + ], + 'indices': ['entity_id'], + } + project_path = 'econ-dev/inspections' + model_storage_engine = S3ModelStorageEngine(s3_conn, project_path) + trainer = ModelTrainer( + project_path=project_path, + experiment_hash=None, + model_storage_engine=model_storage_engine, + db_engine=engine, + model_group_keys=['label_name', 'label_window'], + feature_config=feature_config + ) + matrix_store = InMemoryMatrixStore(matrix, metadata) + model_ids = trainer.train_models( + grid_config=grid_config, + misc_db_parameters=dict(), + matrix_store=matrix_store + ) + + # assert categoricals were properly detected and passed to model + records = [ + row for row in + engine.execute('select model_hash from results.models') + ] + + cache_keys = [ + model_cache_key(project_path, model_row[0], s3_conn) + for model_row in records + ] + + model_pickles = [ + pickle.loads(cache_key.get()['Body'].read()) + for cache_key in cache_keys + ] + + assert sorted([sorted(c) for c in model_pickles[0].categoricals]) == [[0, 1, 2], [6, 7]] + + def test_n_jobs_not_new_model(): grid_config = { 'sklearn.ensemble.AdaBoostClassifier': { @@ -205,7 +320,8 @@ def test_n_jobs_not_new_model(): experiment_hash=None, model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -264,7 +380,8 @@ def 
test_retry_max(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ @@ -309,7 +426,8 @@ def test_retry_recovery(self): experiment_hash=None, model_storage_engine=InMemoryModelStorageEngine(project_path=''), db_engine=engine, - model_group_keys=['label_name', 'label_window'] + model_group_keys=['label_name', 'label_window'], + feature_config=[] ) matrix = pandas.DataFrame.from_dict({ diff --git a/tests/test_utils.py b/tests/test_utils.py index aecf051..6018caa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,12 +1,14 @@ from catwalk.utils import filename_friendly_hash, \ save_experiment_and_get_hash, \ - sort_predictions_and_labels + sort_predictions_and_labels, \ + bag_of_cats, find_cats from catwalk.db import ensure_db from sqlalchemy import create_engine import testing.postgresql import datetime import logging import re +import pandas as pd def test_filename_friendly_hash(): @@ -104,3 +106,81 @@ def test_sort_predictions_and_labels(): ) assert sorted_predictions == (0.6, 0.5, 0.5, 0.4) assert sorted_labels == (True, False, True, False) + +def test_bag_of_cats(): + feature_config = [ + { + 'prefix': 'first', + 'aggregates': [ + {'quantity': 'a1', 'metrics': ['min', 'max']} + ], + 'categoricals': [ + {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']}, + {'column': 'c2', 'choices': ['up', 'down'], 'metrics': ['sum', 'max']} + ], + 'intervals': ['1y', '5y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'second', + 'categoricals': [ + {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']}, + {'column': 'c4', 'choices': ['three', 'four'], 'metrics': ['max']} + ], + 'intervals': ['1y', '10y'], + 'groups': ['entity_id'] + }, + { + 'prefix': 'third', + 'aggregates': [ + {'quantity': 'a2', 
'metrics': ['min', 'max']} + ], + 'intervals': ['6month'], + 'groups': ['entity_id'] + } + ] + + cat_regex = set(bag_of_cats(feature_config)) + + assert cat_regex == set([ + r'^first_entity_id_1y_c1_(.*)_min$', r'^first_entity_id_5y_c1_(.*)_min$', + r'^first_entity_id_1y_c2_(.*)_sum$', r'^first_entity_id_1y_c2_(.*)_max$', + r'^first_entity_id_5y_c2_(.*)_sum$', r'^first_entity_id_5y_c2_(.*)_max$', + r'^second_entity_id_1y_c3_(.*)_sum$', r'^second_entity_id_10y_c3_(.*)_sum$', + r'^second_entity_id_1y_c4_(.*)_max$', r'^second_entity_id_10y_c4_(.*)_max$' + ]) + +def test_find_cats(): + # test with just categoricals + cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$'] + df = pd.DataFrame({ + 'entity_id': [1,2,3,4], + 'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'], + 'first_entity_id_1y_c1_top_min': [0,1,0,0], + 'first_entity_id_1y_c1_bottom_min': [1,0,0,0], + 'first_entity_id_1y_c1__NULL_min': [0,0,1,0], + 'first_entity_id_1y_a1_sum': [12,7,0,2], + 'first_entity_id_1y_a2_max': [3,1,4,1], + 'second_entity_id_10y_a3_sum': [5,9,2,6], + 'second_entity_id_10y_c3_one_sum': [1,1,0,1], + 'second_entity_id_10y_c3_two_sum': [0,0,1,0], + 'outcome': [0,1,0,0] + }) + # ensure column order + df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', + 'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min', + 'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max', + 'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum', + 'second_entity_id_10y_c3_two_sum', 'outcome' + ]] + + cat_cols = find_cats(df.columns.values, cat_regex) + + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]] + + # test with categoricals and imputed flags + df['first_entity_id_1y_a1_sum_imp'] = [0,0,0,1] + df['second_entity_id_10y_a3_sum_imp'] = [0,1,0,1] + + cat_cols = find_cats(df.columns.values, cat_regex) + assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [3,8], [5,9], [6, 7]]