diff --git a/catwalk/estimators/classifiers.py b/catwalk/estimators/classifiers.py
index 0ee7cf0..ad17284 100644
--- a/catwalk/estimators/classifiers.py
+++ b/catwalk/estimators/classifiers.py
@@ -4,8 +4,15 @@
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import MinMaxScaler
 from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import BaggingClassifier
 
-from catwalk.estimators.transformers import CutOff
+from catwalk.estimators.transformers import CutOff, SubsetWithCategoricals
+
+import numpy as np
+import random
+
+MAX_INT = np.iinfo(np.int32).max
 
 class ScaledLogisticRegression(BaseEstimator, ClassifierMixin):
     """
@@ -76,3 +83,170 @@ def predict(self, X):
 
     def score(self, X, y):
         return self.pipeline.score(X,y)
+
+
+class CatInATreeClassifier(BaseEstimator, ClassifierMixin):
+    """
+    Fit a decision tree with a subset of features that respects categoricals
+
+    Args:
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+    """
+    def __init__(self,
+                 categoricals,
+                 max_features='sqrt',
+                 random_state=None,
+                 criterion="gini",
+                 splitter="best",
+                 max_depth=None,
+                 min_samples_split=2,
+                 min_samples_leaf=1,
+                 min_weight_fraction_leaf=0.,
+                 max_leaf_nodes=None,
+                 min_impurity_split=1e-07,
+                 class_weight=None,
+                 presort=False):
+
+        self.categoricals = categoricals
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_features = max_features
+        self.random_state = random_state
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
+
+        self.subset_cols = SubsetWithCategoricals(
+            categoricals=categoricals, max_features=max_features, random_state=random_state
+        )
+        self.tree = DecisionTreeClassifier(
+            criterion=criterion, splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split,
+            min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,
+            max_features=1.0, random_state=random_state, max_leaf_nodes=max_leaf_nodes,
+            min_impurity_split=min_impurity_split, class_weight=class_weight, presort=presort
+        )
+
+        self.pipeline = Pipeline([
+            ('subset_cols', self.subset_cols),
+            ('tree', self.tree)
+        ])
+
+    def fit(self, X, y):
+
+        # set the underlying random states before fitting
+        # doing this here rather than in the constructor because self.random_state might
+        # have been modified by an ensemble method
+        self.pipeline.named_steps['subset_cols'].set_params(random_state=self.random_state)
+        self.pipeline.named_steps['tree'].set_params(random_state=self.random_state)
+
+        self.pipeline.fit(X, y)
+
+        self.max_features_ = self.pipeline.named_steps['subset_cols'].max_features_
+        self.subset_indices = self.pipeline.named_steps['subset_cols'].subset_indices
+
+        self.classes_ = self.pipeline.named_steps['tree'].classes_
+        self.n_classes_ = self.pipeline.named_steps['tree'].n_classes_
+        self.n_features_ = self.pipeline.named_steps['tree'].n_features_
+        self.n_outputs_ = self.pipeline.named_steps['tree'].n_outputs_
+        self.tree_ = self.pipeline.named_steps['tree'].tree_
+
+        # feature importances need to reference full column set but underlying tree
+        # was trained on the subset, so fill in others with zeros
+        fi = self.pipeline.named_steps['tree'].feature_importances_
+        fi_dict = dict(zip(self.subset_indices, fi))
+        fi_full = []
+        for i in range(X.shape[1]):
+            fi_full.append(fi_dict.get(i, 0))
+        self.feature_importances_ = fi_full
+
+        return self
+
+    def apply(self, X):
+        return self.pipeline.apply(X)
+
+    def decision_path(self, X):
+        return self.pipeline.decision_path(X)
+
+    def predict(self, X):
+        return self.pipeline.predict(X)
+
+    def predict_log_proba(self, X):
+        return self.pipeline.predict_log_proba(X)
+
+    def predict_proba(self, X):
+        return self.pipeline.predict_proba(X)
+
+    def score(self, X, y):
+        return self.pipeline.score(X, y)
+
+
+class CatInAForestClassifier(BaggingClassifier):
+    """
+    Bagged classifier using CatInATreeClassifiers as estimators.
+    Note that `max_features_tree` controls the underlying per-tree
+    subsetting and that the bagging classifier will use all selected
+    features for each tree with no option for feature bootstrapping.
+    """
+    def __init__(self, categoricals, max_features_tree='sqrt', random_state=None,
+        n_estimators=10, max_samples=1.0, bootstrap=True, oob_score=False, 
+        warm_start=False, n_jobs=1, verbose=0, criterion="gini", splitter="best", 
+        max_depth=None, min_samples_split=2, min_samples_leaf=1, 
+        min_weight_fraction_leaf=0., max_leaf_nodes=None, min_impurity_split=1e-07, 
+        class_weight=None, presort=False):
+
+        # if isinstance(random_state, int):
+        #     random.seed(random_state)
+        # elif isinstance(random_state, np.random.RandomState):
+        #     random.seed(random_state.randint(MAX_INT))
+
+        # set up the base estimator as a CatInATreeClassifier()
+        self.base_estimator = CatInATreeClassifier(
+            categoricals=categoricals, max_features=max_features_tree, criterion=criterion, 
+            splitter=splitter, max_depth=max_depth, min_samples_split=min_samples_split, 
+            min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, 
+            max_leaf_nodes=max_leaf_nodes, min_impurity_split=min_impurity_split, 
+            class_weight=class_weight, presort=presort
+            )
+
+        # Call the super-class's constructor
+        # Here, we force each tree to consider all features (without bootstrapping)
+        # as we'll handle the subsetting in the base estimator to have control over
+        # sampling categoricals. Also note that calling the BaggingClassifier
+        # constructor will set an object parameter `max_features`=1.0, so we've
+        # named the class parameter `max_features_tree` to avoid a collision.
+        BaggingClassifier.__init__(
+            self, 
+            base_estimator=self.base_estimator,
+            n_estimators=n_estimators,
+            max_samples=max_samples,
+            max_features=1.0,
+            bootstrap=bootstrap,
+            bootstrap_features=False,
+            oob_score=oob_score,
+            warm_start=warm_start,
+            n_jobs=n_jobs,
+            random_state=random_state,
+            verbose=verbose
+            )
+
+        self.categoricals = categoricals
+        self.max_features_tree = max_features_tree
+        self.criterion = criterion
+        self.splitter = splitter
+        self.max_depth = max_depth
+        self.min_samples_split = min_samples_split
+        self.min_samples_leaf = min_samples_leaf
+        self.min_weight_fraction_leaf = min_weight_fraction_leaf
+        self.max_leaf_nodes = max_leaf_nodes
+        self.min_impurity_split = min_impurity_split
+        self.class_weight = class_weight
+        self.presort = presort
diff --git a/catwalk/estimators/transformers.py b/catwalk/estimators/transformers.py
index bcf75c2..5eb2f5f 100644
--- a/catwalk/estimators/transformers.py
+++ b/catwalk/estimators/transformers.py
@@ -3,11 +3,28 @@
 import warnings
 
 import numpy as np
+from math import log, sqrt
+import random
 
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import check_array
 from sklearn.utils.validation import FLOAT_DTYPES
 
+MAX_INT = np.iinfo(np.int32).max
+
+def flatten_list(l):
+    """
+    Simple utility to flatten a list down to one dimension even if the list
+    contains elements of differing depth
+    """
+    res = []
+    for i in l:
+        if isinstance(i, list):
+            res = res + flatten_list(i)
+        else:
+            res = res + [i]
+    return res
+
 DEPRECATION_MSG_1D = (
     "Passing 1d arrays as data is deprecated in 0.17 and will "
     "raise ValueError in 0.19. Reshape your data either using "
@@ -69,3 +86,86 @@ def transform(self, X):
         X[X < feature_range[0]] = feature_range[0]
 
         return X
+
+
+# feels pretty gross to have to specify the categorical columns in the constructor
+# even before the object is aware of the data it's operating on, but doesn't seem
+# like the fit method is flexible enough to specify it there if we're going to 
+# use it in a pipeline. ugh.
+class SubsetWithCategoricals(BaseEstimator, TransformerMixin):
+    """
+    Subsets features of an array treating categoricals as a group
+
+    Args:
+        max_features : int, float, string or None, optional (default='sqrt')
+            The number of distinct features (a categorical group counts once) to keep:
+                - If int, then subset to `max_features` features.
+                - If float, then `max_features` is a percentage and
+                  `int(max_features * n_features)` features are used.
+                - If "auto", then `max_features=sqrt(n_features)`.
+                - If "sqrt", then `max_features=sqrt(n_features)`.
+                - If "log2", then `max_features=log2(n_features)`.
+                - If None, then `max_features=n_features`.
+
+        categoricals : list,
+            List of groups of column indices to be considered associated
+            with one another as categoricals. For instance [[1,2], [7,8,9]]
+            would mean columns 1 & 2 are associated as one categorical and
+            7, 8, and 9 are associated as a second one.
+
+    Attributes:
+        subset_indices : list,
+            Indices of the chosen subset of columns in the original array.
+
+        max_features_ : int,
+            The inferred value of max_features.
+    """
+    def __init__(self, categoricals, max_features='sqrt', random_state=None, copy=True):
+        self.max_features = max_features
+        self.categoricals = categoricals
+        self.random_state = random_state
+        self.copy = copy
+
+    def _infer_max_features(self, num_features):
+        if isinstance(self.max_features, float):
+            return int(self.max_features*num_features)
+        elif isinstance(self.max_features, int):
+            return self.max_features
+        elif self.max_features in ['auto', 'sqrt']:
+            return int(sqrt(num_features))
+        elif self.max_features == 'log2':
+            return int(log(num_features, 2))
+        elif self.max_features is None:
+            return num_features
+        else:
+            raise ValueError('Invalid value for max_features: %s' % self.max_features)
+
+    def fit(self, X, y=None):
+        if isinstance(self.random_state, int):
+            random.seed(self.random_state)
+        elif isinstance(self.random_state, np.random.RandomState):
+            random.seed(self.random_state.randint(MAX_INT))
+
+        features = list(range(X.shape[1]))
+
+        all_cats = set(flatten_list(self.categoricals))
+        non_cats = set(features) - all_cats
+
+        # this will be a mixed list of column indices for non-categoricals
+        # and lists of indices for categoricals
+        distinct_features = list(non_cats) + self.categoricals
+
+        self.max_features_ = self._infer_max_features(len(distinct_features))
+        if self.max_features_ > len(distinct_features):
+            raise ValueError('Cannot subset to more than distinct features: %s vs %s' % (
+                self.max_features_, len(distinct_features)))
+
+        self.subset_indices = sorted(flatten_list(
+            random.sample(distinct_features, self.max_features_)
+        ))
+
+        return self
+
+    def transform(self, X):
+        X = check_array(X, copy=self.copy, ensure_2d=False, dtype=FLOAT_DTYPES)
+        return X[:, self.subset_indices]
diff --git a/catwalk/model_trainers.py b/catwalk/model_trainers.py
index fb277eb..8d8c991 100644
--- a/catwalk/model_trainers.py
+++ b/catwalk/model_trainers.py
@@ -14,7 +14,9 @@
     filename_friendly_hash, \
     retrieve_model_id_from_hash, \
     db_retry, \
-    save_db_objects
+    save_db_objects, \
+    bag_of_cats, \
+    find_cats
 
 from results_schema import Model, FeatureImportance
 
@@ -38,6 +40,7 @@ def __init__(
         model_storage_engine,
         db_engine,
         model_group_keys,
+        feature_config,
         replace=True
     ):
         self.project_path = project_path
@@ -46,6 +49,7 @@ def __init__(
         self.db_engine = db_engine
         self.sessionmaker = sessionmaker(bind=self.db_engine)
         self.model_group_keys = model_group_keys
+        self.feature_config = feature_config
         self.replace = replace
 
     def unique_parameters(self, parameters):
@@ -101,8 +105,17 @@ def _train(self, matrix_store, class_path, parameters):
         module_name, class_name = class_path.rsplit(".", 1)
         module = importlib.import_module(module_name)
         cls = getattr(module, class_name)
-        instance = cls(**parameters)
         y = matrix_store.labels()
+        model_params = parameters.copy() # copy since we may modify
+
+        # if using a classifier that samples respecting categoricals, detect the
+        # groups of categoricals and add them to the model parameter set
+        if class_name in ['CatInATreeClassifier', 'CatInAForestClassifier']:
+            cats_regex = bag_of_cats(self.feature_config)
+            categoricals = find_cats(matrix_store.matrix.columns.values, cats_regex)
+            model_params['categoricals'] = categoricals
+
+        instance = cls(**model_params)
 
         return instance.fit(matrix_store.matrix, y), matrix_store.matrix.columns
 
diff --git a/catwalk/utils.py b/catwalk/utils.py
index 1a3233f..c4e59a5 100644
--- a/catwalk/utils.py
+++ b/catwalk/utils.py
@@ -13,6 +13,8 @@
 import sqlalchemy
 import csv
 import postgres_copy
+from itertools import product
+import re
 
 
 def split_s3_path(path):
@@ -199,3 +201,74 @@ def save_db_objects(db_engine, db_objects):
             ])
         f.seek(0)
         postgres_copy.copy_from(f, type(db_objects[0]), db_engine, format='csv')
+
+
+# Two methods for identifying and grouping categorical columns
+def bag_of_cats(feature_config):
+    """
+    Parse a feature config to create regex patterns to match
+    categorical columns. Note that this assumes there's no
+    column name truncation
+    """
+    cats_regex = []
+    for fg in feature_config:
+        prefix = fg['prefix']
+        groups = fg['groups']
+        intervals = fg['intervals']
+        cats = fg.get('categoricals', [])
+        for cat in cats:
+            col = cat['column']
+            metrics = cat['metrics']
+
+            for group, interval, metric in product(
+                groups, intervals, metrics
+                ):
+                cats_regex.append(r'^%s_%s_%s_%s_(.*)_%s$' % (
+                    prefix, group, interval, col, metric
+                ))
+
+    return cats_regex
+
+
+# assumes no column name truncation!!
+def find_cats(matrix_cols, cats_regex, exclude_cols=None):
+    """
+    Assign matrix columns (by their positional indices once `exclude_cols`
+    are dropped) to groups of categoricals based on matching to a regex pattern
+
+    Note that groupings of imputed columns along with their
+    underlying columns will be included in the returned result
+    as well.
+    """
+
+    # be sure we exclude entity id, date, and label
+    if exclude_cols is None:
+        exclude_cols = ['entity_id', 'as_of_date', 'outcome']
+    feature_cols = [c for c in matrix_cols if c not in exclude_cols]
+
+    # add in regex to make sure imputed flags always come along with
+    # their reference columns
+    # TODO: maybe return these as a separate list to allow models to
+    #       treat them differently than categoricals.
+    imp_regex = [
+        r'^%s(_imp)?$' % col[:-4] for col in matrix_cols if col[-4:] == '_imp'
+    ]
+    cats_regex += imp_regex
+
+    # We want the sets of numerical indices of columns that match our
+    # categorical patterns, so loop through the column names then through
+    # the patterns, checking each one for a match. Here, `cats_dict`
+    # will act as a collector to hold the matches associated with each
+    # pattern. Note that if a column matches two patterns, it will get 
+    # assigned to the first categorical that matches, though this 
+    # shouldn't happen if the regex is matching the full string...
+    cats_dict = {r:[] for r in cats_regex}
+    for i, fc in enumerate(feature_cols):
+        for regex in cats_regex:
+            m = re.match(regex, fc)
+            if m is not None:
+                cats_dict[regex].append(i)
+                break
+
+    # collapse the dict into a list of lists to return
+    return [v for v in cats_dict.values() if len(v) > 0]
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
index 47dea4b..86ddc2b 100644
--- a/tests/test_estimators.py
+++ b/tests/test_estimators.py
@@ -1,11 +1,14 @@
 import numpy as np
+import pandas as pd
 
 import warnings
 
 import pytest
 
-from catwalk.estimators.transformers import CutOff
-from catwalk.estimators.classifiers import ScaledLogisticRegression
+from catwalk.estimators.transformers import CutOff, \
+    SubsetWithCategoricals, flatten_list
+from catwalk.estimators.classifiers import ScaledLogisticRegression, \
+    CatInATreeClassifier, CatInAForestClassifier
 
 from sklearn import linear_model
 
@@ -74,3 +77,99 @@ def test_dsapp_lr(data):
     pipeline.fit(data['X_train'], data['y_train'])
 
     assert np.all(dsapp_lr.predict(data['X_test']) == pipeline.predict(data['X_test']))
+
+def test_flatten_list():
+    assert flatten_list([1, [2,3], [4, [5]], [], 6]) == [1,2,3,4,5,6]
+    assert flatten_list([]) == []
+    assert flatten_list([1,2,3]) == [1,2,3]
+    assert flatten_list([[1,2]]) == [1,2]
+
+def test_subset_with_categoricals():
+    df = pd.DataFrame({
+        'entity_id': [1,2,3,4],
+        'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'],
+        'first_entity_id_1y_c1_top_min': [0,1,0,0],
+        'first_entity_id_1y_c1_bottom_min': [1,0,0,0],
+        'first_entity_id_1y_c1__NULL_min': [0,0,1,0],
+        'first_entity_id_1y_a1_sum': [12,7,0,2],
+        'first_entity_id_1y_a2_max': [3,1,4,1],
+        'second_entity_id_10y_a3_sum': [5,9,2,6],
+        'second_entity_id_10y_c3_one_sum': [1,1,0,1],
+        'second_entity_id_10y_c3_two_sum': [0,0,1,0],
+        'outcome': [0,1,0,0]
+        })
+    # ensure column order
+    df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', 
+             'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min',
+             'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max',
+             'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum',
+             'second_entity_id_10y_c3_two_sum', 'outcome'
+    ]]
+
+    # random seed 0
+    sc = SubsetWithCategoricals(
+            categoricals=[[0, 1, 2], [6, 7]],
+            random_state=0
+        )
+
+    samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values)
+
+    assert np.all(samp == np.array([  
+            [ 0.,  1.,  0.,  1.,  0.],
+            [ 1.,  0.,  0.,  1.,  0.],
+            [ 0.,  0.,  1.,  0.,  1.],
+            [ 0.,  0.,  0.,  1.,  0.]
+    ]))
+    assert sc.max_features_ == 2
+    assert sc.subset_indices == [0, 1, 2, 6, 7]
+
+    # random seed 1
+    sc = SubsetWithCategoricals(
+            categoricals=[[0, 1, 2], [6, 7]],
+            random_state=1
+        )
+
+    samp = sc.fit_transform(df.drop(['entity_id', 'as_of_date', 'outcome'], axis=1).values)
+
+    assert np.all(samp == np.array([
+            [ 12.,   3.],
+            [  7.,   1.],
+            [  0.,   4.],
+            [  2.,   1.]
+    ]))
+    assert sc.max_features_ == 2
+    assert sc.subset_indices == [3,4]
+
+def test_cat_in_a_tree(data):
+    # just for the purposes of testing, assuming several of the columns are categoricals
+    categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]]
+
+    clf = CatInATreeClassifier(categoricals=categoricals, max_features=7, random_state=12345)
+    clf.fit(data['X_train'], data['y_train'])
+
+    assert clf.max_features_ == 7
+    assert clf.subset_indices == [0, 7, 8, 9, 10, 11, 12, 16, 19, 21, 27]
+    
+    pred = clf.predict_proba(data['X_test'])
+    assert len(pred) == len(data['y_test'])
+    # specific to the breast cancer data...
+    assert round(sum([p[1] for p in pred])) == 102
+
+
+def test_cat_in_a_forest(data):
+    # just for the purposes of testing, assuming several of the columns are categoricals
+    categoricals = [[2,3,4], [7,8,9,10,11], [13,14], [22,23,24,25]]
+
+    clf = CatInAForestClassifier(categoricals=categoricals, max_features_tree=7, n_estimators=3, random_state=12345)
+    clf.fit(data['X_train'], data['y_train'])
+
+    assert clf.estimators_[0].max_features_ == 7
+    assert clf.estimators_[0].subset_indices == [0, 1, 6, 12, 13, 14, 18, 22, 23, 24, 25]
+    assert clf.estimators_[1].subset_indices == [0, 7, 8, 9, 10, 11, 12, 13, 14, 16, 18, 21]
+    assert clf.estimators_[2].subset_indices == [0, 2, 3, 4, 12, 13, 14, 15, 27, 28]
+    
+    pred = clf.predict_proba(data['X_test'])
+    assert len(pred) == len(data['y_test'])
+    # specific to the breast cancer data and to the random_state and
+    # n_estimators passed above...
+    assert round(sum([p[1] for p in pred])) == 108
diff --git a/tests/test_integration.py b/tests/test_integration.py
index a64d26a..8edae4c 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -77,7 +77,8 @@ def test_integration():
                 experiment_hash=experiment_hash,
                 model_storage_engine=model_storage_engine,
                 db_engine=db_engine,
-                model_group_keys=['label_name', 'label_window']
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=[]
             )
             predictor = Predictor(
                 project_path,
diff --git a/tests/test_model_trainers.py b/tests/test_model_trainers.py
index 60e61a3..0b83302 100644
--- a/tests/test_model_trainers.py
+++ b/tests/test_model_trainers.py
@@ -57,7 +57,8 @@ def test_model_trainer():
                 experiment_hash=None,
                 model_storage_engine=model_storage_engine,
                 db_engine=engine,
-                model_group_keys=['label_name', 'label_window']
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=[]
             )
             matrix_store = InMemoryMatrixStore(matrix, metadata)
             model_ids = trainer.train_models(
@@ -136,6 +137,7 @@ def test_model_trainer():
                 model_storage_engine=model_storage_engine,
                 db_engine=engine,
                 model_group_keys=['label_name', 'label_window'],
+                feature_config=[],
                 replace=True
             )
             new_model_ids = trainer.train_models(
@@ -180,6 +182,119 @@ def test_model_trainer():
                 sorted([model_id for model_id in new_model_ids])
 
 
+def test_model_trainer_categoricals():
+    with testing.postgresql.Postgresql() as postgresql:
+        engine = create_engine(postgresql.url())
+        ensure_db(engine)
+
+        grid_config = {
+            'catwalk.estimators.classifiers.CatInAForestClassifier': {
+                'max_features_tree': [3],
+                'n_estimators': [3],
+                'random_state': [2193]
+            }
+        }
+
+        with mock_s3():
+            s3_conn = boto3.resource('s3')
+            s3_conn.create_bucket(Bucket='econ-dev')
+
+            feature_config = [
+                {
+                    'prefix': 'first',
+                    'aggregates': [
+                        {'quantity': 'a1', 'metrics': ['sum']},
+                        {'quantity': 'a2', 'metrics': ['max']}
+                    ],
+                    'categoricals': [
+                        {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']}
+                    ],
+                    'intervals': ['1y'],
+                    'groups': ['entity_id']
+                },
+                {
+                    'prefix': 'second',
+                    'aggregates': [
+                        {'quantity': 'a3', 'metrics': ['sum']}
+                    ],
+                    'categoricals': [
+                        {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']}
+                    ],
+                    'intervals': ['10y'],
+                    'groups': ['entity_id']
+                }
+            ]
+
+            # create training set
+            matrix = pandas.DataFrame.from_dict({
+                'entity_id': [1,2,3,4],
+                'first_entity_id_1y_c1_top_min': [0,1,0,0],
+                'first_entity_id_1y_c1_bottom_min': [1,0,0,0],
+                'first_entity_id_1y_c1__NULL_min': [0,0,1,0],
+                'first_entity_id_1y_a1_sum': [12,7,0,2],
+                'first_entity_id_1y_a2_max': [3,1,4,1],
+                'second_entity_id_10y_a3_sum': [5,9,2,6],
+                'second_entity_id_10y_c3_one_sum': [1,1,0,1],
+                'second_entity_id_10y_c3_two_sum': [0,0,1,0],
+                'outcome': [0,1,0,0]
+            })
+            # ensure column order
+            matrix = matrix[['entity_id', 'first_entity_id_1y_c1_top_min', 
+                     'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min',
+                     'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max',
+                     'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum',
+                     'second_entity_id_10y_c3_two_sum', 'outcome'
+            ]]
+            metadata = {
+                'beginning_of_time': datetime.date(2012, 12, 20),
+                'end_time': datetime.date(2016, 12, 20),
+                'label_name': 'outcome',
+                'label_window': '1y',
+                'metta-uuid': '1234',
+                'feature_names': ['first_entity_id_1y_c1_top_min', 
+                     'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min',
+                     'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max',
+                     'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum',
+                     'second_entity_id_10y_c3_two_sum'
+                ],
+                'indices': ['entity_id'],
+            }
+            project_path = 'econ-dev/inspections'
+            model_storage_engine = S3ModelStorageEngine(s3_conn, project_path)
+            trainer = ModelTrainer(
+                project_path=project_path,
+                experiment_hash=None,
+                model_storage_engine=model_storage_engine,
+                db_engine=engine,
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=feature_config
+            )
+            matrix_store = InMemoryMatrixStore(matrix, metadata)
+            model_ids = trainer.train_models(
+                grid_config=grid_config,
+                misc_db_parameters=dict(),
+                matrix_store=matrix_store
+            )
+
+            # assert categoricals were properly detected and passed to model
+            records = [
+                row for row in
+                engine.execute('select model_hash from results.models')
+            ]
+
+            cache_keys = [
+                model_cache_key(project_path, model_row[0], s3_conn)
+                for model_row in records
+            ]
+
+            model_pickles = [
+                pickle.loads(cache_key.get()['Body'].read())
+                for cache_key in cache_keys
+            ]
+
+            assert sorted([sorted(c) for c in model_pickles[0].categoricals]) == [[0, 1, 2], [6, 7]]
+
+
 def test_n_jobs_not_new_model():
     grid_config = {
         'sklearn.ensemble.AdaBoostClassifier': {
@@ -205,7 +320,8 @@ def test_n_jobs_not_new_model():
                 experiment_hash=None,
                 model_storage_engine=S3ModelStorageEngine(s3_conn, 'econ-dev/inspections'),
                 db_engine=engine,
-                model_group_keys=['label_name', 'label_window']
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=[]
             )
 
             matrix = pandas.DataFrame.from_dict({
@@ -264,7 +380,8 @@ def test_retry_max(self):
                 experiment_hash=None,
                 model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                 db_engine=engine,
-                model_group_keys=['label_name', 'label_window']
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=[]
             )
 
             matrix = pandas.DataFrame.from_dict({
@@ -309,7 +426,8 @@ def test_retry_recovery(self):
                 experiment_hash=None,
                 model_storage_engine=InMemoryModelStorageEngine(project_path=''),
                 db_engine=engine,
-                model_group_keys=['label_name', 'label_window']
+                model_group_keys=['label_name', 'label_window'],
+                feature_config=[]
             )
 
             matrix = pandas.DataFrame.from_dict({
diff --git a/tests/test_utils.py b/tests/test_utils.py
index aecf051..6018caa 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,12 +1,14 @@
 from catwalk.utils import filename_friendly_hash, \
     save_experiment_and_get_hash, \
-    sort_predictions_and_labels
+    sort_predictions_and_labels, \
+    bag_of_cats, find_cats
 from catwalk.db import ensure_db
 from sqlalchemy import create_engine
 import testing.postgresql
 import datetime
 import logging
 import re
+import pandas as pd
 
 
 def test_filename_friendly_hash():
@@ -104,3 +106,81 @@ def test_sort_predictions_and_labels():
     )
     assert sorted_predictions == (0.6, 0.5, 0.5, 0.4)
     assert sorted_labels == (True, False, True, False)
+
+def test_bag_of_cats():
+    feature_config = [
+        {
+            'prefix': 'first',
+            'aggregates': [
+                {'quantity': 'a1', 'metrics': ['min', 'max']}
+            ],
+            'categoricals': [
+                {'column': 'c1', 'choices': ['top', 'bottom', 'charm', 'strange'], 'metrics': ['min']},
+                {'column': 'c2', 'choices': ['up', 'down'], 'metrics': ['sum', 'max']}
+            ],
+            'intervals': ['1y', '5y'],
+            'groups': ['entity_id']
+        },
+        {
+            'prefix': 'second',
+            'categoricals': [
+                {'column': 'c3', 'choices': ['one', 'two'], 'metrics': ['sum']},
+                {'column': 'c4', 'choices': ['three', 'four'], 'metrics': ['max']}
+            ],
+            'intervals': ['1y', '10y'],
+            'groups': ['entity_id']
+        },
+        {
+            'prefix': 'third',
+            'aggregates': [
+                {'quantity': 'a2', 'metrics': ['min', 'max']}
+            ],
+            'intervals': ['6month'],
+            'groups': ['entity_id']
+        }
+    ]
+
+    cat_regex = set(bag_of_cats(feature_config))
+
+    assert cat_regex == set([
+        r'^first_entity_id_1y_c1_(.*)_min$', r'^first_entity_id_5y_c1_(.*)_min$', 
+        r'^first_entity_id_1y_c2_(.*)_sum$', r'^first_entity_id_1y_c2_(.*)_max$', 
+        r'^first_entity_id_5y_c2_(.*)_sum$', r'^first_entity_id_5y_c2_(.*)_max$', 
+        r'^second_entity_id_1y_c3_(.*)_sum$', r'^second_entity_id_10y_c3_(.*)_sum$', 
+        r'^second_entity_id_1y_c4_(.*)_max$', r'^second_entity_id_10y_c4_(.*)_max$'
+    ])
+
+def test_find_cats():
+    # test with just categoricals
+    cat_regex = [r'^first_entity_id_1y_c1_(.*)_min$', r'^second_entity_id_10y_c3_(.*)_sum$']
+    df = pd.DataFrame({
+        'entity_id': [1,2,3,4],
+        'as_of_date': ['2012-01-01','2012-01-01','2012-01-01','2012-01-01'],
+        'first_entity_id_1y_c1_top_min': [0,1,0,0],
+        'first_entity_id_1y_c1_bottom_min': [1,0,0,0],
+        'first_entity_id_1y_c1__NULL_min': [0,0,1,0],
+        'first_entity_id_1y_a1_sum': [12,7,0,2],
+        'first_entity_id_1y_a2_max': [3,1,4,1],
+        'second_entity_id_10y_a3_sum': [5,9,2,6],
+        'second_entity_id_10y_c3_one_sum': [1,1,0,1],
+        'second_entity_id_10y_c3_two_sum': [0,0,1,0],
+        'outcome': [0,1,0,0]
+        })
+    # ensure column order
+    df = df[['entity_id', 'as_of_date', 'first_entity_id_1y_c1_top_min', 
+             'first_entity_id_1y_c1_bottom_min', 'first_entity_id_1y_c1__NULL_min',
+             'first_entity_id_1y_a1_sum', 'first_entity_id_1y_a2_max',
+             'second_entity_id_10y_a3_sum', 'second_entity_id_10y_c3_one_sum',
+             'second_entity_id_10y_c3_two_sum', 'outcome'
+    ]]
+
+    cat_cols = find_cats(df.columns.values, cat_regex)
+
+    assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [6, 7]]
+
+    # test with categoricals and imputed flags
+    df['first_entity_id_1y_a1_sum_imp'] = [0,0,0,1]
+    df['second_entity_id_10y_a3_sum_imp'] = [0,1,0,1]
+
+    cat_cols = find_cats(df.columns.values, cat_regex)
+    assert sorted([sorted(c) for c in cat_cols]) == [[0, 1, 2], [3,8], [5,9], [6, 7]]