diff --git a/nums/models/glms.py b/nums/models/glms.py
index 798d5838..5b1ef9cc 100644
--- a/nums/models/glms.py
+++ b/nums/models/glms.py
@@ -33,7 +33,7 @@
 #
 # The link function is expressed as follows.
 # E(Y | X) = mu
-# Define the linear predictor eta = X.T @ beta
+# Define the linear predictor eta = X @ beta
 # Define g as the link function, so that g(mu) = eta
 # E(Y | X) = g^{-1}(eta)
 #
@@ -62,7 +62,35 @@
 # g(mu) = (b')^{-1}(mu) = ln(mu/(1-mu)) = ln(p/(1-p)) = theta(p)


-class GLM(object):
+class Model(object):
+    def forward(self, X, beta=None):
+        raise NotImplementedError()
+
+    def objective(
+        self,
+        X: BlockArray,
+        y: BlockArray,
+        beta: BlockArray = None,
+        mu: BlockArray = None,
+    ):
+        raise NotImplementedError()
+
+    def gradient(
+        self,
+        X: BlockArray,
+        y: BlockArray,
+        mu: BlockArray = None,
+        beta: BlockArray = None,
+    ):
+        # gradient w.r.t. beta.
+        raise NotImplementedError()
+
+    def hessian(self, X: BlockArray, y: BlockArray, mu: BlockArray = None):
+        # Hessian w.r.t. beta.
+        raise NotImplementedError()
+
+
+class GLM(Model):
     def __init__(
         self,
         penalty="none",
@@ -207,29 +235,6 @@ def predict(self, X):
     def link_inv(self, eta: BlockArray):
         raise NotImplementedError()

-    def objective(
-        self,
-        X: BlockArray,
-        y: BlockArray,
-        beta: BlockArray = None,
-        mu: BlockArray = None,
-    ):
-        raise NotImplementedError()
-
-    def gradient(
-        self,
-        X: BlockArray,
-        y: BlockArray,
-        mu: BlockArray = None,
-        beta: BlockArray = None,
-    ):
-        # gradient w.r.t. beta.
-        raise NotImplementedError()
-
-    def hessian(self, X: BlockArray, y: BlockArray, mu: BlockArray = None):
-        # Hessian w.r.t. beta.
-        raise NotImplementedError()
-
     def deviance(self, y, y_pred):
         raise NotImplementedError()
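For the logistic case in the link-function comments above, the identities can be checked numerically. A minimal standalone NumPy sketch, illustrative only and not part of the patch:

```python
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 3))      # design matrix
beta = rng.normal(size=(3,))     # coefficients

eta = X @ beta                   # linear predictor
mu = 1.0 / (1.0 + np.exp(-eta))  # inverse logistic link, g^{-1}(eta) = E(Y | X)

# The logistic link g(mu) = ln(mu / (1 - mu)) recovers the linear predictor.
assert np.allclose(np.log(mu / (1.0 - mu)), eta)
```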
diff --git a/nums/models/lbfgs.py b/nums/models/lbfgs.py
index 6a50b25f..be421027 100644
--- a/nums/models/lbfgs.py
+++ b/nums/models/lbfgs.py
@@ -19,14 +19,14 @@
 import numpy as np

 from nums.core.array.application import ArrayApplication
 from nums.core.application_manager import instance as _instance
-from nums.models.glms import GLM
+from nums.models.glms import Model


 # Based on Nocedal and Wright, chapters 2, 3, 6 and 7.


 class BackTrackingLineSearch(object):
-    def __init__(self, model: GLM):
+    def __init__(self, model: Model):
         self.app = _instance()
         self.model = model

@@ -64,9 +64,9 @@ def __init__(self, k, s, y):

 class LBFGS(object):
-    def __init__(self, model: GLM, m=10, max_iter=100, thresh=1e-4, dtype=np.float64):
+    def __init__(self, model: Model, m=10, max_iter=100, thresh=1e-4, dtype=np.float64):
         self.app: ArrayApplication = _instance()
-        self.model: GLM = model
+        self.model: Model = model
         self.m = m
         self.max_iter = max_iter
         self.thresh = thresh
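The lbfgs.py changes only widen the accepted model type; the optimizer itself is based on Nocedal and Wright, per the comment above. For context, the L-BFGS search direction comes from the standard two-loop recursion (Nocedal & Wright, Algorithm 7.4), sketched below in plain NumPy. `two_loop_direction`, `s_list`, and `y_list` are illustrative names, not the code in nums/models/lbfgs.py:

```python
import numpy as np


def two_loop_direction(grad, s_list, y_list):
    """L-BFGS two-loop recursion (Nocedal & Wright, Algorithm 7.4).

    s_list[i] = x_{i+1} - x_i, y_list[i] = grad_{i+1} - grad_i, newest last.
    Returns the quasi-Newton descent direction, approximately -H_k^{-1} @ grad.
    """
    if not s_list:
        return -grad  # no curvature pairs yet: fall back to steepest descent
    q = grad.astype(float)
    rhos = [1.0 / (y @ s) for s, y in zip(s_list, y_list)]
    alphas = []
    for s, y, rho in zip(reversed(s_list), reversed(y_list), reversed(rhos)):
        alpha = rho * (s @ q)
        alphas.append(alpha)
        q -= alpha * y
    # Standard gamma_k scaling of the initial inverse Hessian approximation.
    gamma = (s_list[-1] @ y_list[-1]) / (y_list[-1] @ y_list[-1])
    r = gamma * q
    for (s, y, rho), alpha in zip(zip(s_list, y_list, rhos), reversed(alphas)):
        beta = rho * (y @ r)
        r += s * (alpha - beta)
    return -r
```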
diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py
new file mode 100644
index 00000000..a8a653e7
--- /dev/null
+++ b/nums/models/multinomial_lr.py
@@ -0,0 +1,358 @@
+import numpy as np
+
+from nums.core.array.blockarray import BlockArray
+from nums.core.array.application import ArrayApplication
+from nums.core.application_manager import instance as _instance
+from nums.core.array import utils as array_utils
+from nums.core.array.random import NumsRandomState
+from nums.core.linalg import inv
+
+from nums.models.lbfgs import LBFGS
+from nums.models.glms import Model
+
+
+class MultinomialLogisticRegression(Model):
+    def __init__(
+        self,
+        penalty="l2",
+        C=1.0,
+        tol=0.0001,
+        max_iter=100,
+        solver="newton-cg",
+        lr=0.01,
+        m=3,
+        random_state=None,
+        fit_intercept=True,
+        normalize=False,
+    ):
+
+        if fit_intercept is False:
+            raise NotImplementedError("fit_intercept=False currently not supported.")
+        if normalize is True:
+            raise NotImplementedError("normalize=True currently not supported.")
+
+        self._app = _instance()
+        if random_state is None:
+            self.rs: NumsRandomState = self._app.random
+        elif array_utils.is_int(random_state):
+            self.rs: NumsRandomState = NumsRandomState(
+                cm=self._app.cm, seed=random_state
+            )
+        elif isinstance(random_state, NumsRandomState):
+            self.rs: NumsRandomState = random_state
+        else:
+            raise Exception(
+                "Unexpected type for random_state %s" % str(type(random_state))
+            )
+        self._penalty = None if penalty == "none" else penalty
+        if not (self._penalty is None or self._penalty == "l2"):
+            raise NotImplementedError("%s penalty not supported" % self._penalty)
+        self._lambda = 1.0 / C
+        self._lambda_vec = None
+        self._tol = tol
+        self._max_iter = max_iter
+        self._opt = solver
+        self._lr = lr
+        self._m = m
+        self.lbfgs_beta = None
+        self._beta = None
+        self._beta0 = None
+
+        self._num_class = None
+        self.feature_dim = None
+        self.feature_block_dim = None
+        self.use_lbfgs_forward = False
+        self._lambda_id = None
+
+    def fit(self, X: BlockArray, y: BlockArray):
+        # Note: it is critical for performance to maintain the original block
+        # shape of X along axis 1 below. Otherwise, the concatenation
+        # operation will not construct the new X by referencing X's existing
+        # blocks.
+        # TODO: Option to do concat.
+        # TODO: Provide support for batching.
+        X = self._app.concatenate(
+            [
+                X,
+                self._app.ones(
+                    shape=(X.shape[0], 1),
+                    block_shape=(X.block_shape[0], 1),
+                    dtype=X.dtype,
+                ),
+            ],
+            axis=1,
+            axis_block_size=X.block_shape[1],
+        )
+        assert (
+            len(X.shape) == 2 and len(y.shape) == 2
+        ), "X must be a 2D matrix and y must be one-hot encoded."
+        self._num_class = y.shape[1]
+
+        self.feature_dim = X.shape[1]
+        self.feature_block_dim = X.block_shape[1]
+
+        beta: BlockArray = self._app.zeros(
+            (self.feature_dim, self._num_class),
+            (self.feature_block_dim, self._num_class),
+            dtype=float,
+        )
+        tol: BlockArray = self._app.scalar(self._tol)
+        max_iter: int = self._max_iter
+        self.use_lbfgs_forward = False
+        if self._penalty == "l2":
+            self._lambda_vec = (
+                self._app.ones(beta.shape, beta.block_shape, beta.dtype) * self._lambda
+            )
+        if self._opt == "gd" or self._opt == "sgd" or self._opt == "block_sgd":
+            lr: BlockArray = self._app.scalar(self._lr)
+            if self._opt == "gd":
+                beta = gd(self, beta, X, y, tol, max_iter, lr)
+            elif self._opt == "sgd":
+                beta = sgd(self, beta, X, y, tol, max_iter, lr)
+            else:
+                beta = block_sgd(self, beta, X, y, tol, max_iter, lr)
+        elif self._opt == "newton" or self._opt == "newton-cg":
+            if self._penalty == "l2":
+                self._lambda_id = (
+                    self._app.eye(
+                        (self.feature_dim, self.feature_dim),
+                        block_shape=(self.feature_block_dim, self.feature_block_dim),
+                    )
+                    * self._lambda
+                )
+            beta = newton(self._app, self, beta, X, y, tol, max_iter)
+        elif self._opt == "lbfgs":
+            self.use_lbfgs_forward = True
+            lbfgs_optimizer = LBFGS(
+                model=self,
+                m=self._m,
+                max_iter=max_iter,
+                thresh=self._tol,
+                dtype=X.dtype,
+            )
+            beta = lbfgs_optimizer.execute(X, y, beta)
+            self.lbfgs_beta = beta
+        else:
+            raise Exception("Unsupported optimizer specified %s." % self._opt)
+        # The appended column of ones makes the last row of beta the intercept.
+        self._beta0 = beta[-1]
+        self._beta = beta[:-1]
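+
+    # lbfgs_forward computes a numerically stable softmax over the classes:
+    # shifting eta by its row-wise max leaves the probabilities unchanged,
+    # since exp(a - c) / sum_j exp(b_j - c) = exp(a) / sum_j exp(b_j), but
+    # keeps exp from overflowing on large logits.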
+    def lbfgs_forward(self, X, theta):
+        if X.shape[1] < theta.shape[0]:
+            assert X.shape[1] + 1 == theta.shape[0]
+            eta = theta[-1] + X @ theta[:-1]
+        else:
+            eta = X @ theta
+        eta = eta - self._app.max(eta, axis=1).expand_dims(-1)
+        unnormalized_probs = self._app.exp(eta)
+        mu = unnormalized_probs / self._app.sum(unnormalized_probs, axis=1).expand_dims(
+            -1
+        )
+        return mu  # probabilities for each class
+
+    def objective(
+        self,
+        X: BlockArray,
+        y: BlockArray,
+        beta: BlockArray = None,
+        mu: BlockArray = None,
+    ):
+        assert beta is not None or self._beta is not None
+        if mu is None:
+            mu = self.forward(X, beta)
+        # Negative log likelihood of the correct class; y is an array of
+        # one-hots. The 1e-10 guards against log(0).
+        return -self._app.sum(y * self._app.log(mu + 1e-10))
+
+    def forward(self, X, beta=None):
+        if self.use_lbfgs_forward:
+            if beta is not None:
+                return self.lbfgs_forward(X, beta)
+        if beta is not None:
+            return self.link_inv(X @ beta)
+        return self.link_inv(self._beta0 + X @ self._beta)
+
+    def link_inv(self, eta: BlockArray):
+        def truncate(x, maximum):
+            masked = (x - maximum) > 0
+            return x * (1 - masked) + maximum * masked
+
+        return self._app.one / (self._app.one + self._app.exp(truncate(-eta, 10)))
+
+    def gradient(
+        self,
+        X: BlockArray,
+        y: BlockArray,
+        mu: BlockArray = None,
+        beta: BlockArray = None,
+    ):
+        if mu is None:
+            mu = self.forward(X)
+        if self._penalty is None:
+            return X.T @ (mu - y)
+        else:
+            assert beta is not None
+            return X.T @ (mu - y) + self._lambda_vec * beta
+
+    def hessian(
+        self,
+        X: BlockArray,
+        y: BlockArray,
+        mu: BlockArray = None,
+        learning_ends_for_class=None,
+    ):
+        # pylint: disable=arguments-differ
+        if mu is None:
+            mu = self.forward(X)
+        class_count = mu.shape[1]
+        if learning_ends_for_class is None:
+            learning_ends_for_class = [False for _ in range(class_count)]
+
+        dim, block_dim = mu.shape[0], mu.block_shape[0]
+        s = mu * (self._app.one - mu)
+        if self._penalty is None:
+            return [
+                (
+                    X.T
+                    @ (
+                        s[:, class_idx].reshape((dim, 1), block_shape=(block_dim, 1))
+                        * X
+                    )
+                )
+                if not learning_ends_for_class[class_idx]
+                else None
+                for class_idx in range(class_count)
+            ]
+        else:
+            return [
+                (
+                    X.T
+                    @ (
+                        s[:, class_idx].reshape((dim, 1), block_shape=(block_dim, 1))
+                        * X
+                    )
+                    + self._lambda_id
+                )
+                if not learning_ends_for_class[class_idx]
+                else None
+                for class_idx in range(class_count)
+            ]
+
+    def grad_norm_sq(self, X: BlockArray, y: BlockArray, beta=None):
+        g = self.gradient(X, y, self.forward(X, beta), beta=beta)
+        return self._app.sum(g * g)
+
+    def predict(self, X: BlockArray):
+        if self.lbfgs_beta is not None:
+            pred = self.forward(X, self.lbfgs_beta).get()
+        else:
+            pred = self.forward(X).get()
+        return np.argmax(pred, axis=-1)
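+
+
+# Module-level solvers. Each takes the model, the initial coefficients, the
+# data, a tolerance on the largest absolute gradient entry (sklearn's
+# stopping rule, per the comment in sgd below), and an iteration budget; the
+# gradient-based solvers also take a learning rate. All return the fitted
+# beta.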
+def sgd(
+    model: MultinomialLogisticRegression,
+    beta,
+    X: BlockArray,
+    y: BlockArray,
+    tol: BlockArray,
+    max_iter: int,
+    lr: BlockArray,
+):
+    # Classic SGD.
+    app = _instance()
+    for _ in range(max_iter):
+        # Sample an entry uniformly at random.
+        idx = model.rs.numpy().integers(X.shape[0])
+        X_sample, y_sample = X[idx : idx + 1], y[idx : idx + 1]
+        mu = model.forward(X_sample, beta)
+        g = model.gradient(X_sample, y_sample, mu, beta=beta)
+        beta += -lr * g
+        if app.max(app.abs(g)) <= tol:
+            # sklearn uses max instead of l2 norm.
+            break
+    return beta
+
+
+def block_sgd(
+    model: MultinomialLogisticRegression,
+    beta,
+    X: BlockArray,
+    y: BlockArray,
+    tol: BlockArray,
+    max_iter: int,
+    lr: BlockArray,
+):
+    # SGD with batches equal to the block shape along the first axis.
+    app = _instance()
+    for _ in range(max_iter):
+        for (start, stop) in X.grid.grid_slices[0]:
+            X_batch, y_batch = X[start:stop], y[start:stop]
+            bsize = X_batch.shape[0]
+            mu = model.forward(X_batch, beta)
+            g = model.gradient(X_batch, y_batch, mu, beta=beta)
+            beta += -lr * g / bsize
+            if app.max(app.abs(g)) <= tol:
+                return beta
+    return beta
+
+
+def gd(
+    model: MultinomialLogisticRegression,
+    beta,
+    X: BlockArray,
+    y: BlockArray,
+    tol: BlockArray,
+    max_iter: int,
+    lr: BlockArray,
+):
+    # Full-batch gradient descent.
+    app = _instance()
+
+    for _ in range(max_iter):
+        mu = model.forward(X, beta)
+        g = model.gradient(X, y, mu, beta=beta)
+        beta += -lr * g
+        if app.max(app.abs(g)) <= tol:
+            break
+    return beta
+
+
+def newton(
+    app: ArrayApplication,
+    model: MultinomialLogisticRegression,
+    beta,
+    X: BlockArray,
+    y: BlockArray,
+    tol: BlockArray,
+    max_iter: int,
+):
+    # Newton's method, applied independently per class.
+    num_classes = y.shape[1]
+    learning_ends_for_class = [False for _ in range(num_classes)]
+
+    opt_count = [0 for _ in range(num_classes)]
+    for _ in range(max_iter):
+
+        mu: BlockArray = model.forward(X, beta)
+        g = model.gradient(X, y, mu, beta=beta)
+
+        hessians = model.hessian(X, y, mu, learning_ends_for_class)
+
+        class_count = g.shape[1]
+
+        for class_idx in range(class_count):
+            if learning_ends_for_class[class_idx]:
+                continue
+            opt_count[class_idx] += 1
+
+            # The Hessians are PSD, but inv is faster than a PSD-specific
+            # inverse. A small ridge term keeps them invertible.
+            h = hessians[class_idx]
+            stable_h = h + app.eye(h.shape, h.block_shape) * 1e-6
+            invert_stable_h = inv(app, stable_h)
+
+            step = -invert_stable_h @ g[:, class_idx]
+            beta[:, class_idx] += step
+
+            if app.max(app.abs(g[:, class_idx])) <= tol:
+                learning_ends_for_class[class_idx] = True
+
+        # Learning ends once every class has converged.
+        if all(learning_ends_for_class):
+            break
+    return beta
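The per-class update in newton() above pairs the hessian() output with a small ridge term before inverting. A minimal NumPy sketch of the same step for a single class k; `newton_step_for_class`, `g_k`, `s_k`, and `lam` are illustrative names, and np.linalg.solve stands in for the explicit inverse the patch computes (it uses nums.core.linalg.inv because it operates on distributed BlockArrays):

```python
import numpy as np


def newton_step_for_class(X, g_k, s_k, lam, eps=1e-6):
    # Per-class Hessian of the l2-regularized logistic loss:
    #   H_k = X^T diag(s_k) X + lam * I, with s_k = mu_k * (1 - mu_k).
    n_features = X.shape[1]
    H = X.T @ (s_k[:, None] * X) + lam * np.eye(n_features)
    # The same small ridge term the patch uses keeps H invertible even when
    # features are collinear.
    H += eps * np.eye(n_features)
    return -np.linalg.solve(H, g_k)  # step to add to beta[:, k]
```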
diff --git a/setup.cfg b/setup.cfg
index 2552cb27..a147e650 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -48,5 +48,6 @@ testing =
     mypy==0.910
     black==21.4b0
     tqdm
+    sklearn
     invoke
     modin
diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py
new file mode 100644
index 00000000..d39b6aaf
--- /dev/null
+++ b/tests/models/test_multinomial_lr.py
@@ -0,0 +1,79 @@
+# coding=utf-8
+# Copyright (C) 2020 NumS Development Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import numpy as np
+import pytest
+
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LogisticRegression
+
+from nums.core.array.application import ArrayApplication
+from nums.models.multinomial_lr import MultinomialLogisticRegression
+
+
+@pytest.mark.parametrize("max_iter", [10, 2000])
+def test_multinomial_logistic(nps_app_inst: ArrayApplication, max_iter):
+    if max_iter > 100:
+        pytest.skip("skipping long tests")
+    real_X, real_y_indices = load_iris(return_X_y=True)
+    num_samples, _, num_classes = (
+        real_X.shape[0],
+        real_X.shape[1],
+        real_y_indices.max() + 1,
+    )
+    real_y = np.zeros((num_samples, num_classes))
+    real_y[np.arange(num_samples), real_y_indices] = 1  # one-hot encode the labels
+    X = nps_app_inst.array(real_X, block_shape=(100, 3))
+    # TODO: block shape? Iris has 3 classes, and we seem to crash when using
+    #  less than 3 here.
+    y = nps_app_inst.array(real_y, block_shape=(100, 3))
+    param_set = [
+        {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "newton", "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "newton-cg", "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "lbfgs", "tol": 1e-8, "max_iter": max_iter},
+        {"solver": "newton-cg", "tol": 1e-8, "max_iter": max_iter, "penalty": "none"},
+        {"solver": "lbfgs", "tol": 1e-8, "max_iter": max_iter, "penalty": "none"},
+    ]
+
+    for kwargs in param_set:
+        lr_model: MultinomialLogisticRegression = MultinomialLogisticRegression(
+            **kwargs
+        )
+        lr_model.fit(X, y)
+        y_pred = lr_model.predict(X)
+        score = np.sum(y.get().argmax(axis=1) == y_pred) / num_samples
+        # Sklearn's multiclass lr only supports the "lbfgs", "sag", "saga",
+        # and "newton-cg" solvers, so only those are compared to a reference
+        # fit.
+        if kwargs.get("solver") in ["lbfgs", "newton-cg"]:
+            kwargs.update({"multi_class": "multinomial"})
+            # pylint: disable=unexpected-keyword-arg
+            clf = LogisticRegression(**kwargs).fit(real_X, real_y_indices)
+            ref_score = clf.score(real_X, real_y_indices)
+            assert np.allclose(score, ref_score, atol=0.03)
+
+
+if __name__ == "__main__":
+    # pylint: disable=import-error
+    from nums.core import application_manager
+    import nums.core.settings
+
+    nums.core.settings.system_name = "serial"
+    nps_app_inst = application_manager.instance()
+    test_multinomial_logistic(nps_app_inst, 1000)
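For reference, an end-to-end usage sketch condensed from the test above (serial backend, iris data); it exercises only the API the test itself uses:

```python
import numpy as np
from sklearn.datasets import load_iris

import nums.core.settings
from nums.core import application_manager
from nums.models.multinomial_lr import MultinomialLogisticRegression

# Configure the backend before creating the application instance.
nums.core.settings.system_name = "serial"
app = application_manager.instance()

real_X, real_y = load_iris(return_X_y=True)
onehot = np.zeros((real_X.shape[0], real_y.max() + 1))
onehot[np.arange(real_X.shape[0]), real_y] = 1  # one-hot encode the labels

X = app.array(real_X, block_shape=(100, 3))
y = app.array(onehot, block_shape=(100, 3))

model = MultinomialLogisticRegression(solver="newton", tol=1e-8, max_iter=10)
model.fit(X, y)
train_accuracy = np.mean(model.predict(X) == real_y)
print(train_accuracy)
```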