From f349f847667966ab9eeb5d8f3ea3a3e0afb83de6 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 13:08:32 -0700 Subject: [PATCH 01/16] multinomial regression integrated. --- autoscaler/example.py | 12 +- nums/models/glms.py | 53 +++-- nums/models/lbfgs.py | 28 ++- nums/models/multinomial_lr.py | 352 ++++++++++++++++++++++++++++ setup.py | 1 + tests/models/test_multinomial_lr.py | 78 ++++++ 6 files changed, 482 insertions(+), 42 deletions(-) create mode 100644 nums/models/multinomial_lr.py create mode 100644 tests/models/test_multinomial_lr.py diff --git a/autoscaler/example.py b/autoscaler/example.py index 8b70d5b7..eb5cd53e 100644 --- a/autoscaler/example.py +++ b/autoscaler/example.py @@ -6,15 +6,15 @@ # Initialize ray and connect it to the cluster. ray.init(address="auto") -# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. -settings.cluster_shape = (len(ray.nodes())-1, 1) +# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. +settings.cluster_shape = (len(ray.nodes()) - 1, 1) def main(): - X = nps.random.rand(10**4) - Y = nps.random.rand(10**4) - Z = nps.add(X,Y) - print("X + Y = ",Z.get()) + X = nps.random.rand(10 ** 4) + Y = nps.random.rand(10 ** 4) + Z = nps.add(X, Y) + print("X + Y = ", Z.get()) if __name__ == "__main__": diff --git a/nums/models/glms.py b/nums/models/glms.py index e7fe2390..19180164 100644 --- a/nums/models/glms.py +++ b/nums/models/glms.py @@ -60,7 +60,35 @@ # g(mu) = (b')^{-1}(mu) = ln(mu/(1-mu)) = ln(p/(1-p)) = theta(p) -class GLM(object): +class Model(object): + def forward(self, X, beta=None): + raise NotImplementedError() + + def objective( + self, + X: BlockArray, + y: BlockArray, + beta: BlockArray = None, + mu: BlockArray = None, + ): + raise NotImplementedError() + + def gradient( + self, + X: BlockArray, + y: BlockArray, + mu: BlockArray = None, + beta: BlockArray = None, + ): + # gradient w.r.t. beta. + raise NotImplementedError() + + def hessian(self, X: BlockArray, y: BlockArray, mu: BlockArray = None): + # Hessian w.r.t. beta. + raise NotImplementedError() + + +class GLM(Model): def __init__( self, penalty="none", @@ -171,29 +199,6 @@ def predict(self, X): def link_inv(self, eta: BlockArray): raise NotImplementedError() - def objective( - self, - X: BlockArray, - y: BlockArray, - beta: BlockArray = None, - mu: BlockArray = None, - ): - raise NotImplementedError() - - def gradient( - self, - X: BlockArray, - y: BlockArray, - mu: BlockArray = None, - beta: BlockArray = None, - ): - # gradient w.r.t. beta. - raise NotImplementedError() - - def hessian(self, X: BlockArray, y: BlockArray, mu: BlockArray = None): - # Hessian w.r.t. beta. - raise NotImplementedError() - def deviance(self, y, y_pred): raise NotImplementedError() diff --git a/nums/models/lbfgs.py b/nums/models/lbfgs.py index dfdc847c..62d710ed 100644 --- a/nums/models/lbfgs.py +++ b/nums/models/lbfgs.py @@ -19,18 +19,18 @@ import numpy as np from nums.core.array.application import ArrayApplication from nums.core.application_manager import instance as _instance -from nums.models.glms import GLM +from nums.models.glms import Model # Based on Nocedal and Wright, chapters 2, 3, 6 and 7. 
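+# A short orientation for what follows: BackTrackingLineSearch enforces the
+# Armijo sufficient-decrease condition, shrinking the step size alpha by rho
+# until f(theta + alpha * p) <= f(theta) + c * alpha * <grad, p>.
+# LBFGS.get_p is the standard two-loop recursion: it applies the m most
+# recent (s, y) curvature pairs to the gradient rather than materializing an
+# inverse Hessian, so memory stays at m pairs of parameter-sized arrays.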
class BackTrackingLineSearch(object): - def __init__(self, model: GLM): + def __init__(self, model: Model): self.app = _instance() self.model = model - def f(self, theta_prime, X, y): + def f(self, X, y, theta_prime): return self.model.objective( X, y, theta_prime, self.model.forward(X, theta_prime) ) @@ -42,7 +42,9 @@ def execute( alpha = init_alpha f_val = self.f(X, y, theta) f_next = self.f(X, y, theta + alpha * p) - while self.app.isnan(f_next) or f_next > f_val + c * alpha * grad.T @ p: + while self.app.isnan(f_next) or f_next > f_val + c * alpha * self.app.sum( + grad * p + ): alpha *= rho if alpha < min_alpha: return min_alpha @@ -56,15 +58,16 @@ def __init__(self, k, s, y): self.k = k self.s = s self.y = y - ys_inner = s.T @ y + app = _instance() + ys_inner = app.sum(s * y) self.rho = 1.0 / (ys_inner + 1e-30) - self.gamma = ys_inner / (y.T @ y + 1e-30) + self.gamma = ys_inner / (app.sum(y * y) + 1e-30) class LBFGS(object): - def __init__(self, model: GLM, m=3, max_iter=100, thresh=1e-5, dtype=np.float64): + def __init__(self, model: Model, m=3, max_iter=100, thresh=1e-5, dtype=np.float64): self.app: ArrayApplication = _instance() - self.model: GLM = model + self.model: Model = model self.m = m self.max_iter = max_iter self.thresh = thresh @@ -89,12 +92,12 @@ def get_p(self, H, g): mem_i: LBFGSMemory = self.memory[i] if mem_i is None: break - alpha = mem_i.rho * mem_i.s.T @ q + alpha = mem_i.rho * self.app.sum(mem_i.s * q) q -= alpha * mem_i.y forward_vars.insert(0, (alpha, mem_i)) r = H @ q for alpha, mem_i in forward_vars: - beta = mem_i.rho * mem_i.y.T @ r + beta = mem_i.rho * self.app.sum(mem_i.y * r) r += mem_i.s * (alpha - beta) return r @@ -127,7 +130,7 @@ def execute(self, X, y, theta): c=1e-4, min_alpha=1e-30, ) - print("alpha", alpha) + # print("alpha", alpha) # print("alpha", alpha, # "objective", f(theta).get(), # "grad_norm", self.app.sqrt(g.T @ g).get()) @@ -156,7 +159,8 @@ def execute(self, X, y, theta): return theta def converged(self, g): - return self.app.sqrt(g.T @ g) < self.thresh + # return self.app.max(self.app.abs(g)) < self.thresh + return self.app.sqrt(self.app.sum(g * g)) < self.thresh if __name__ == "__main__": diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py new file mode 100644 index 00000000..8bb1720f --- /dev/null +++ b/nums/models/multinomial_lr.py @@ -0,0 +1,352 @@ +import numpy as np +import random + +from nums.core.array.blockarray import BlockArray +from nums.core.array.application import ArrayApplication + +from nums.core.application_manager import instance as _instance +from nums.core.array import utils as array_utils +from nums.core.array.random import NumsRandomState +from collections import defaultdict +from nums import numpy as nps + +from nums.models.lbfgs import LBFGS +from nums.core.linalg import inv +from nums.models.glms import Model + + +class MultinomialLogisticRegression(Model): + def __init__( + self, + penalty="none", + C=1.0, + tol=0.0001, + max_iter=100, + solver="newton-cg", + lr=0.01, + m=3, + random_state=None, + fit_intercept=True, + normalize=False, + ): + + if fit_intercept is False: + raise NotImplementedError("fit_incercept=False currently not supported.") + if normalize is True: + raise NotImplementedError("normalize=True currently not supported.") + + self._app = _instance() + if random_state is None: + self.rs: NumsRandomState = self._app.random + elif array_utils.is_int(random_state): + self.rs: NumsRandomState = NumsRandomState( + cm=self._app.cm, seed=random_state + ) + elif 
isinstance(random_state, NumsRandomState): + self.rs: NumsRandomState = random_state + else: + raise Exception( + "Unexpected type for random_state %s" % str(type(random_state)) + ) + self._penalty = None if penalty == "none" else penalty + if not (self._penalty is None or self._penalty == "l2"): + raise NotImplementedError("%s penalty not supported" % self._penalty) + self._lambda = 1.0 / C + self._lambda_vec = None + self._tol = tol + self._max_iter = max_iter + self._opt = solver + self._lr = lr + self._m = m + self._beta = None + self._beta0 = None + + def fit(self, X: BlockArray, y: BlockArray): + # Note, it's critically important from a performance point-of-view + # to maintain the original block shape of X below, along axis 1. + # Otherwise, the concatenation operation will not construct the new X + # by referencing X's existing blocks. + # TODO: Option to do concat. + # TODO: Provide support for batching. + X = self._app.concatenate( + [ + X, + self._app.ones( + shape=(X.shape[0], 1), + block_shape=(X.block_shape[0], 1), + dtype=X.dtype, + ), + ], + axis=1, + axis_block_size=X.block_shape[1], + ) + assert ( + len(X.shape) == 2 and len(y.shape) == 2 + ), "X must be a 2D matrix and Y must be one-hot" + self._num_class = y.shape[1] + + self.feature_dim = X.shape[1] + self.feature_block_dim = X.block_shape[1] + + beta: BlockArray = self._app.zeros( + (X.shape[1], self._num_class), + (X.block_shape[1], self._num_class), + dtype=float, + ) + tol: BlockArray = self._app.scalar(self._tol) + max_iter: int = self._max_iter + self.use_lbfgs_forward = False + if self._penalty == "l2": + self._lambda_vec = ( + self._app.ones(beta.shape, beta.block_shape, beta.dtype) * self._lambda + ) + if self._opt == "gd" or self._opt == "sgd" or self._opt == "block_sgd": + lr: BlockArray = self._app.scalar(self._lr) + if self._opt == "gd": + beta = gd(self, beta, X, y, tol, max_iter, lr) + elif self._opt == "sgd": + beta = sgd(self, beta, X, y, tol, max_iter, lr) + else: + beta = block_sgd(self, beta, X, y, tol, max_iter, lr) + elif self._opt == "newton" or self._opt == "newton-cg": + beta = newton(self._app, self, beta, X, y, tol, max_iter) + if self._penalty == "l2": + self._lambda_id = ( + self._app.eye( + (self.feature_dim, self.feature_dim), + block_shape=(self.feature_block_dim, self.feature_block_dim), + ) + * self._lambda + ) + elif self._opt == "lbfgs": + self.use_lbfgs_forward = True + lbfgs_optimizer = LBFGS( + model=self, + m=self._m, + max_iter=max_iter, + thresh=self._tol, + dtype=X.dtype, + ) + self.beta = beta + beta = lbfgs_optimizer.execute(X, y, beta) + else: + raise Exception("Unsupported optimizer specified %s." % self._opt) + self._beta0 = beta[-1] + self._beta = beta[:-1] + + def lbfgs_forward(self, X, theta): + if X.shape[1] < theta.shape[0]: + assert X.shape[1] + 1 == theta.shape[0] + eta = theta[-1] + X @ theta[:-1] + else: + eta = X @ theta + eta = eta - self._app.max(eta, axis=1).expand_dims(-1) + unnormalized_probs = self._app.exp(eta) + mu = unnormalized_probs / self._app.sum(unnormalized_probs, axis=1).expand_dims( + -1 + ) + # print('mu', mu.get()[0]) + return mu # probabilities for each class + + def objective( + self, + X: BlockArray, + y: BlockArray, + beta: BlockArray = None, + mu: BlockArray = None, + ): + assert beta is not None or self._beta is not None + # neg log likelihood of correct class. 
y is an array of onehots + return -self._app.sum(y * self._app.log(mu + 1e-10)) + + def forward(self, X, beta=None): + if self.use_lbfgs_forward: + if beta: + return self.lbfgs_forward(X, beta) + if beta: + return self.link_inv(X @ beta) + return self.link_inv(self._beta0 + X @ self._beta) + + def link_inv(self, eta: BlockArray): + def truncate(x, maximum): + masked = (x - maximum) > 0 + return x * (1 - masked) + maximum * masked + + return self._app.one / (self._app.one + self._app.exp(truncate(-eta, 10))) + + def gradient( + self, + X: BlockArray, + y: BlockArray, + mu: BlockArray = None, + beta: BlockArray = None, + ): + if mu is None: + mu = self.forward(X) + if self._penalty is None: + return X.T @ (mu - y) + else: + assert beta is not None + return X.T @ (mu - y) + self._lambda_vec * beta + + def hessian( + self, + X: BlockArray, + y: BlockArray, + mu: BlockArray = None, + learning_ends_for_class=None, + ): + class_count = mu.shape[1] + if mu is None: + mu = self.forward(X) + if learning_ends_for_class is None: + learning_ends_for_class = [False for _ in range(class_count)] + + dim, block_dim = mu.shape[0], mu.block_shape[0] + s = mu * (self._app.one - mu) + if self._penalty is None: + return [ + ( + X.T + @ ( + s[:, class_idx].reshape((dim, 1), block_shape=(block_dim, 1)) + * X + ) + ) + if not learning_ends_for_class[class_idx] + else None + for class_idx in range(class_count) + ] + else: + return [ + ( + X.T + @ ( + s[:, class_idx].reshape((dim, 1), block_shape=(block_dim, 1)) + * X + ) + + self._lambda_id + ) + if not learning_ends_for_class[class_idx] + else None + for class_idx in range(class_count) + ] + + def grad_norm_sq(self, X: BlockArray, y: BlockArray, beta=None): + g = self.gradient(X, y, self.forward(X, beta), beta=beta) + return self._app.sum(g * g) + + def predict(self, X: BlockArray): + pred = self.forward(X).get() + return np.argmax(pred, axis=-1) + + +def sgd( + model: MultinomialLogisticRegression, + beta, + X: BlockArray, + y: BlockArray, + tol: BlockArray, + max_iter: int, + lr: BlockArray, +): + # Classic SGD. + app = _instance() + for _ in range(max_iter): + # Sample an entry uniformly at random. + idx = model.rs.numpy().integers(X.shape[0]) + X_sample, y_sample = X[idx : idx + 1], y[idx : idx + 1] + mu = model.forward(X_sample, beta) + g = model.gradient(X_sample, y_sample, mu, beta=beta) + beta += -lr * g + if app.max(app.abs(g)) <= tol: + # sklearn uses max instead of l2 norm. + break + return beta + + +def block_sgd( + model: MultinomialLogisticRegression, + beta, + X: BlockArray, + y: BlockArray, + tol: BlockArray, + max_iter: int, + lr: BlockArray, +): + # SGD with batches equal to block shape along first axis. 
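+    # Each epoch walks the row-blocks of X in order: X.grid.grid_slices[0]
+    # yields the (start, stop) row range of every block, so each batch maps
+    # onto exactly one block of X and y. The step is scaled by 1 / batch
+    # size, and iteration stops early once max|g| <= tol (the same max-norm
+    # criterion noted in sgd above).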
+ app = _instance() + for _ in range(max_iter): + for (start, stop) in X.grid.grid_slices[0]: + X_batch, y_batch = X[start:stop], y[start:stop] + bsize = X_batch.shape[0] + mu = model.forward(X_batch, beta) + g = model.gradient(X_batch, y_batch, mu, beta=beta) + beta += -lr * g / bsize + if app.max(app.abs(g)) <= tol: + return beta + return beta + + +def gd( + model: MultinomialLogisticRegression, + beta, + X: BlockArray, + y: BlockArray, + tol: BlockArray, + max_iter: int, + lr: BlockArray, +): + app = _instance() + + for _ in range(max_iter): + mu = model.forward(X, beta) + g = model.gradient(X, y, mu, beta=beta) + beta += -lr * g + if app.max(app.abs(g)) <= tol: + break + return beta + + +def newton( + app: ArrayApplication, + model: MultinomialLogisticRegression, + beta, + X: BlockArray, + y: BlockArray, + tol: BlockArray, + max_iter: int, +): + num_classes = y.shape[1] + learning_ends_for_class = [False for _ in range(num_classes)] + + opt_count = [0 for _ in range(num_classes)] + for _ in range(max_iter): + + mu: BlockArray = model.forward(X, beta) + g = model.gradient(X, y, mu, beta=beta) + + hessians = model.hessian(X, y, mu, learning_ends_for_class) + + class_count = g.shape[1] + + for class_idx in range(class_count): + if learning_ends_for_class[class_idx]: + continue + opt_count[class_idx] += 1 + # These are PSD, but inv is faster than psd inv. + + h = hessians[class_idx] + stable_h = h + app.eye(h.shape, h.block_shape) * 1e-6 + invert_stable_h = inv(app, stable_h) + + step = -invert_stable_h @ g[:, class_idx] + beta[:, class_idx] += step # - invert_stable_h @ g[:,class_idx] + + if app.max(app.abs(g[:, class_idx])) <= tol: + learning_ends_for_class[class_idx] = True + + # learning ends if all class finishes + if all(learning_ends_for_class): + break + return beta diff --git a/setup.py b/setup.py index a4f49f92..b908cbe5 100644 --- a/setup.py +++ b/setup.py @@ -1,2 +1,3 @@ import setuptools + setuptools.setup() diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py new file mode 100644 index 00000000..8063ac0a --- /dev/null +++ b/tests/models/test_multinomial_lr.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# Copyright (C) 2020 NumS Development Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import time + +import numpy as np +import pytest + +from sklearn.datasets import load_iris + +from nums.core.array.application import ArrayApplication +from nums.core.storage.storage import BimodalGaussian +from nums.models.multinomial_lr import MultinomialLogisticRegression + +# pylint: disable = protected-access, import-outside-toplevel, import-error + + +def test_multinomial_logistic(nps_app_inst: ArrayApplication): + data = load_iris() + real_X = data["data"] + real_y_indices = data["target"] + num_samples, num_features, num_classes = ( + real_X.shape[0], + real_X.shape[1], + real_y_indices.max() + 1, + ) + real_y = np.zeros((num_samples, num_classes)) + real_y[np.arange(num_samples), real_y_indices] = 1 # make it a onehot + X = nps_app_inst.array(real_X, block_shape=(100, 3)) + y = nps_app_inst.array( + real_y, block_shape=(100, 3) + ) # TODO block shape? iris is 3 classes, and we seem to crash when using less than 3 here. + param_set = [ + # {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + # {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + # {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + # {"solver": "newton", "tol": 1e-8, "max_iter": 10}, + # TODO: This is not working. + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "m": 3} + ] + for kwargs in param_set: + runtime = time.time() + lr_model: MultinomialLogisticRegression = MultinomialLogisticRegression( + **kwargs + ) + lr_model.fit(X, y) + runtime = time.time() - runtime + y_pred = lr_model.predict( + X + ) # .get() TODO we should return a nums object not np + # y_pred_proba = lr_model.predict_proba(X).get() # TODO this isn't implemented atm. does it make sense to implement? + # np.allclose(np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]) # TODO not sure if we need this line + print("opt", kwargs["solver"]) + print("runtime", runtime) + # print("norm", lr_model.grad_norm_sq(X, y).get()) # TODO does this matter? + # print("objective", lr_model.objective(X, y).get()) # TODO we don't have this function implemented + print("accuracy", np.sum(y.get().argmax(axis=1) == y_pred) / num_samples) + + +if __name__ == "__main__": + # pylint: disable=import-error + from nums.core import application_manager + + nps_app_inst = application_manager.instance() + test_multinomial_logistic(nps_app_inst) From 0b8ef7a30b523c49ee928887cbe56a09a25f1a95 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 13:12:47 -0700 Subject: [PATCH 02/16] revert changes for autoscaler. --- autoscaler/example.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/autoscaler/example.py b/autoscaler/example.py index eb5cd53e..8b70d5b7 100644 --- a/autoscaler/example.py +++ b/autoscaler/example.py @@ -6,15 +6,15 @@ # Initialize ray and connect it to the cluster. ray.init(address="auto") -# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. -settings.cluster_shape = (len(ray.nodes()) - 1, 1) +# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. 
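+# (The -1 presumably excludes the head node, giving one grid row per worker.)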
+settings.cluster_shape = (len(ray.nodes())-1, 1) def main(): - X = nps.random.rand(10 ** 4) - Y = nps.random.rand(10 ** 4) - Z = nps.add(X, Y) - print("X + Y = ", Z.get()) + X = nps.random.rand(10**4) + Y = nps.random.rand(10**4) + Z = nps.add(X,Y) + print("X + Y = ",Z.get()) if __name__ == "__main__": From 42c5abcb18133463f131cc9944e80e967465f249 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 13:15:12 -0700 Subject: [PATCH 03/16] add sklearn to testing requirements. --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index ec6ed00d..38b60ba0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,3 +33,4 @@ testing = mypy==0.910 black==21.4b0 tqdm + sklearn From 5f513099e4fe75bd890ee0cecb5e963519751ca0 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 13:45:09 -0700 Subject: [PATCH 04/16] pylint. --- nums/models/multinomial_lr.py | 14 ++++++++------ tests/models/test_multinomial_lr.py | 9 ++++----- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py index 8bb1720f..c59c0ee9 100644 --- a/nums/models/multinomial_lr.py +++ b/nums/models/multinomial_lr.py @@ -1,17 +1,13 @@ import numpy as np -import random from nums.core.array.blockarray import BlockArray from nums.core.array.application import ArrayApplication - from nums.core.application_manager import instance as _instance from nums.core.array import utils as array_utils from nums.core.array.random import NumsRandomState -from collections import defaultdict -from nums import numpy as nps +from nums.core.linalg import inv from nums.models.lbfgs import LBFGS -from nums.core.linalg import inv from nums.models.glms import Model @@ -61,6 +57,12 @@ def __init__( self._beta = None self._beta0 = None + self._num_class = None + self.feature_dim = None + self.feature_block_dim = None + self.use_lbfgs_forward = False + self._lambda_id = None + def fit(self, X: BlockArray, y: BlockArray): # Note, it's critically important from a performance point-of-view # to maintain the original block shape of X below, along axis 1. @@ -127,7 +129,6 @@ def fit(self, X: BlockArray, y: BlockArray): thresh=self._tol, dtype=X.dtype, ) - self.beta = beta beta = lbfgs_optimizer.execute(X, y, beta) else: raise Exception("Unsupported optimizer specified %s." % self._opt) @@ -196,6 +197,7 @@ def hessian( mu: BlockArray = None, learning_ends_for_class=None, ): + # pylint: disable=arguments-differ class_count = mu.shape[1] if mu is None: mu = self.forward(X) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 8063ac0a..9f30307b 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -44,11 +44,10 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): real_y, block_shape=(100, 3) ) # TODO block shape? iris is 3 classes, and we seem to crash when using less than 3 here. param_set = [ - # {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - # {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - # {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - # {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - # TODO: This is not working. 
+ {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, + {"solver": "newton", "tol": 1e-8, "max_iter": 10}, {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "m": 3} ] for kwargs in param_set: From 807567e1fe50cc8d8ee77df8ecd63c2ddb736d88 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 13:46:07 -0700 Subject: [PATCH 05/16] black --- autoscaler/example.py | 12 ++++++------ tests/models/test_multinomial_lr.py | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/autoscaler/example.py b/autoscaler/example.py index 8b70d5b7..eb5cd53e 100644 --- a/autoscaler/example.py +++ b/autoscaler/example.py @@ -6,15 +6,15 @@ # Initialize ray and connect it to the cluster. ray.init(address="auto") -# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. -settings.cluster_shape = (len(ray.nodes())-1, 1) +# Set the cluster shape for nums. Here we set it to use all the nodes in the ray cluster. +settings.cluster_shape = (len(ray.nodes()) - 1, 1) def main(): - X = nps.random.rand(10**4) - Y = nps.random.rand(10**4) - Z = nps.add(X,Y) - print("X + Y = ",Z.get()) + X = nps.random.rand(10 ** 4) + Y = nps.random.rand(10 ** 4) + Z = nps.add(X, Y) + print("X + Y = ", Z.get()) if __name__ == "__main__": diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 9f30307b..2a6a4cfb 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -48,7 +48,7 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "m": 3} + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "m": 3}, ] for kwargs in param_set: runtime = time.time() From ab775b6159a2d204ab93e0708a0808d93700e973 Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Wed, 6 Oct 2021 15:03:07 -0700 Subject: [PATCH 06/16] lint. --- tests/models/test_multinomial_lr.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 2a6a4cfb..0fa20a4f 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -17,12 +17,10 @@ import time import numpy as np -import pytest from sklearn.datasets import load_iris from nums.core.array.application import ArrayApplication -from nums.core.storage.storage import BimodalGaussian from nums.models.multinomial_lr import MultinomialLogisticRegression # pylint: disable = protected-access, import-outside-toplevel, import-error @@ -32,7 +30,7 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): data = load_iris() real_X = data["data"] real_y_indices = data["target"] - num_samples, num_features, num_classes = ( + num_samples, _, num_classes = ( real_X.shape[0], real_X.shape[1], real_y_indices.max() + 1, @@ -60,12 +58,16 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): y_pred = lr_model.predict( X ) # .get() TODO we should return a nums object not np - # y_pred_proba = lr_model.predict_proba(X).get() # TODO this isn't implemented atm. does it make sense to implement? 
- # np.allclose(np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]) # TODO not sure if we need this line + # TODO this isn't implemented atm. does it make sense to implement? + # y_pred_proba = lr_model.predict_proba(X).get() + # TODO not sure if we need this line + # np.allclose(np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]) print("opt", kwargs["solver"]) print("runtime", runtime) - # print("norm", lr_model.grad_norm_sq(X, y).get()) # TODO does this matter? - # print("objective", lr_model.objective(X, y).get()) # TODO we don't have this function implemented + # TODO does this matter? + # print("norm", lr_model.grad_norm_sq(X, y).get()) + # TODO we don't have this function implemented + # print("objective", lr_model.objective(X, y).get()) print("accuracy", np.sum(y.get().argmax(axis=1) == y_pred) / num_samples) From 2fd36a7a4381f03b84ad158734c75ee1563a585c Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 13:41:47 -0700 Subject: [PATCH 07/16] make default penalty "l2" --- nums/models/multinomial_lr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py index c59c0ee9..cd5926dd 100644 --- a/nums/models/multinomial_lr.py +++ b/nums/models/multinomial_lr.py @@ -14,7 +14,7 @@ class MultinomialLogisticRegression(Model): def __init__( self, - penalty="none", + penalty="l2", C=1.0, tol=0.0001, max_iter=100, From 0269d5f4ac8d74fce0baaf70a211a74593328ab6 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 13:56:34 -0700 Subject: [PATCH 08/16] minor fix --- nums/models/multinomial_lr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py index cd5926dd..ad98983c 100644 --- a/nums/models/multinomial_lr.py +++ b/nums/models/multinomial_lr.py @@ -111,7 +111,6 @@ def fit(self, X: BlockArray, y: BlockArray): else: beta = block_sgd(self, beta, X, y, tol, max_iter, lr) elif self._opt == "newton" or self._opt == "newton-cg": - beta = newton(self._app, self, beta, X, y, tol, max_iter) if self._penalty == "l2": self._lambda_id = ( self._app.eye( @@ -120,6 +119,7 @@ def fit(self, X: BlockArray, y: BlockArray): ) * self._lambda ) + beta = newton(self._app, self, beta, X, y, tol, max_iter) elif self._opt == "lbfgs": self.use_lbfgs_forward = True lbfgs_optimizer = LBFGS( From 081cd475cfbec623f22df83156dce58cf20902c4 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 15:33:55 -0700 Subject: [PATCH 09/16] add tests with sklearn --- tests/models/test_multinomial_lr.py | 32 +++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 0fa20a4f..31dab91f 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -14,11 +14,10 @@ # limitations under the License. 
-import time - import numpy as np from sklearn.datasets import load_iris +from sklearn.linear_model import LogisticRegression from nums.core.array.application import ArrayApplication from nums.models.multinomial_lr import MultinomialLogisticRegression @@ -27,9 +26,7 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): - data = load_iris() - real_X = data["data"] - real_y_indices = data["target"] + real_X, real_y_indices = load_iris(return_X_y=True) num_samples, _, num_classes = ( real_X.shape[0], real_X.shape[1], @@ -46,34 +43,47 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "m": 3}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, ] + for kwargs in param_set: - runtime = time.time() lr_model: MultinomialLogisticRegression = MultinomialLogisticRegression( **kwargs ) lr_model.fit(X, y) - runtime = time.time() - runtime + # runtime = time.time() - runtime y_pred = lr_model.predict( X ) # .get() TODO we should return a nums object not np + score = np.sum(y.get().argmax(axis=1) == y_pred) / num_samples + # Sklearn multiclass lr only supports 'lbfgs', 'sag', 'saga' and 'newton-cg' solvers. + if kwargs.get("solver") in ["lbfgs", "newton-cg"]: + kwargs.update({"multi_class": "multinomial"}) + # pylint: disable=unexpected-keyword-arg + clf = LogisticRegression(**kwargs).fit(real_X, real_y_indices) + ref_score = clf.score(real_X, real_y_indices) + print("opt", kwargs["solver"]) + print(score, ref_score) + assert np.allclose(score, ref_score, atol=0.03) + # TODO this isn't implemented atm. does it make sense to implement? # y_pred_proba = lr_model.predict_proba(X).get() # TODO not sure if we need this line # np.allclose(np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]) - print("opt", kwargs["solver"]) - print("runtime", runtime) # TODO does this matter? 
# print("norm", lr_model.grad_norm_sq(X, y).get()) # TODO we don't have this function implemented # print("objective", lr_model.objective(X, y).get()) - print("accuracy", np.sum(y.get().argmax(axis=1) == y_pred) / num_samples) if __name__ == "__main__": # pylint: disable=import-error from nums.core import application_manager + import nums.core.settings + nums.core.settings.system_name = "serial" nps_app_inst = application_manager.instance() test_multinomial_logistic(nps_app_inst) From 0264fd67381911e566f08187f28ec2eb901172a3 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 17:54:38 -0700 Subject: [PATCH 10/16] fix for l2 penalty --- nums/models/lbfgs.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nums/models/lbfgs.py b/nums/models/lbfgs.py index 62d710ed..0a404bbb 100644 --- a/nums/models/lbfgs.py +++ b/nums/models/lbfgs.py @@ -112,7 +112,7 @@ def execute(self, X, y, theta): dtype=self.dtype, ) - g = self.model.gradient(X, y, self.model.forward(X, theta)) + g = self.model.gradient(X, y, self.model.forward(X, theta), theta) next_g = None next_theta = None while self.k < self.max_iter: @@ -139,7 +139,9 @@ def execute(self, X, y, theta): # Terminate immediately if this is the last iteration. theta = next_theta break - next_g = self.model.gradient(X, y, self.model.forward(X, next_theta)) + next_g = self.model.gradient( + X, y, self.model.forward(X, next_theta), next_theta + ) theta_diff = next_theta - theta grad_diff = next_g - g mem: LBFGSMemory = LBFGSMemory(k=self.k, s=theta_diff, y=grad_diff) From 20347d4321403df72ef1ec211302616214b6bfa1 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 18:26:51 -0700 Subject: [PATCH 11/16] drop this commit --- tests/models/test_multinomial_lr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 31dab91f..57831ba0 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -43,10 +43,10 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10, "penalty": "none"}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "penalty": "none"}, ] for kwargs in param_set: From 5c6103d76cd46d9114703bc87e3071f77aa82e79 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Wed, 13 Oct 2021 18:28:57 -0700 Subject: [PATCH 12/16] Revert "drop this commit" This reverts commit 20347d4321403df72ef1ec211302616214b6bfa1. 
--- tests/models/test_multinomial_lr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 57831ba0..31dab91f 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -43,10 +43,10 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 10, "penalty": "none"}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 10, "penalty": "none"}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, ] for kwargs in param_set: From b35bf8c790cdc6baa08092a193f7281fdc1e645a Mon Sep 17 00:00:00 2001 From: Melih Elibol Date: Fri, 15 Oct 2021 13:18:45 -0700 Subject: [PATCH 13/16] fix comments --- nums/models/glms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nums/models/glms.py b/nums/models/glms.py index 19180164..c1d996d1 100644 --- a/nums/models/glms.py +++ b/nums/models/glms.py @@ -31,7 +31,7 @@ # # The link function is expressed as follows. # E(Y | X) = mu -# Define the linear predictor eta = X.T @ beta +# Define the linear predictor eta = X @ beta # Define g as the link function, so that g(mu) = eta # E(Y | X) = g^{-1}(eta) # From 2c1c8f1c961052174ef4bd5cf602b09d365ce330 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Fri, 22 Oct 2021 02:10:43 -0700 Subject: [PATCH 14/16] fix --- nums/models/multinomial_lr.py | 12 ++++++++---- tests/models/test_multinomial_lr.py | 20 +++----------------- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/nums/models/multinomial_lr.py b/nums/models/multinomial_lr.py index ad98983c..a8a653e7 100644 --- a/nums/models/multinomial_lr.py +++ b/nums/models/multinomial_lr.py @@ -54,6 +54,7 @@ def __init__( self._opt = solver self._lr = lr self._m = m + self.lbfgs_beta = None self._beta = None self._beta0 = None @@ -91,8 +92,8 @@ def fit(self, X: BlockArray, y: BlockArray): self.feature_block_dim = X.block_shape[1] beta: BlockArray = self._app.zeros( - (X.shape[1], self._num_class), - (X.block_shape[1], self._num_class), + (self.feature_dim, self._num_class), + (self.feature_block_dim, self._num_class), dtype=float, ) tol: BlockArray = self._app.scalar(self._tol) @@ -130,6 +131,7 @@ def fit(self, X: BlockArray, y: BlockArray): dtype=X.dtype, ) beta = lbfgs_optimizer.execute(X, y, beta) + self.lbfgs_beta = beta else: raise Exception("Unsupported optimizer specified %s." 
% self._opt) self._beta0 = beta[-1] @@ -146,7 +148,6 @@ def lbfgs_forward(self, X, theta): mu = unnormalized_probs / self._app.sum(unnormalized_probs, axis=1).expand_dims( -1 ) - # print('mu', mu.get()[0]) return mu # probabilities for each class def objective( @@ -239,7 +240,10 @@ def grad_norm_sq(self, X: BlockArray, y: BlockArray, beta=None): return self._app.sum(g * g) def predict(self, X: BlockArray): - pred = self.forward(X).get() + if self.lbfgs_beta: + pred = self.forward(X, self.lbfgs_beta).get() + else: + pred = self.forward(X).get() return np.argmax(pred, axis=-1) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 31dab91f..72c73670 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -22,8 +22,6 @@ from nums.core.array.application import ArrayApplication from nums.models.multinomial_lr import MultinomialLogisticRegression -# pylint: disable = protected-access, import-outside-toplevel, import-error - def test_multinomial_logistic(nps_app_inst: ArrayApplication): real_X, real_y_indices = load_iris(return_X_y=True) @@ -54,10 +52,7 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): **kwargs ) lr_model.fit(X, y) - # runtime = time.time() - runtime - y_pred = lr_model.predict( - X - ) # .get() TODO we should return a nums object not np + y_pred = lr_model.predict(X) score = np.sum(y.get().argmax(axis=1) == y_pred) / num_samples # Sklearn multiclass lr only supports 'lbfgs', 'sag', 'saga' and 'newton-cg' solvers. if kwargs.get("solver") in ["lbfgs", "newton-cg"]: @@ -65,19 +60,10 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): # pylint: disable=unexpected-keyword-arg clf = LogisticRegression(**kwargs).fit(real_X, real_y_indices) ref_score = clf.score(real_X, real_y_indices) - print("opt", kwargs["solver"]) - print(score, ref_score) + # print("opt", kwargs["solver"]) + # print(score, ref_score) assert np.allclose(score, ref_score, atol=0.03) - # TODO this isn't implemented atm. does it make sense to implement? - # y_pred_proba = lr_model.predict_proba(X).get() - # TODO not sure if we need this line - # np.allclose(np.ones(shape=(y.shape[0],)), y_pred_proba[:, 0] + y_pred_proba[:, 1]) - # TODO does this matter? 
- # print("norm", lr_model.grad_norm_sq(X, y).get()) - # TODO we don't have this function implemented - # print("objective", lr_model.objective(X, y).get()) - if __name__ == "__main__": # pylint: disable=import-error From 69519da5a6c266f0ba1c7c8a89e624d1b987bc73 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Fri, 22 Oct 2021 15:40:53 -0700 Subject: [PATCH 15/16] skip long tests --- tests/models/test_multinomial_lr.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/models/test_multinomial_lr.py b/tests/models/test_multinomial_lr.py index 72c73670..d39b6aaf 100644 --- a/tests/models/test_multinomial_lr.py +++ b/tests/models/test_multinomial_lr.py @@ -15,6 +15,7 @@ import numpy as np +import pytest from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression @@ -23,7 +24,10 @@ from nums.models.multinomial_lr import MultinomialLogisticRegression -def test_multinomial_logistic(nps_app_inst: ArrayApplication): +@pytest.mark.parametrize("max_iter", [10, 2000]) +def test_multinomial_logistic(nps_app_inst: ArrayApplication, max_iter): + if max_iter > 100: + pytest.skip("skipping long tests") real_X, real_y_indices = load_iris(return_X_y=True) num_samples, _, num_classes = ( real_X.shape[0], @@ -37,14 +41,14 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): real_y, block_shape=(100, 3) ) # TODO block shape? iris is 3 classes, and we seem to crash when using less than 3 here. param_set = [ - {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": 10}, - {"solver": "newton", "tol": 1e-8, "max_iter": 10}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000}, - {"solver": "newton-cg", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, - {"solver": "lbfgs", "tol": 1e-8, "max_iter": 1000, "penalty": "none"}, + {"solver": "gd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter}, + {"solver": "sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter}, + {"solver": "block_sgd", "lr": 1e-6, "tol": 1e-8, "max_iter": max_iter}, + {"solver": "newton", "tol": 1e-8, "max_iter": max_iter}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": max_iter}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": max_iter}, + {"solver": "newton-cg", "tol": 1e-8, "max_iter": max_iter, "penalty": "none"}, + {"solver": "lbfgs", "tol": 1e-8, "max_iter": max_iter, "penalty": "none"}, ] for kwargs in param_set: @@ -72,4 +76,4 @@ def test_multinomial_logistic(nps_app_inst: ArrayApplication): nums.core.settings.system_name = "serial" nps_app_inst = application_manager.instance() - test_multinomial_logistic(nps_app_inst) + test_multinomial_logistic(nps_app_inst, 1000) From 482dcba9b8212cfee875796181c16325041d5b87 Mon Sep 17 00:00:00 2001 From: Vinamra Benara <9059893+vinamrabenara@users.noreply.github.com> Date: Thu, 28 Oct 2021 15:05:54 -0700 Subject: [PATCH 16/16] misc --- nums/models/lbfgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nums/models/lbfgs.py b/nums/models/lbfgs.py index 6c788ad5..07384645 100644 --- a/nums/models/lbfgs.py +++ b/nums/models/lbfgs.py @@ -64,7 +64,7 @@ def __init__(self, k, s, y): class LBFGS(object): - def __init__(self, model: GLM, m=10, max_iter=100, thresh=1e-4, dtype=np.float64): + def __init__(self, model: Model, m=10, max_iter=100, thresh=1e-4, 
dtype=np.float64): self.app: ArrayApplication = _instance() self.model: Model = model self.m = m
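
For orientation, a minimal end-to-end sketch of the interface this series
introduces, assembled from tests/models/test_multinomial_lr.py above. It
assumes a configured NumS backend (application_manager.instance() returns the
ArrayApplication, e.g. the "serial" system selected in the test's __main__
block); every call used below appears in the patches.

    import numpy as np
    from sklearn.datasets import load_iris

    from nums.core import application_manager
    from nums.models.multinomial_lr import MultinomialLogisticRegression

    # Load iris and one-hot encode the targets, as the test does.
    real_X, real_y_indices = load_iris(return_X_y=True)
    num_samples = real_X.shape[0]
    num_classes = real_y_indices.max() + 1
    real_y = np.zeros((num_samples, num_classes))
    real_y[np.arange(num_samples), real_y_indices] = 1

    app = application_manager.instance()
    X = app.array(real_X, block_shape=(100, 3))
    y = app.array(real_y, block_shape=(100, 3))

    # solver may be any of "gd", "sgd", "block_sgd", "newton", "newton-cg",
    # or "lbfgs"; penalty defaults to "l2" as of patch 07.
    model = MultinomialLogisticRegression(solver="newton-cg", tol=1e-8, max_iter=10)
    model.fit(X, y)
    y_pred = model.predict(X)  # NumPy array of class indices
    print("accuracy", np.sum(real_y_indices == y_pred) / num_samples)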