From 0ec435199ee54f2a97990d0a1cd337c1f90aeea2 Mon Sep 17 00:00:00 2001 From: Daniel Zou Date: Wed, 3 Aug 2022 22:11:06 -0500 Subject: [PATCH 1/5] Refactor Sparse/BlockArray API, RHS binary ops --- nums/core/array/application.py | 4 +- nums/core/array/base.py | 794 ++++++++++++++---- nums/core/array/blockarray.py | 521 +++--------- nums/core/array/random.py | 12 +- nums/core/array/sparse.py | 595 ++++++------- nums/core/array/utils.py | 25 - nums/core/array/view.py | 32 +- nums/core/kernel/kernel_interface.py | 3 + nums/core/kernel/numpy_kernel.py | 10 +- nums/experimental/optimizer/graph.py | 12 +- nums/experimental/optimizer/grapharray.py | 57 +- nums/experimental/optimizer/reduction_ops.py | 2 +- tests/core/array/test_bop.py | 6 +- tests/core/array/test_sparse.py | 31 +- tests/experimental/optimizer/test_copy.py | 2 +- .../experimental/optimizer/test_tensordot.py | 6 +- 16 files changed, 1157 insertions(+), 955 deletions(-) diff --git a/nums/core/array/application.py b/nums/core/array/application.py index 48654617..ce148994 100644 --- a/nums/core/array/application.py +++ b/nums/core/array/application.py @@ -917,7 +917,7 @@ def map_uop( # TODO(hme): Faster to create ndarray first, # and instantiate block array on return # to avoid instantiating blocks on BlockArray initialization. - rarr.blocks[grid_entry] = arr.blocks[grid_entry].uop_map( + rarr.blocks[grid_entry] = arr.blocks[grid_entry].map_uop( op_name, args=args, kwargs=kwargs ) return rarr @@ -928,7 +928,7 @@ def matmul(self, arr_1: BlockArray, arr_2: BlockArray) -> BlockArray: def tensordot( self, arr_1: BlockArray, arr_2: BlockArray, axes: int = 2 ) -> BlockArray: - return arr_1.tensordot(arr_2, axes) + return arr_1.tensordot(arr_1, arr_2, axes) def einsum(self, subscript, *operands): def _compute_syskwargs(blocks): diff --git a/nums/core/array/base.py b/nums/core/array/base.py index 9d0e34c5..04b3a534 100644 --- a/nums/core/array/base.py +++ b/nums/core/array/base.py @@ -23,11 +23,7 @@ from nums.core.grid.grid import ArrayGrid -class Block: - # pylint: disable=redefined-builtin, global-statement - # TODO(hme): Create a base class, and move this concrete class into blockarray.py. - # Do this when we implement a SparseBlock object. - +class BlockBase: block_id_counter = -1 def __init__( @@ -40,7 +36,7 @@ def __init__( km: KernelManager, id=None, ): - self._km = km + self.km = km self.grid_entry: tuple = grid_entry self.grid_shape: tuple = grid_shape self.oid: np.object = None @@ -56,31 +52,34 @@ def __init__( self._device = None self.fill_value = None - @property - def nnz(self): - return np.prod(self.shape) - @property def is_dense(self): return self.fill_value is None + @property + def nbytes(self): + raise NotImplementedError() + def __repr__(self): - return "Block(" + str(self.oid) + ")" + raise NotImplementedError() def size(self): return np.product(self.shape) - def copy(self, shallow=True): - assert shallow, "Only shallow copies are currently supported." 
- block = Block( - self.grid_entry, - self.grid_shape, - self.shape, - self.dtype, - self.transposed, - self._km, + def copy(self): + raise NotImplementedError() + + def get(self): + return self.km.get(self.oid) + + def astype(self, dtype): + block = self.copy() + block.dtype = dtype + block.oid = self.km.astype( + self.oid, + dtype.__name__, + syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}, ) - block.oid = self.oid return block def true_grid_entry(self): @@ -93,40 +92,16 @@ def true_grid_shape(self): return tuple(reversed(self.grid_shape)) return self.grid_shape + def transpose(self, defer=False, redistribute=False): + raise NotImplementedError() + def device(self): if self._device is not None: return self._device - return self._km.device_grid.get_device( + return self.km.device_grid.get_device( self.true_grid_entry(), self.true_grid_shape() ) - def transpose(self, defer=False, redistribute=False): - # If defer is True, this operation does not modify the remote object. - # If defer is True and redistribute is False, - # this operation does not move the remote object. - grid_entryT = tuple(reversed(self.grid_entry)) - grid_shapeT = tuple(reversed(self.grid_shape)) - blockT = Block( - grid_entry=grid_entryT, - grid_shape=grid_shapeT, - shape=tuple(reversed(self.shape)), - dtype=self.dtype, - transposed=not self.transposed, - km=self._km, - ) - blockT.oid = self.oid - if not defer: - blockT.transposed = False - if redistribute: - syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT} - else: - syskwargs = { - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - } - blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs) - return blockT - def swapaxes(self, axis1, axis2): block = self.copy() grid_entry = list(block.grid_entry) @@ -141,7 +116,7 @@ def swapaxes(self, axis1, axis2): block.grid_shape = tuple(grid_shape) block.shape = tuple(shape) - block.oid = self._km.swapaxes( + block.oid = self.km.swapaxes( block.oid, axis1, axis2, @@ -149,48 +124,20 @@ def swapaxes(self, axis1, axis2): ) return block + def map_uop(self, op_name, args=None, kwargs=None, device=None): + raise NotImplementedError() + def ufunc(self, op_name, device=None): - return self.uop_map(op_name, device=device) + return self.map_uop(op_name, device=device) - def uop_map(self, op_name, args=None, kwargs=None, device=None): - # This retains transpose. - block = self.copy() - block.dtype = array_utils.get_uop_output_type(op_name, self.dtype) - args = () if args is None else args - kwargs = {} if kwargs is None else kwargs - if device is None: - syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape} - else: - syskwargs = {"device": device} - block._device = device - block.oid = self._km.map_uop( - op_name, self.oid, args, kwargs, syskwargs=syskwargs - ) - return block + def block_from_scalar(self, other): + raise NotImplementedError() - def _block_from_other(self, other): - # Assume other is numeric. - # This only occurs during some numpy operations (e.g. np.mean), - # where a literal is used in the operation. - assert isinstance(other, (int, float, np.int, np.float)) - block = Block( - self.grid_entry, - self.grid_shape, - (1,), - self.dtype, - False, - self._km, - ) - # We pass syskwargs here for correct node placement for `other`, - # which should be local to self. 
- block.oid = self._km.put( - np.array(other, dtype=self.dtype), - syskwargs={ - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - }, - ) - return block + def conjugate(self): + return self.ufunc("conjugate") + + def sqrt(self): + return self.ufunc("sqrt") @staticmethod def block_meta(op, block1, block2, args): @@ -227,77 +174,105 @@ def block_meta(op, block1, block2, args): @staticmethod def init_block(op_name, block1, block2, args, device=None): - result_grid_entry, result_grid_shape, result_shape, dtype = Block.block_meta( - op_name, block1, block2, args - ) - block = Block( - grid_entry=result_grid_entry, - grid_shape=result_grid_shape, - shape=result_shape, - dtype=dtype, - transposed=False, - km=block1._km, - ) - block._device = device - return block + raise NotImplementedError() - def bop(self, op_name, other, args: dict, device=None): - if not isinstance(other, Block): - other = self._block_from_other(other) - block: Block = self.init_block(op_name, self, other, args, device) - if device is None: - syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape} - else: - syskwargs = {"device": device} - block.oid = self._km.bop( - op_name, - self.oid, - other.oid, - self.transposed, - other.transposed, - axes=args.get("axes"), - syskwargs=syskwargs, - ) - return block + def _check_bop_implemented(self, other): + raise NotImplementedError() + + @staticmethod + def binary_op(op_name, a, b, args: dict, device=None): + raise NotImplementedError() + + def bop(self, op_name, other, args: dict, device=None, **kwargs): + raise NotImplementedError() def tensordot(self, other, axes): - return self.bop("tensordot", other, args={"axes": axes}) + raise NotImplementedError() def __add__(self, other): - return self.bop("add", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("add", self, other, args={}) + + def __radd__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("add", other, self, args={}) def __sub__(self, other): - return self.bop("sub", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("sub", self, other, args={}) + + def __rsub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("sub", other, self, args={}) def __mul__(self, other): - return self.bop("mul", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("mul", self, other, args={}) - def __matmul__(self, other): - return self.tensordot(other, axes=1) + def __rmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("mul", other, self, args={}) def __truediv__(self, other): - return self.bop("truediv", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("truediv", self, other, args={}) + + def __rtruediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("trudiv", other, self, args={}) def __pow__(self, other): - return self.bop("pow", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("pow", self, other, args={}) + + def __rpow__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("pow", other, self, args={}) + + def __matmul__(self, 
other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.tensordot(other, axes=1) def __ge__(self, other): - return self.bop("ge", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("ge", self, other, args={}) def __gt__(self, other): - return self.bop("gt", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("gt", self, other, args={}) def __le__(self, other): - return self.bop("le", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("le", self, other, args={}) def __lt__(self, other): - return self.bop("lt", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("lt", self, other, args={}) def __eq__(self, other): - return self.bop("eq", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("eq", self, other, args={}) def __ne__(self, other): - return self.bop("ne", other, args={}) + if not self._check_bop_implemented(other): + return NotImplemented + return self.binary_op("ne", self, other, args={}) __iadd__ = __add__ __isub__ = __sub__ @@ -306,24 +281,169 @@ def __ne__(self, other): __itruediv__ = __truediv__ __ipow__ = __pow__ - def astype(self, dtype): + +class Block(BlockBase): + # pylint: disable=redefined-builtin, global-statement + + def __init__( + self, + grid_entry, + grid_shape, + shape, + dtype, + transposed, + km: KernelManager, + id=None, + ): + super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id) + + @property + def nbytes(self): + return np.prod(self.shape) * np.dtype(self.dtype).itemsize + + def __repr__(self): + return "Block(" + str(self.oid) + ")" + + def copy(self, shallow=True): + assert shallow, "Only shallow copies are currently supported." + block = Block( + self.grid_entry, + self.grid_shape, + self.shape, + self.dtype, + self.transposed, + self.km, + ) + block.oid = self.oid + return block + + def transpose(self, defer=False, redistribute=False): + # If defer is True, this operation does not modify the remote object. + # If defer is True and redistribute is False, + # this operation does not move the remote object. + grid_entryT = tuple(reversed(self.grid_entry)) + grid_shapeT = tuple(reversed(self.grid_shape)) + blockT = Block( + grid_entry=grid_entryT, + grid_shape=grid_shapeT, + shape=tuple(reversed(self.shape)), + dtype=self.dtype, + transposed=not self.transposed, + km=self.km, + ) + blockT.oid = self.oid + if not defer: + blockT.transposed = False + if redistribute: + syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT} + else: + syskwargs = { + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + } + blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs) + return blockT + + def map_uop(self, op_name, args=None, kwargs=None, device=None): + # This retains transpose. 
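+        # `map_uop` applies the named unary kernel op to this block's remote
+        # object, inferring the result dtype via get_uop_output_type, and runs it
+        # on this block's grid entry unless an explicit device is given.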
block = self.copy() - block.dtype = dtype - block.oid = self._km.astype( - self.oid, - dtype.__name__, - syskwargs={"grid_entry": block.grid_entry, "grid_shape": block.grid_shape}, + block.dtype = array_utils.get_uop_output_type(op_name, self.dtype) + args = () if args is None else args + kwargs = {} if kwargs is None else kwargs + if device is None: + syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape} + else: + syskwargs = {"device": device} + block._device = device + block.oid = self.km.map_uop( + op_name, self.oid, args, kwargs, syskwargs=syskwargs ) return block - def conjugate(self): - return self.ufunc("conjugate") + def block_from_scalar(self, other): + # Assume other is numeric. + # This only occurs during some numpy operations (e.g. np.mean), + # where a literal is used in the operation. + assert array_utils.is_scalar(other) + block = Block( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + # We pass syskwargs here for correct node placement for `other`, + # which should be local to self. + block.oid = self.km.put( + np.array(other, dtype=self.dtype), + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) + return block - def sqrt(self): - return self.ufunc("sqrt") + @staticmethod + def init_block(op_name, block1, block2, args, device=None): + ( + result_grid_entry, + result_grid_shape, + result_shape, + dtype, + ) = BlockBase.block_meta(op_name, block1, block2, args) + block = Block( + grid_entry=result_grid_entry, + grid_shape=result_grid_shape, + shape=result_shape, + dtype=dtype, + transposed=False, + km=block1.km, + ) + block._device = device + return block - def get(self): - return self._km.get(self.oid) + def _check_bop_implemented(self, other): + if isinstance(other, Block) or array_utils.is_scalar(other): + return True + return False + + @staticmethod + def binary_op(op_name, a, b, args: dict, device=None): + if isinstance(a, Block) and array_utils.is_scalar(b): + b = a.block_from_scalar(b) + elif isinstance(b, Block) and array_utils.is_scalar(a): + a = b.block_from_scalar(a) + if not isinstance(a, Block) or not isinstance(b, Block): + raise NotImplementedError() + + block: Block = a.init_block(op_name, a, b, args, device) + if device is None: + syskwargs = {"grid_entry": block.grid_entry, "grid_shape": block.grid_shape} + else: + syskwargs = {"device": device} + block.oid = a.km.bop( + op_name, + a.oid, + b.oid, + a.transposed, + b.transposed, + axes=args.get("axes"), + syskwargs=syskwargs, + ) + return block + + def bop(self, op_name, other, args: dict, device=None): + try: + return self.binary_op(op_name, self, other, args, device) + except NotImplementedError: + return other.binary_op(op_name, self, other, args, device) + + def tensordot(self, other, axes): + try: + return self.binary_op("tensordot", self, other, args={"axes": axes}) + except NotImplementedError: + return other.binary_op("tensordot", self, other, args={"axes": axes}) class BlockArrayBase: @@ -336,30 +456,9 @@ def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None self.size = np.product(self.shape) self.ndim = len(self.shape) self.dtype = self.grid.dtype - try: - self.nbytes = self.grid.nbytes() - except ValueError as _: - self.nbytes = None self.blocks = blocks - if self.blocks is None: - # TODO (hme): Subclass np.ndarray for self.blocks instances, - # and override key methods to better integrate with NumPy's ufuncs. 
- self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block) - for grid_entry in self.grid.get_entry_iterator(): - self.blocks[grid_entry] = Block( - grid_entry=grid_entry, - grid_shape=self.grid.grid_shape, - shape=self.grid.get_block_shape(grid_entry), - dtype=self.dtype, - transposed=False, - km=self.km, - ) self.fill_value = None - @property - def nnz(self): - return np.prod(self.shape) - @property def is_dense(self): return self.fill_value is None @@ -369,7 +468,7 @@ def __repr__(self): def get(self) -> np.ndarray: result: np.ndarray = np.zeros(shape=self.grid.shape, dtype=self.grid.dtype) - block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int) + block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int64) arrays: list = self.km.get( [ self.blocks[grid_entry].oid @@ -378,16 +477,50 @@ def get(self) -> np.ndarray: ) for block_index, grid_entry in enumerate(self.grid.get_entry_iterator()): start = block_shape * grid_entry - entry_shape = np.array(self.grid.get_block_shape(grid_entry), dtype=np.int) + entry_shape = np.array( + self.grid.get_block_shape(grid_entry), dtype=np.int64 + ) end = start + entry_shape slices = tuple(map(lambda item: slice(*item), zip(*(start, end)))) - block: Block = self.blocks[grid_entry] + block: BlockBase = self.blocks[grid_entry] arr: np.ndarray = arrays[block_index] if block.transposed: arr = arr.T result[slices] = arr.reshape(block.shape) return result + def touch(self): + """ + "Touch" an array. This is an efficient distributed "wait" operation. + """ + oids = [] + for grid_entry in self.grid.get_entry_iterator(): + block: BlockBase = self.blocks[grid_entry] + oids.append( + self.km.touch( + block.oid, + syskwargs={ + "grid_entry": block.grid_entry, + "grid_shape": block.grid_shape, + }, + ) + ) + self.km.get(oids) + return self + + def copy(self): + raise NotImplementedError() + + def astype(self, dtype): + raise NotImplementedError() + + def flattened_oids(self): + oids = [] + for grid_entry in self.grid.get_entry_iterator(): + oid = self.blocks[grid_entry].oid + oids.append(oid) + return oids + def broadcast_to(self, shape): b = array_utils.broadcast(self.shape, shape) result_block_shape = array_utils.broadcast_block_shape( @@ -410,3 +543,324 @@ def broadcast_to(self, shape): broadcast = it.itviews[0] result.blocks = broadcast return result + + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape + ): + """ + Basic tree reduce imp. + Schedules op on same node as left operand. + :param op_name: The reduction op. + :param blocks_or_oids: A list of type Block or a list of tuples. + Tuples must be of the form + (oid, grid_entry, grid_shape, transposed) + :param result_grid_entry: The grid entry of the result block. This will be used + to compute the final reduction step. + :param result_grid_shape: The grid entry of the result block. This will be used + to compute the final reduction step. + :return: The oid of the result. 
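+
+        Operands are reduced pairwise in queue order; only the final reduction
+        step is placed on the result block's grid entry.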
+ """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + + def check_or_convert_other(self, other, compute_block_shape=False): + raise NotImplementedError() + + def _check_bop_implemented(self, other): + raise NotImplementedError() + + # All operators: https://docs.python.org/3/library/operator.html + + ################# + # Unary functions + ################# + + def ufunc(self, op_name): + raise NotImplementedError() + + def __neg__(self): + return self.ufunc("negative") + + def __pos__(self): + return self + + def __abs__(self): + return self.ufunc("abs") + + def __invert__(self): + return self.ufunc("invert") + + ################# + # Arithmetic + ################# + + @staticmethod + def elementwise(op_name, a, b): + raise NotImplementedError() + + def __mod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", self, other) + + def __rmod__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mod", other, self) + + __imod__ = __mod__ + + def __add__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", self, other) + + def __radd__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("add", other, self) + + __iadd__ = __add__ + + def __sub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", self, other) + + def __rsub__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("sub", other, self) + + __isub__ = __sub__ + + def __mul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", self, other) + + def __rmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("mul", other, self) + + __imul__ = __mul__ + + def __truediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", self, other) + + def __rtruediv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("truediv", other, self) + + __itruediv__ = __truediv__ + + def __floordiv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("floor_divide", self, other) + + def __rfloordiv__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("floor_divide", other, self) + + __ifloordiv__ = __floordiv__ + + def __pow__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("pow", self, other) + + def __rpow__(self, other): + if not 
self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("pow", other, self) + + __ipow__ = __pow__ + + ################## + # Boolean + ################## + + # TODO (hme): Type check bool ops. + def __bool__(self): + # pylint: disable=no-member + if np.sum(self.shape) == len(self.shape): + # If all ones or scalar, then this is defined. + return self.get().__bool__() + return True + + def __or__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_or", self, other) + + def __ror__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_or", other, self) + + __ior__ = __or__ + + def __and__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_and", self, other) + + def __rand__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_and", other, self) + + __iand__ = __and__ + + def __xor__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_xor", self, other) + + def __rxor__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("bitwise_xor", other, self) + + __ixor__ = __xor__ + + def __lshift__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("left_shift", self, other) + + def __rlshift__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("left_shift", other, self) + + __ilshift__ = __lshift__ + + def __rshift__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("right_shift", self, other) + + def __rrshift__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.elementwise("right_shift", other, self) + + __irshift__ = __rshift__ + + ################# + # Linear Algebra + ################# + + def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block): + # Schedule on larger block. + if np.product(self_block.shape) >= np.product(other_block.shape): + return self_block.true_grid_entry(), self_block.true_grid_shape() + else: + return other_block.true_grid_entry(), other_block.true_grid_shape() + + @staticmethod + def tensordot(a, b, axes=2): + raise NotImplementedError() + + def __matmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + if len(self.shape) > 2: + # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet. + raise NotImplementedError( + "Matrix multiply for tensors of rank > 2 not supported yet." 
+ ) + else: + return self.tensordot(self, other, 1) + + def __rmatmul__(self, other): + if not self._check_bop_implemented(other): + return NotImplemented + return self.tensordot(other, self, 1) + + __imatmul__ = __matmul__ + + ################# + # Inequalities + ################# + + def __inequality__(self, op, other): + raise NotImplementedError() + + def __ge__(self, other): + return self.__inequality__("ge", other) + + def __rge__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("ge", self) + + def __gt__(self, other): + return self.__inequality__("gt", other) + + def __rgt__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("gt", self) + + def __le__(self, other): + return self.__inequality__("le", other) + + def __rle__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("le", self) + + def __lt__(self, other): + return self.__inequality__("lt", other) + + def __rlt__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("lt", self) + + def __eq__(self, other): + return self.__inequality__("eq", other) + + def __req__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("eq", self) + + def __ne__(self, other): + return self.__inequality__("ne", other) + + def __rne__(self, other): + other = self.check_or_convert_other(other) + return other.__inequality__("ne", self) diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py index 46a7daf3..9fccb971 100644 --- a/nums/core/array/blockarray.py +++ b/nums/core/array/blockarray.py @@ -17,7 +17,6 @@ import itertools import numpy as np -import nums.core.array.sparse as array_sparse from nums.core.array import utils as array_utils from nums.core.array.base import BlockArrayBase, Block @@ -30,6 +29,28 @@ class BlockArray(BlockArrayBase): + def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None): + if blocks is not None: + assert blocks.dtype == Block, "BlockArray must be initialized with Blocks" + super().__init__(grid, km, blocks) + try: + self.nbytes = self.grid.nbytes() + except ValueError as _: + self.nbytes = None + if self.blocks is None: + # TODO (hme): Subclass np.ndarray for self.blocks instances, + # and override key methods to better integrate with NumPy's ufuncs. + self.blocks = np.empty(shape=self.grid.grid_shape, dtype=Block) + for grid_entry in self.grid.get_entry_iterator(): + self.blocks[grid_entry] = Block( + grid_entry=grid_entry, + grid_shape=self.grid.grid_shape, + shape=self.grid.get_block_shape(grid_entry), + dtype=self.dtype, + transposed=False, + km=self.km, + ) + @classmethod def empty(cls, shape, block_shape, dtype, km: KernelManager): return BlockArray.create("empty", shape, block_shape, dtype, km) @@ -115,24 +136,12 @@ def copy(self): rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy() return rarr_copy - def touch(self): - """ - "Touch" an array. This is an efficient distributed "wait" operation. 
- """ - oids = [] - for grid_entry in self.grid.get_entry_iterator(): - block: Block = self.blocks[grid_entry] - oids.append( - self.km.touch( - block.oid, - syskwargs={ - "grid_entry": block.grid_entry, - "grid_shape": block.grid_shape, - }, - ) - ) - self.km.get(oids) - return self + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result def is_single_block(self): return self.blocks.size == 1 @@ -323,7 +332,7 @@ def __getitem__(self, item): av: ArrayView = ArrayView.from_block_array(self) # TODO (hme): We don't have to create, but do so for now until we need to optimize. - return av[ss].create(BlockArray) + return av[ss].create() def _advanced_single_array_select(self, ss: tuple, axis: int = 0): # Create output array along the axis of the selection operation. @@ -635,60 +644,19 @@ def check_or_convert_other(self, other, compute_block_shape=False): block_shape = None if compute_block_shape else self.block_shape return BlockArray.to_block_array(other, self.km, block_shape=block_shape) + def _check_bop_implemented(self, other): + if isinstance(other, (BlockArray, np.ndarray, list)) or array_utils.is_scalar( + other + ): + return True + return False + def ufunc(self, op_name): result = self.copy() for grid_entry in self.grid.get_entry_iterator(): result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name) return result - def tree_reduce( - self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape - ): - """ - Basic tree reduce imp. - Schedules op on same node as left operand. - :param op_name: The reduction op. - :param blocks_or_oids: A list of type Block or a list of tuples. - Tuples must be of the form - (oid, grid_entry, grid_shape, transposed) - :param result_grid_entry: The grid entry of the result block. This will be used - to compute the final reduction step. - :param result_grid_shape: The grid entry of the result block. This will be used - to compute the final reduction step. - :return: The oid of the result. - """ - oid_list = blocks_or_oids - if isinstance(blocks_or_oids[0], Block): - oid_list = [ - (b.oid, b.grid_entry, b.grid_shape, b.transposed) - for b in blocks_or_oids - ] - if len(oid_list) == 1: - return oid_list[0][0] - q = oid_list - while len(q) > 1: - a_oid, a_ge, a_gs, a_T = q.pop(0) - b_oid, _, _, b_T = q.pop(0) - ge, gs = ( - (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) - ) - c_oid = self.km.bop_reduce( - op_name, - a_oid, - b_oid, - a_T, - b_T, - syskwargs={ - "grid_entry": ge, - "grid_shape": gs, - }, - ) - q.append((c_oid, ge, gs, False)) - r_oid, r_ge, r_gs, _ = q.pop(0) - assert r_ge == result_grid_entry - assert r_gs == result_grid_shape - return r_oid - def reduce_axis(self, op_name, axis, keepdims=False): if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): raise NotImplementedError("Only integer axis is currently supported.") @@ -770,17 +738,66 @@ def reduce_axis(self, op_name, axis, keepdims=False): return result ################# - # Linear Algebra + # Arithmetic ################# - def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block): - # Schedule on larger block. 
- if np.product(self_block.shape) >= np.product(other_block.shape): - return self_block.true_grid_entry(), self_block.true_grid_shape() + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, BlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, BlockArray): + a = b.check_or_convert_other(a) else: - return other_block.true_grid_entry(), other_block.true_grid_shape() + raise NotImplementedError() + + if a.shape == b.shape and a.block_shape == b.block_shape: + return BlockArray._fast_elementwise(op_name, a, b) + blocks_op = a.blocks.__getattribute__("__%s__" % op_name) + return BlockArray.from_blocks(blocks_op(b.blocks), result_shape=None, km=a.km) + + @staticmethod + def _fast_elementwise(op_name, a, b): + """ + Implements fast scheduling for basic element-wise operations. + """ + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) + # Schedule the op first. + blocks = np.empty(shape=a.grid.grid_shape, dtype=Block) + for grid_entry in a.grid.get_entry_iterator(): + a_block: Block = a.blocks[grid_entry] + b_block: Block = b.blocks[grid_entry] + blocks[grid_entry] = block = Block( + grid_entry=grid_entry, + grid_shape=a_block.grid_shape, + shape=a_block.shape, + dtype=dtype, + transposed=False, + km=a.km, + ) + block.oid = a.km.bop( + op_name, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, + axes={}, + syskwargs={ + "grid_entry": grid_entry, + "grid_shape": a.grid.grid_shape, + }, + ) + return BlockArray( + ArrayGrid(a.shape, a.block_shape, dtype.__name__), + a.km, + blocks=blocks, + ) + + ################# + # Linear Algebra + ################# - def tensordot(self, other, axes=2): + @staticmethod + def tensordot(a, b, axes=2): if isinstance(axes, int): pass elif array_utils.is_array_like(axes): @@ -788,59 +805,58 @@ def tensordot(self, other, axes=2): else: raise TypeError(f"Unexpected axes type '{type(axes).__name__}'") - other = self.check_or_convert_other(other, compute_block_shape=True) + if isinstance(a, BlockArray): + b = a.check_or_convert_other(b, compute_block_shape=True) + elif isinstance(b, BlockArray): + a = b.check_or_convert_other(a, compute_block_shape=True) + else: + raise NotImplementedError() - if array_utils.np_tensordot_param_test( - self.shape, self.ndim, other.shape, other.ndim, axes - ): + if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes): raise ValueError("shape-mismatch for sum") if axes > 0: - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple( - self.block_shape[:-axes] + other.block_shape[axes:] - ) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) else: - this_axes = self.grid.grid_shape - other_axes = other.grid.grid_shape - this_sum_axes = () - result_shape = tuple(self.shape + other.shape) - result_block_shape = tuple(self.block_shape + other.block_shape) + a_axes = a.grid.grid_shape + b_axes = b.grid.grid_shape + a_sum_axes = () + result_shape = tuple(a.shape + b.shape) + result_block_shape = tuple(a.block_shape + b.block_shape) 
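+        # The result grid is the concatenation of the non-contracted grid axes of
+        # a and b. Each output block at grid entry (i + j) is the sum over the
+        # contracted grid axes k of tensordot(a.blocks[i + k], b.blocks[k + j]),
+        # combined below via tree_reduce("sum", ...). For example, with
+        # a.grid_shape == (2, 3), b.grid_shape == (3, 4) and axes == 1, the result
+        # grid is (2, 4) and each output block reduces 3 partial products.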
result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, dtype=array_utils.get_bop_output_type( - "tensordot", self.dtype, other.dtype + "tensordot", a.dtype, b.dtype ).__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) - result = BlockArray(result_grid, self.km) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + assert result_grid.grid_shape == tuple(a_axes + b_axes) + result = BlockArray(result_grid, a.km) + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: grid_entry = tuple(i + j) result_block: Block = result.blocks[grid_entry] sum_oids = [] for k in sum_dims: - self_block: Block = self.blocks[tuple(i + k)] - other_block: Block = other.blocks[tuple(k + j)] - dot_grid_args = self._compute_tensordot_syskwargs( - self_block, other_block - ) - dotted_oid = self.km.bop( + a_block: Block = a.blocks[tuple(i + k)] + b_block: Block = b.blocks[tuple(k + j)] + dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block) + dotted_oid = a.km.bop( "tensordot", - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, axes=axes, syskwargs={ "grid_entry": dot_grid_args[0], @@ -850,177 +866,19 @@ def tensordot(self, other, axes=2): sum_oids.append( (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) ) - result_block.oid = self.tree_reduce( + result_block.oid = a.tree_reduce( "sum", sum_oids, result_block.grid_entry, result_block.grid_shape ) return result - def __matmul__(self, other): - if len(self.shape) > 2: - # TODO (bcp): NumPy's implementation does a stacked matmul, which is not supported yet. - raise NotImplementedError( - "Matrix multiply for tensors of rank > 2 not supported yet." - ) - else: - return self.tensordot(other, 1) - - def __rmatmul__(self, other): - other = self.check_or_convert_other(other) - return other @ self - - __imatmul__ = __matmul__ - - ################# - # Arithmetic - ################# - - def _fast_element_wise(self, op_name, other): - """ - Implements fast scheduling for basic element-wise operations. - """ - dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) - # Schedule the op first. 
- blocks = np.empty(shape=self.grid.grid_shape, dtype=Block) - for grid_entry in self.grid.get_entry_iterator(): - self_block: Block = self.blocks[grid_entry] - other_block: Block = other.blocks[grid_entry] - blocks[grid_entry] = block = Block( - grid_entry=grid_entry, - grid_shape=self_block.grid_shape, - shape=self_block.shape, - dtype=dtype, - transposed=False, - km=self.km, - ) - block.oid = self.km.bop( - op_name, - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, - axes={}, - syskwargs={ - "grid_entry": grid_entry, - "grid_shape": self.grid.grid_shape, - }, - ) - return BlockArray( - ArrayGrid(self.shape, self.block_shape, dtype.__name__), - self.km, - blocks=blocks, - ) - - def __elementwise__(self, op_name, other): - other = self.check_or_convert_other(other) - if self.shape == other.shape and self.block_shape == other.block_shape: - return self._fast_element_wise(op_name, other) - blocks_op = self.blocks.__getattribute__("__%s__" % op_name) - return BlockArray.from_blocks( - blocks_op(other.blocks), result_shape=None, km=self.km - ) - - def __neg__(self): - return self.ufunc("negative") - - def __pos__(self): - return self - - def __abs__(self): - return self.ufunc("abs") - - def __mod__(self, other): - return self.__elementwise__("mod", other) - - def __rmod__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("mod", self) - - __imod__ = __mod__ - - def __add__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("add", other) - - def __radd__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other.__elementwise__("add", self) - - __iadd__ = __add__ - - def __sub__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("sub", other) - - def __rsub__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other.__elementwise__("sub", self) - - __isub__ = __sub__ - - def __mul__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("mul", other) - - def __rmul__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other.__elementwise__("mul", self) - - __imul__ = __mul__ - - def __truediv__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("truediv", other) - - def __rtruediv__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other / self - - __itruediv__ = __truediv__ - - def __floordiv__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("floor_divide", other) - - def __rfloordiv__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other.__elementwise__("floor_divide", self) - - __ifloordiv__ = __floordiv__ - - def __pow__(self, other): - if isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - return self.__elementwise__("pow", other) - - def __rpow__(self, other): - if 
isinstance(other, array_sparse.SparseBlockArray): - return NotImplemented - other = self.check_or_convert_other(other) - return other**self - - __ipow__ = __pow__ - ################# # Inequalities ################# def __inequality__(self, op, other): other = self.check_or_convert_other(other) + if other is NotImplemented: + return NotImplemented assert ( other.shape == () or other.shape == self.shape ), "Currently supports comparison with scalars only." @@ -1039,127 +897,8 @@ def __inequality__(self, op, other): result.blocks[grid_entry] = self.blocks[grid_entry].bop( op, other_block, args={} ) - return result - def __ge__(self, other): - return self.__inequality__("ge", other) - - def __rge__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("ge", self) - - def __gt__(self, other): - return self.__inequality__("gt", other) - - def __rgt__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("gt", self) - - def __le__(self, other): - return self.__inequality__("le", other) - - def __rle__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("le", self) - - def __lt__(self, other): - return self.__inequality__("lt", other) - - def __rlt__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("lt", self) - - def __eq__(self, other): - return self.__inequality__("eq", other) - - def __req__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("eq", self) - - def __ne__(self, other): - return self.__inequality__("ne", other) - - def __rne__(self, other): - other = self.check_or_convert_other(other) - return other.__inequality__("ne", self) - - ################## - # Boolean - ################## - - # TODO (hme): Type check bool ops. - def __bool__(self): - # pylint: disable=no-member - if np.sum(self.shape) == len(self.shape): - # If all ones or scalar, then this is defined. 
- return self.get().__bool__() - return True - - def __invert__(self): - return self.ufunc("invert") - - def __or__(self, other): - return self.__elementwise__("bitwise_or", other) - - def __ror__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("bitwise_or", self) - - __ior__ = __or__ - - def __and__(self, other): - return self.__elementwise__("bitwise_and", other) - - def __rand__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("bitwise_and", self) - - __iand__ = __and__ - - def __xor__(self, other): - return self.__elementwise__("bitwise_xor", other) - - def __rxor__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("bitwise_xor", self) - - __ixor__ = __xor__ - - def __lshift__(self, other): - return self.__elementwise__("left_shift", other) - - def __rlshift__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("left_shift", self) - - __ilshift__ = __lshift__ - - def __rshift__(self, other): - return self.__elementwise__("right_shift", other) - - def __rrshift__(self, other): - other = self.check_or_convert_other(other) - return other.__elementwise__("right_shift", self) - - __irshift__ = __rshift__ - - # All operators: https://docs.python.org/3/library/operator.html - - def astype(self, dtype): - grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) - result = BlockArray(grid, self.km) - for grid_entry in result.grid.get_entry_iterator(): - result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) - return result - - def flattened_oids(self): - oids = [] - for grid_entry in self.grid.get_entry_iterator(): - oid = self.blocks[grid_entry].oid - oids.append(oid) - return oids - class Reshape: @staticmethod diff --git a/nums/core/array/random.py b/nums/core/array/random.py index 3452860d..fb17e1d8 100644 --- a/nums/core/array/random.py +++ b/nums/core/array/random.py @@ -220,21 +220,21 @@ def sparse_randint( self, low, high=None, - dtype=int, + dtype=None, shape=None, block_shape=None, p=0.01, fill_value=0, ): if dtype is None: - dtype = np.float64 + dtype = np.int64 assert isinstance(dtype, type) return self._sparse_sample_basic( "randint", + {"low": low, "high": high, "dtype": dtype}, shape, block_shape, dtype, - {"low": low, "high": high, "dtype": dtype}, p, fill_value, ) @@ -251,10 +251,10 @@ def sparse_uniform( ): return self._sparse_sample_basic( "uniform", + {"low": low, "high": high}, shape, block_shape, dtype, - {"low": low, "high": high}, p, fill_value, ) @@ -271,10 +271,10 @@ def sparse_normal( ): return self._sparse_sample_basic( "normal", + {"loc": loc, "scale": scale}, shape, block_shape, dtype, - {"loc": loc, "scale": scale}, p, fill_value, ) @@ -282,10 +282,10 @@ def sparse_normal( def _sparse_sample_basic( self, rfunc_name, + rfunc_args: Dict, shape, block_shape, dtype, - rfunc_args: Dict, p, fill_value, ) -> SparseBlockArray: diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py index fb71b0bd..c3e99c6c 100644 --- a/nums/core/array/sparse.py +++ b/nums/core/array/sparse.py @@ -1,17 +1,16 @@ from typing import List from nums.core.array import utils as array_utils -from nums.core.array.base import BlockArrayBase, Block +from nums.core.array.base import BlockBase, Block, BlockArrayBase from nums.core.array.blockarray import BlockArray from nums.core.kernel.kernel_manager import KernelManager from nums.core.grid.grid import ArrayGrid import numpy as np import itertools +import warnings 
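+# `sparse` below is the PyData/Sparse package; sparse blocks keep their local
+# data in its COO format.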
import sparse -# TODO: merge with Block -# TODO: RHS binary operations -class SparseBlock(Block): +class SparseBlock(BlockBase): def __init__( self, grid_entry, @@ -29,14 +28,12 @@ def __init__( self.index_dtype = index_dtype self.oid = None self._nnz: object = None - self._nbytes: object = ( - None # TODO: implement as lazily fetched exact result, same as nnz - ) + self._nbytes: object = None @property def nnz(self): if self._nnz is None: - self._nnz = self._km.sparse_nnz( + self._nnz = self.km.sparse_nnz( self.oid, syskwargs={ "grid_entry": self.grid_entry, @@ -44,30 +41,26 @@ def nnz(self): }, ) if not array_utils.is_int(self._nnz): - self._nnz = self._km.get(self._nnz) + self._nnz = self.km.get(self._nnz) return self._nnz - # TODO: implement as lazily fetched exact result @property def nbytes(self): - return self._estimate_nbytes(format="coo") - - # TODO: deprecate - def _estimate_nbytes(self, format=None): - if format is None: - return self.nnz - elif format == "coo": - return ( - self.nnz * np.dtype(self.dtype).itemsize - + self.nnz * self.ndim * np.dtype(self.index_dtype).itemsize + if self._nbytes is None: + self._nbytes = self.km.sparse_nbytes( + self.oid, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, ) + if not array_utils.is_int(self._nbytes): + self._nbytes = self.km.get(self._nbytes) + return self._nbytes def __repr__(self): return f"SparseBlock({self.oid})" - def size(self): - return np.product(self.shape) - def copy(self, shallow=True): assert shallow, "Only shallow copies are currently supported." block = SparseBlock( @@ -76,7 +69,7 @@ def copy(self, shallow=True): self.shape, self.dtype, self.transposed, - self._km, + self.km, ) block.oid = self.oid return block @@ -90,7 +83,7 @@ def transpose(self, defer=False, redistribute=False): shape=tuple(reversed(self.shape)), dtype=self.dtype, transposed=not self.transposed, - km=self._km, + km=self.km, ) blockT.oid = self.oid if not defer: @@ -102,13 +95,10 @@ def transpose(self, defer=False, redistribute=False): "grid_entry": self.grid_entry, "grid_shape": self.grid_shape, } - blockT.oid = self._km.transpose(self.oid, syskwargs=syskwargs) + blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs) return blockT - def ufunc(self, op_name, device=None): - return self.uop_map(op_name, device=device) - - def uop_map(self, op_name, args=None, kwargs=None, device=None): + def map_uop(self, op_name, args=None, kwargs=None, device=None): block = self.copy() block.dtype = array_utils.get_uop_output_type(op_name, self.dtype) args = () if args is None else args @@ -118,14 +108,15 @@ def uop_map(self, op_name, args=None, kwargs=None, device=None): else: syskwargs = {"device": device} block._device = device - block.oid = self._km.sparse_map_uop( + block.oid = self.km.sparse_map_uop( op_name, self.oid, args, kwargs, syskwargs=syskwargs ) - block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs) block.fill_value = np.__getattribute__(op_name)(self.fill_value) return block - def _block_from_scalar(self, other): + def block_from_scalar(self, other): assert array_utils.is_scalar(other) block = SparseBlock( self.grid_entry, @@ -133,12 +124,11 @@ def _block_from_scalar(self, other): (1,), self.dtype, False, - self._km, + self.km, fill_value=other, ) - # TODO: generalize for different kernels - block.oid = self._km.put( - sparse.COO.from_numpy(np.array(other), 
fill_value=other), + block.oid = self.km.sparse_block_from_scalar( + other, syskwargs={ "grid_entry": self.grid_entry, "grid_shape": self.grid_shape, @@ -153,7 +143,7 @@ def init_block(op_name, block1, block2, args, device=None): result_grid_shape, result_shape, dtype, - ) = Block.block_meta(op_name, block1, block2, args) + ) = BlockBase.block_meta(op_name, block1, block2, args) fill_value = array_utils.get_bop_fill_value( op_name, block1.fill_value, block2.fill_value ) @@ -165,21 +155,32 @@ def init_block(op_name, block1, block2, args, device=None): dtype=dtype, transposed=False, fill_value=fill_value, - km=block1._km, + km=block1.km, ) block._device = device return block - def bop(self, op_name, other, args: dict, device=None): - if not isinstance(other, Block): - other = self._block_from_scalar(other) + def _check_bop_implemented(self, other): + if isinstance(other, BlockBase) or array_utils.is_scalar(other): + return True + return False + + @staticmethod + def binary_op(op_name, a: BlockBase, b: BlockBase, args: dict, device=None): + if isinstance(a, SparseBlock) and array_utils.is_scalar(b): + b = a.block_from_scalar(b) + elif isinstance(b, SparseBlock) and array_utils.is_scalar(a): + a = b.block_from_scalar(a) + if not isinstance(a, BlockBase) or not isinstance(b, BlockBase): + raise NotImplementedError() + densify = array_utils.get_sparse_bop_return_type( - op_name, self.fill_value, other.fill_value + op_name, a.fill_value, b.fill_value ) if densify: - block = Block.init_block(op_name, self, other, args, device) + block = Block.init_block(op_name, a, b, args, device) else: - block = SparseBlock.init_block(op_name, self, other, args, device) + block = SparseBlock.init_block(op_name, a, b, args, device) if device is None: syskwargs = { "grid_entry": block.grid_entry, @@ -187,30 +188,33 @@ def bop(self, op_name, other, args: dict, device=None): } else: syskwargs = {"device": device} - block.oid = self._km.sparse_bop( + block.oid = a.km.sparse_bop( op_name, - self.oid, - other.oid, - self.transposed, - other.transposed, + a.oid, + b.oid, + a.transposed, + b.transposed, axes=args.get("axes"), densify=densify, syskwargs=syskwargs, ) if not densify: - block._nnz = self._km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nnz = a.km.sparse_nnz(block.oid, syskwargs=syskwargs) + block._nbytes = a.km.sparse_nbytes(block.oid, syskwargs=syskwargs) return block + def bop(self, op_name, other, args: dict, device=None): + return self.binary_op(op_name, self, other, args, device) + # TODO: densify when fill_value != 0 def tensordot(self, other, axes): assert self.fill_value == 0 if not other.is_dense: assert other.fill_value == 0 - return self.bop("tensordot", other, args={"axes": axes}) + return self.binary_op("tensordot", self, other, args={"axes": axes}) -# TODO: merge with BlockArray, have decorator for dense-only methods? 
-class SparseBlockArray(BlockArray): +class SparseBlockArray(BlockArrayBase): def __init__( self, grid: ArrayGrid, @@ -218,15 +222,11 @@ def __init__( fill_value=0, blocks: np.ndarray = None, ): - self.grid = grid - self.km = km - self.shape = self.grid.shape - self.block_shape = self.grid.block_shape - self.grid_shape = self.grid.grid_shape - self.size = np.product(self.shape) - self.ndim = len(self.shape) - self.dtype = self.grid.dtype - self.blocks = blocks + if blocks is not None: + assert ( + blocks.dtype == SparseBlock + ), "SparseBlockArray must be initialized with SparseBlocks" + super().__init__(grid, km, blocks) if self.blocks is None: self.blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) for grid_entry in self.grid.get_entry_iterator(): @@ -247,10 +247,6 @@ def __init__( def nnz(self): return self._get_nnz() - @property - def nbytes(self): - return self._get_nbytes() - def _get_nnz(self): if self._nnz == -1: self._nnz = 0 @@ -258,6 +254,10 @@ def _get_nnz(self): self._nnz += self.blocks[grid_entry].nnz return self._nnz + @property + def nbytes(self): + return self._get_nbytes() + def _get_nbytes(self): if self._nbytes == -1: self._nbytes = 0 @@ -265,6 +265,9 @@ def _get_nbytes(self): self._nbytes += self.blocks[grid_entry].nbytes return self._nbytes + def __repr__(self): + return f"SparseBlockArray({self.blocks})" + @classmethod def from_np(cls, arr, block_shape, copy, km, fill_value=0): dtype_str = str(arr.dtype) @@ -357,6 +360,7 @@ def from_ba(cls, ba: BlockArrayBase, fill_value=0): syskwargs=syskwargs, ) sblock._nnz = sba.km.sparse_nnz(sblock.oid, syskwargs=syskwargs) + sblock._nbytes = sba.km.sparse_nbytes(sblock.oid, syskwargs=syskwargs) sba.fill_value = fill_value return sba @@ -383,21 +387,12 @@ def copy(self): rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy() return rarr_copy - @staticmethod - def to_block_array(obj, km: KernelManager, block_shape=None): - if isinstance(obj, (BlockArray, SparseBlockArray)): - return obj - if isinstance(obj, np.ndarray): - np_array = obj - elif isinstance(obj, list): - np_array = np.array(obj) - elif array_utils.is_scalar(obj): - return SparseBlockArray.from_scalar(obj, km) - else: - raise Exception("Unsupported type %s" % type(obj)) - if block_shape is None: - block_shape = km.get_block_shape(np_array.shape, np_array.dtype) - return BlockArray.from_np(np_array, block_shape, False, km) + def astype(self, dtype): + grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) + result = BlockArray(grid, self.km) + for grid_entry in result.grid.get_entry_iterator(): + result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) + return result def transpose(self, defer=False, redistribute=False): if defer and redistribute: @@ -414,10 +409,33 @@ def transpose(self, defer=False, redistribute=False): ) return rarrT + @staticmethod + def to_block_array(obj, km: KernelManager, block_shape=None): + if isinstance(obj, (BlockArray, SparseBlockArray)): + return obj + if isinstance(obj, np.ndarray): + np_array = obj + elif isinstance(obj, list): + np_array = np.array(obj) + elif array_utils.is_scalar(obj): + return SparseBlockArray.from_scalar(obj, km) + else: + raise Exception("Unsupported type %s" % type(obj)) + if block_shape is None: + block_shape = km.get_block_shape(np_array.shape, np_array.dtype) + return BlockArray.from_np(np_array, block_shape, False, km) + def check_or_convert_other(self, other, compute_block_shape=False): block_shape = None if compute_block_shape else self.block_shape return 
SparseBlockArray.to_block_array(other, self.km, block_shape=block_shape) + def _check_bop_implemented(self, other): + if isinstance( + other, (BlockArrayBase, np.ndarray, list) + ) or array_utils.is_scalar(other): + return True + return False + def ufunc(self, op_name): result = self.copy() for grid_entry in self.grid.get_entry_iterator(): @@ -425,6 +443,197 @@ def ufunc(self, op_name): func = np.__getattribute__(op_name) result.fill_value = func(self.fill_value) result._nnz = -1 + result._nbytes = -1 + return result + + ################# + # Arithmetic + ################# + + @staticmethod + def elementwise(op_name, a, b): + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a) + else: + raise NotImplementedError() + + densify = array_utils.get_sparse_bop_return_type( + op_name, + a.fill_value, + b.fill_value, + ) + if a.shape == b.shape and a.block_shape == b.block_shape: + return SparseBlockArray._fast_elementwise(op_name, a, b, densify) + else: + blocks_op = a.blocks.__getattribute__(f"__{op_name}__") + if densify: + result = BlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + km=a.km, + ) + else: + fill_value = array_utils.get_bop_fill_value( + op_name, a.fill_value, b.fill_value + ) + result = SparseBlockArray.from_blocks( + blocks_op(b.blocks), + result_shape=None, + fill_value=fill_value, + km=a.km, + ) + return result + + @staticmethod + def _fast_elementwise(op_name, a, b, densify): + # a, b have the same grid_shape and block_shape + dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) + if densify: + block_type = Block + fill_value = None + else: + block_type = SparseBlock + fill_value = array_utils.get_bop_fill_value( + op_name, a.fill_value, b.fill_value + ) + blocks = np.empty(shape=a.grid_shape, dtype=block_type) + for grid_entry in a.grid.get_entry_iterator(): + a_block: BlockBase = a.blocks[grid_entry] + b_block: BlockBase = b.blocks[grid_entry] + blocks[grid_entry] = block_type( + grid_entry, + a_block.grid_shape, + a_block.shape, + dtype, + transposed=False, + km=a.km, + ) + blocks[grid_entry].oid = a.km.sparse_bop( + op_name, + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, + axes={}, + densify=densify, + syskwargs={ + "grid_entry": grid_entry, + "grid_shape": a.grid.grid_shape, + }, + ) + grid = ArrayGrid(a.shape, a.block_shape, dtype.__name__) + if densify: + return BlockArray(grid, a.km, blocks=blocks) + else: + return SparseBlockArray(grid, a.km, fill_value, blocks=blocks) + + ################# + # Linear Algebra + ################# + + @staticmethod + def tensordot(a, b, axes=2): + if isinstance(axes, int): + pass + elif array_utils.is_array_like(axes): + raise NotImplementedError("Non-integer axes is currently not supported.") + else: + raise TypeError(f"Unexpected axes type '{type(axes).__name__}'") + + if isinstance(a, SparseBlockArray): + b = a.check_or_convert_other(b, compute_block_shape=True) + elif isinstance(b, SparseBlockArray): + a = b.check_or_convert_other(a, compute_block_shape=True) + else: + raise NotImplementedError() + + # PyData/Sparse only works with fill_value == 0 + # TODO: densify when fill_value != 0 + if not (a.is_dense or b.is_dense): + assert ( + a.fill_value == 0 and b.fill_value == 0 + ), "Sparse-sparse tensordot with non-zero fill value is not supported." 
+ + if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes): + raise ValueError("shape-mismatch for sum") + + densify = array_utils.get_sparse_bop_return_type( + "tensordot", + a.fill_value, + b.fill_value, + ) + + if axes > 0: + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) + else: + a_axes = a.grid.grid_shape + b_axes = b.grid.grid_shape + a_sum_axes = () + result_shape = tuple(a.shape + b.shape) + result_block_shape = tuple(a.block_shape + b.block_shape) + + result_grid = ArrayGrid( + shape=result_shape, + block_shape=result_block_shape, + dtype=array_utils.get_bop_output_type( + "tensordot", a.dtype, b.dtype + ).__name__, + ) + assert result_grid.grid_shape == tuple(a_axes + b_axes) + if densify: + result = BlockArray(result_grid, a.km) + else: + result = SparseBlockArray(result_grid, a.km, a.fill_value) + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: + grid_entry = tuple(i + j) + result_block: Block = result.blocks[grid_entry] + sum_oids = [] + for k in sum_dims: + a_block: Block = a.blocks[tuple(i + k)] + b_block: Block = b.blocks[tuple(k + j)] + dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block) + dotted_oid = a.km.sparse_bop( + "tensordot", + a_block.oid, + b_block.oid, + a_block.transposed, + b_block.transposed, + axes=axes, + densify=densify, + syskwargs={ + "grid_entry": dot_grid_args[0], + "grid_shape": dot_grid_args[1], + }, + ) + sum_oids.append( + (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) + ) + result_block.oid = a.tree_reduce( + "sum", sum_oids, result_block.grid_entry, result_block.grid_shape + ) + if not densify: + syskwargs = { + "grid_entry": result_block.grid_entry, + "grid_shape": result_block.grid_shape, + } + result_block._nnz = a.km.sparse_nnz( + result_block.oid, syskwargs=syskwargs + ) + result_block._nbytes = a.km.sparse_nbytes( + result_block.oid, syskwargs=syskwargs + ) return result def sdtp(self, *block_arrays: List[BlockArray]): @@ -485,11 +694,11 @@ def sdtd(self, x: BlockArray, y: BlockArray, axes: int): assert result_grid.grid_shape == self.grid_shape assert result_grid.block_shape == self.block_shape result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value) - this_dims = list(itertools.product(*map(range, x_axes))) - other_dims = list(itertools.product(*map(range, y_axes))) + x_dims = list(itertools.product(*map(range, x_axes))) + y_dims = list(itertools.product(*map(range, y_axes))) sum_dims = tuple([0] * axes) - for i in this_dims: - for j in other_dims: + for i in x_dims: + for j in y_dims: grid_entry = tuple(i + j) x_block: Block = x.blocks[tuple(i + sum_dims)] y_block: Block = y.blocks[tuple(sum_dims + j)] @@ -505,217 +714,15 @@ def sdtd(self, x: BlockArray, y: BlockArray, axes: int): ) return result - def _fast_elementwise(self, op_name, other, densify): - dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) - if densify: - blocks = np.empty(shape=self.grid_shape, dtype=Block) - else: - blocks = np.empty(shape=self.grid_shape, dtype=SparseBlock) - for grid_entry in self.grid.get_entry_iterator(): - self_block: SparseBlock = 
self.blocks[grid_entry] - other_block: Block = other.blocks[grid_entry] - blocks[grid_entry] = block = self_block.bop(op_name, other_block, args={}) - block.oid = self.km.sparse_bop( - op_name, - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, - axes={}, - densify=densify, - syskwargs={ - "grid_entry": grid_entry, - "grid_shape": self.grid.grid_shape, - }, - ) - grid = ArrayGrid(self.shape, self.block_shape, dtype.__name__) - if densify: - return BlockArray(grid, self.km, blocks=blocks) - else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray(grid, self.km, fill_value, blocks=blocks) - return result - - def __elementwise__(self, op_name, other): - other = self.check_or_convert_other(other) - densify = array_utils.get_sparse_bop_return_type( - op_name, - self.fill_value, - other.fill_value, - ) - if self.shape == other.shape and self.block_shape == other.block_shape: - return self._fast_elementwise(op_name, other, densify) - blocks_op = self.blocks.__getattribute__("__%s__" % op_name) - if densify: - result = BlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - km=self.km, - ) - else: - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray.from_blocks( - blocks_op(other.blocks), - result_shape=None, - fill_value=fill_value, - km=self.km, - ) - return result - - def tensordot(self, other, axes=2): - if isinstance(axes, int): - pass - elif array_utils.is_array_like(axes): - raise NotImplementedError("Non-integer axes is currently not supported.") - else: - raise TypeError(f"Unexpected axes type '{type(axes).__name__}'") - - other = self.check_or_convert_other(other, compute_block_shape=True) - - if array_utils.np_tensordot_param_test( - self.shape, self.ndim, other.shape, other.ndim, axes - ): - raise ValueError("shape-mismatch for sum") - - # Pydata/Sparse only works with fill_value == 0 - # TODO: densify when fill_value != 0 - assert self.fill_value == 0 - if isinstance(other, SparseBlockArray): - assert other.fill_value == 0, "Sparse-dense tensordot may not be tractable." 
- densify = array_utils.get_sparse_bop_return_type( - "tensordot", - self.fill_value, - other.fill_value, - ) - - if axes > 0: - this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple( - self.block_shape[:-axes] + other.block_shape[axes:] - ) - else: - this_axes = self.grid.grid_shape - other_axes = other.grid.grid_shape - this_sum_axes = () - result_shape = tuple(self.shape + other.shape) - result_block_shape = tuple(self.block_shape + other.block_shape) - - result_grid = ArrayGrid( - shape=result_shape, - block_shape=result_block_shape, - dtype=array_utils.get_bop_output_type( - "tensordot", self.dtype, other.dtype - ).__name__, - ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) - if densify: - result = BlockArray(result_grid, self.km) - else: - result = SparseBlockArray(result_grid, self.km, self.fill_value) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: - grid_entry = tuple(i + j) - result_block: Block = result.blocks[grid_entry] - sum_oids = [] - for k in sum_dims: - self_block: Block = self.blocks[tuple(i + k)] - other_block: Block = other.blocks[tuple(k + j)] - dot_grid_args = self._compute_tensordot_syskwargs( - self_block, other_block - ) - dotted_oid = self.km.sparse_bop( - "tensordot", - self_block.oid, - other_block.oid, - self_block.transposed, - other_block.transposed, - axes=axes, - densify=densify, - syskwargs={ - "grid_entry": dot_grid_args[0], - "grid_shape": dot_grid_args[1], - }, - ) - sum_oids.append( - (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) - ) - result_block.oid = self.tree_reduce( - "sum", sum_oids, result_block.grid_entry, result_block.grid_shape - ) - if not densify: - result_block._nnz = self.km.sparse_nnz( - result_block.oid, - syskwargs={ - "grid_entry": result_block.grid_entry, - "grid_shape": result_block.grid_shape, - }, - ) - return result - - def __add__(self, other): - return self.__elementwise__("add", other) - - def __radd__(self, other): - return self.__elementwise__("add", other) - - __iadd__ = __add__ - - def __sub__(self, other): - return self.__elementwise__("sub", other) - - def __rsub__(self, other): - # FIXME: not commutative - return self.__elementwise__("sub", other) - - __isub__ = __sub__ - - def __mul__(self, other): - return self.__elementwise__("mul", other) - - def __rmul__(self, other): - return self.__elementwise__("mul", other) - - __imul__ = __mul__ - - def __truediv__(self, other): - return self.__elementwise__("truediv", other) - - def __rtruediv__(self, other): - return self.__elementwise__("truediv", other) - - __itruediv__ = __truediv__ - - def __floordiv__(self, other): - return self.__elementwise__("floordiv", other) - - def __rfloordiv__(self, other): - return self.__elementwise__("floordiv", other) - - __ifloordiv__ = __floordiv__ - - def __pow__(self, other): - return self.__elementwise__("pow", other) - - def __rpow__(self, other): - return self.__elementwise__("pow", other) - - __ipow__ = __pow__ + ################# + # Inequalities + ################# def __inequality__(self, op_name, other): other = self.check_or_convert_other(other) - assert 
other.shape == (), "Currently supports comparison with scalars only." + assert ( + other.shape == () or other.shape == self.shape + ), "Currently supports comparison with scalars only." shape = array_utils.broadcast(self.shape, other.shape).shape block_shape = array_utils.broadcast_block_shape( self.shape, other.shape, self.block_shape diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py index 8cefd766..872f5d9d 100644 --- a/nums/core/array/utils.py +++ b/nums/core/array/utils.py @@ -22,8 +22,6 @@ from nums.core.settings import np_ufunc_map from nums.core.array.errors import AxisError -from nums.core.array.base import Block -from nums.core.array.sparse import SparseBlock # pylint: disable = no-member, trailing-whitespace @@ -455,29 +453,6 @@ def normalize_axis_index(axis, ndim): return axis % ndim -# def get_sparse_bop_return_type(op_name, a: Block, b: Block): -# def sample_array(block): -# s = np.eye(2) -# if isinstance(block, SparseBlock): -# return sparse.COO.from_numpy(s, fill_value=block.fill_value) -# return s - -# sa = sample_array(a) -# sb = sample_array(b) -# if op_name == "tensordot": -# result = sparse.tensordot(sa, sb) -# else: -# op_name = np_ufunc_map.get(op_name, op_name) -# try: -# ufunc = np.__getattribute__(op_name) -# except Exception as _: -# ufunc = scipy.special.__getattribute__(op_name) -# result = sparse.elemwise(ufunc, sa, sb) -# if isinstance(result, sparse.SparseArray): -# return False -# return True - - def get_sparse_bop_return_type(op_name, a_fv, b_fv): def sample_array(fv): s = np.eye(2) diff --git a/nums/core/array/view.py b/nums/core/array/view.py index a9186c9c..f2e20ccb 100644 --- a/nums/core/array/view.py +++ b/nums/core/array/view.py @@ -13,7 +13,7 @@ # limitations under the License. -from typing import Tuple +from typing import Tuple, Type import numpy as np @@ -39,6 +39,7 @@ def from_subscript(cls, bab, subscript): def __init__(self, source, sel: BasicSelection = None, block_shape: tuple = None): self._source: BlockArrayBase = source self._km: KernelManager = self._source.km + self._concrete_cls = type(source) if sel is None: sel = BasicSelection.from_shape(self._source.shape) @@ -84,7 +85,7 @@ def basic_select(self, subscript: tuple): result: ArrayView = ArrayView(self._source, sel) return result - def create(self, concrete_cls=None) -> BlockArrayBase: + def create(self) -> BlockArrayBase: if self.sel.basic_steps(): if self.sel.is_aligned(self._source.block_shape): # Assertion below should form a conjunction with the above condition. @@ -95,20 +96,21 @@ def create(self, concrete_cls=None) -> BlockArrayBase: self.sel.get_broadcastable_block_shape(self.block_shape), self._source.block_shape, ) - return self.create_references(concrete_cls) + return self.create_references(self._concrete_cls) else: - return self.create_basic_single_step(concrete_cls) + return self.create_basic_single_step(self._concrete_cls) else: - return self.create_basic_multi_step(concrete_cls) + return self.create_basic_multi_step(self._concrete_cls) def create_references(self, concrete_cls) -> BlockArrayBase: # TODO (hme): Double check this. 
- array_cls = BlockArrayBase if concrete_cls is None else concrete_cls - dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + # array_cls = BlockArrayBase if concrete_cls is None else concrete_cls + # dst_ba: BlockArrayBase = array_cls(self.grid, self._km) + dst_ba: BlockArrayBase = concrete_cls(self.grid, self._km) if 0 in self.shape: return dst_ba grid_offset = self.sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) dst_inflated_shape = self.sel.get_broadcastable_shape() dst_inflated_block_shape = self.sel.get_broadcastable_block_shape( @@ -123,7 +125,9 @@ def create_references(self, concrete_cls) -> BlockArrayBase: ): dst_grid_entry = dst_grid_entry_iterator[dst_index] src_grid_entry = tuple( - (np.array(dst_inflated_grid_entry, dtype=np.int) + grid_offset).tolist() + ( + np.array(dst_inflated_grid_entry, dtype=np.intp) + grid_offset + ).tolist() ) dst_ba.blocks[dst_grid_entry].oid = self._source.blocks[src_grid_entry].oid dst_ba.blocks[dst_grid_entry].transposed = self._source.blocks[ @@ -261,7 +265,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # but the destination selection may not have the same shape as value. # May need to broadcast value to destination selection output shape. dst_offset = dst_sel.position().value // np.array( - self._source.block_shape, dtype=np.int + self._source.block_shape, dtype=np.intp ) # Do we need to broadcast? if isinstance(value, ArrayView) and ( @@ -273,7 +277,7 @@ def assign_references(self, dst_sel: BasicSelection, value): # We don't need to create value to perform the reference copy. # No broadcasting required, so this should be okay. src_offset = value.sel.position().value // np.array( - value._source.block_shape, dtype=np.int + value._source.block_shape, dtype=np.intp ) src_inflated_shape = dst_sel.get_broadcastable_shape() src_inflated_block_shape = dst_sel.get_broadcastable_block_shape( @@ -286,12 +290,12 @@ def assign_references(self, dst_sel: BasicSelection, value): # Num axes in value grid may be too small. 
dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) src_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + src_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + src_offset ).tolist() ) # This is a reference assignment, and the grid properties between the @@ -322,7 +326,7 @@ def assign_references(self, dst_sel: BasicSelection, value): src_grid_entry = src_grid_entry_iterator[src_index] dst_grid_entry = tuple( ( - np.array(src_grid_entry_inflated, dtype=np.int) + dst_offset + np.array(src_grid_entry_inflated, dtype=np.intp) + dst_offset ).tolist() ) # This is a reference assignment, and the grid properties between the diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index 04bd194d..79c2be92 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -198,6 +198,9 @@ def sparse_map_uop(self, op_name, arr, args, kwargs, syskwargs: Dict): def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify, syskwargs: Dict): raise NotImplementedError() + def sparse_block_from_scalar(self, x, syskwargs: Dict): + raise NotImplementedError() + def sdtp(self, s, *dense_arrays, syskwargs: Dict): raise NotImplementedError() diff --git a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py index ead191c7..ef671a5e 100644 --- a/nums/core/kernel/numpy_kernel.py +++ b/nums/core/kernel/numpy_kernel.py @@ -435,6 +435,7 @@ def bop(self, op, a1, a2, a1_T, a2_T, axes): ufunc = scipy.special.__getattribute__(op) return ufunc(a1, a2) + # Works for sparse too. def bop_reduce(self, op, a1, a2, a1_T, a2_T): if a1_T: a1 = a1.T @@ -561,10 +562,7 @@ def sparse_random_block( format="coo", fill_value=fill_value, ) - if rfunc_name != "randint": - # Only random and integer supports sampling of a specific type. 
- result = result.astype(dtype) - return result + return result.astype(dtype) def sparse_map_uop(self, op_name, arr, args, kwargs): """ @@ -605,6 +603,10 @@ def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify): assert isinstance(result, sparse.SparseArray) return result + def sparse_block_from_scalar(self, x): + assert np.isscalar(x) + return sparse.COO.from_numpy(np.array(x), fill_value=x) + def sdtp(self, s: sparse.COO, *dense_arrays): data = np.copy(s.data) for position in range(s.nnz): diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 3ac3f3df..5c505340 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -443,9 +443,9 @@ def _collapse(self, device: Device): shape=self.shape(), dtype=self.dtype(), transposed=False, - km=child_block._km, + km=child_block.km, ) - block.oid = child_block._km.reduce_axis( + block.oid = child_block.km.reduce_axis( op_name=op_name, arr=child_block.oid, axis=self.axis, @@ -1006,12 +1006,12 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km + km = child.block.km block: Block = Block( self._grid_entry, self._grid_shape, self._shape, self._dtype, False, km ) block._device = device - block.oid = block._km.call( + block.oid = block.km.call( self.op_hash, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) @@ -1203,12 +1203,12 @@ def _collapse(self, device: Device): assert isinstance(child, Leaf) block_oids.append(child.block.oid) if km is None: - km = child.block._km + km = child.block.km block: Block = Block( self.grid_entry(), self.grid_shape(), self.shape(), self.dtype(), False, km ) block._device = device - block.oid = block._km.einsum( + block.oid = block.km.einsum( self.subscript, *block_oids, syskwargs={"device": device} ) leaf: Leaf = Leaf(self.cluster_state) diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index 5f604118..f03a4163 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -176,28 +176,29 @@ def to_blocks(self) -> np.ndarray: def other_to_ga(self, other): return GraphArray.to_ga(other, self.cluster_state, self.km, self.copy_on_op) - def tensordot(self, other, axes=2): - other = self.other_to_ga(other) + @staticmethod + def tensordot(a, b, axes=2): + b = a.other_to_ga(b) # TODO: Reuse BlockArrayBase tensordot operator. 
- this_axes = self.grid.grid_shape[:-axes] - this_sum_axes = self.grid.grid_shape[-axes:] - other_axes = other.grid.grid_shape[axes:] - other_sum_axes = other.grid.grid_shape[:axes] - assert this_sum_axes == other_sum_axes - result_shape = tuple(self.shape[:-axes] + other.shape[axes:]) - result_block_shape = tuple(self.block_shape[:-axes] + other.block_shape[axes:]) + a_axes = a.grid.grid_shape[:-axes] + a_sum_axes = a.grid.grid_shape[-axes:] + b_axes = b.grid.grid_shape[axes:] + b_sum_axes = b.grid.grid_shape[:axes] + assert a_sum_axes == b_sum_axes + result_shape = tuple(a.shape[:-axes] + b.shape[axes:]) + result_block_shape = tuple(a.block_shape[:-axes] + b.block_shape[axes:]) result_grid = ArrayGrid( shape=result_shape, block_shape=result_block_shape, - dtype=self.dtype.__name__, + dtype=a.dtype.__name__, ) - assert result_grid.grid_shape == tuple(this_axes + other_axes) + assert result_grid.grid_shape == tuple(a_axes + b_axes) result_graphs = np.empty(shape=result_grid.grid_shape, dtype=np.object) - this_dims = list(itertools.product(*map(range, this_axes))) - other_dims = list(itertools.product(*map(range, other_axes))) - sum_dims = list(itertools.product(*map(range, this_sum_axes))) - for i in this_dims: - for j in other_dims: + a_dims = list(itertools.product(*map(range, a_axes))) + b_dims = list(itertools.product(*map(range, b_axes))) + sum_dims = list(itertools.product(*map(range, a_sum_axes))) + for i in a_dims: + for j in b_dims: # A \in \R^{I \times K} # B \in \R^{K \times J} # C \in \R^{I \times J} @@ -205,20 +206,20 @@ def tensordot(self, other, axes=2): grid_entry = tuple(i + j) if len(sum_dims) == 1: k = sum_dims[0] - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) result_graphs[grid_entry] = dot_node else: - rop = TreeReductionOp(self.cluster_state) + rop = TreeReductionOp(a.cluster_state) rop.set_grid_entry(grid_entry) rop.set_grid_shape(result_grid.grid_shape) rop.op_name = "sum" - rop.copy_on_op = self.copy_on_op + rop.copy_on_op = a.copy_on_op for k in sum_dims: - self_node: TreeNode = self.graphs[tuple(i + k)] - other_node: TreeNode = other.graphs[tuple(k + j)] - dot_node: TreeNode = self_node.tensordot(other_node, axes=axes) + a_node: TreeNode = a.graphs[tuple(i + k)] + b_node: TreeNode = b.graphs[tuple(k + j)] + dot_node: TreeNode = a_node.tensordot(b_node, axes=axes) # Explicitly add parent here, since sum depends on prod. # Not needed for other ops; make_bop takes care of it. 
# We don't need to copy the node here since the local @@ -229,14 +230,14 @@ def tensordot(self, other, axes=2): return GraphArray( result_grid, - self.cluster_state, + a.cluster_state, result_graphs, - self.km, - copy_on_op=self.copy_on_op, + a.km, + copy_on_op=a.copy_on_op, ) def __matmul__(self, other): - return self.tensordot(other, axes=1) + return self.tensordot(self, other, axes=1) def ga_from_arr(self, arr: Union[TreeNode, np.ndarray], result_shape: tuple): if isinstance(arr, TreeNode): diff --git a/nums/experimental/optimizer/reduction_ops.py b/nums/experimental/optimizer/reduction_ops.py index d5879d15..1a363513 100644 --- a/nums/experimental/optimizer/reduction_ops.py +++ b/nums/experimental/optimizer/reduction_ops.py @@ -266,7 +266,7 @@ def _collapse(self, device: Device, left: Leaf, right: Leaf): block: Block = lblock.copy() block.transposed = False block.dtype = array_utils.get_reduce_output_type(self.op_name, lblock.dtype) - block.oid = lblock._km.bop_reduce( + block.oid = lblock.km.bop_reduce( op_name, lblock.oid, rblock.oid, diff --git a/tests/core/array/test_bop.py b/tests/core/array/test_bop.py index d14da1b0..166c4a8e 100644 --- a/tests/core/array/test_bop.py +++ b/tests/core/array/test_bop.py @@ -69,7 +69,7 @@ def test_tensordot_basic(app_inst: ArrayApplication): shape = 2, 4, 10, 15 npX = np.arange(np.product(shape)).reshape(*shape) rX = app_inst.array(npX, block_shape=(1, 2, 10, 3)) - rResult = rX.T.tensordot(rX, axes=1) + rResult = rX.tensordot(rX.T, rX, axes=1) assert np.allclose(rResult.get(), (np.tensordot(npX.T, npX, axes=1))) common.check_block_integrity(rResult) @@ -93,7 +93,7 @@ def test_tensordot_large_shape(app_inst: ArrayApplication): block_a = app_inst.array(a, block_shape=(30, 5, 3, 2)) block_b = app_inst.array(b, block_shape=(2, 3, 5, 25)) - block_c = block_a.tensordot(block_b, axes=1) + block_c = block_a.tensordot(block_a, block_b, axes=1) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) @@ -129,7 +129,7 @@ def test_tensordot_all_shapes(app_inst: ArrayApplication): ) block_a = app_inst.array(a, block_shape=a_block_shape) block_b = app_inst.array(b, block_shape=b_block_shape) - block_c = block_a.tensordot(block_b, axes=axes) + block_c = block_a.tensordot(block_a, block_b, axes=axes) assert np.allclose(block_c.get(), c) common.check_block_integrity(block_c) diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 4ef9bc06..85f0254b 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -34,14 +34,18 @@ def test_from_coo(app_inst: ArrayApplication): def test_sparse_random(app_inst: ArrayApplication): rs: NumsRandomState = app_inst.random_state(1337) x_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(15, 10), block_shape=(5, 5), p=0.1, fill_value=0 + 1, + high=5, + dtype=np.int64, + shape=(100, 50), + block_shape=(50, 50), + p=0.1, + fill_value=0, ) x_ba = x_sba.to_ba() x_np = x_ba.get() x_sp = sparse.COO.from_numpy(x_np, fill_value=0) assert x_sba.nnz == x_sp.nnz - assert x_sba.nbytes == x_sp.nbytes - print(x_np) def test_sparse_uop(app_inst: ArrayApplication): @@ -73,17 +77,27 @@ def test_sparse_add(app_inst: ArrayApplication): y_ba = y_sba.to_ba() assert np.array_equal(x1 + x2, y_ba.get()) - # Test sparse-dense. + # Test dense-sparse. 
y_ba = x2_ba + x1_sba # __radd__ assert np.array_equal(x2 + x1_sp, y_ba.get()) + y_ba = x1_sba - x2_ba # __sub__ + assert np.array_equal(x1_sp - x2, y_ba.get()) + y_ba = x2_ba - x1_sba # __rsub__ + assert np.array_equal(x2 - x1_sp, y_ba.get()) # Test sparse-scalar. - y_sp = x1_sp - 1 + y_sp = x1_sp - 1 # __sub__ y_sba = x1_sba - 1 assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 - 1, y_ba.get()) + y_sp = 1 - x1_sp # __rsub__ + y_sba = 1 - x1_sba + assert y_sba.fill_value == y_sp.fill_value # 4 + assert y_sba.nnz == y_sp.nnz # 16 + y_ba = y_sba.to_ba() + assert np.array_equal(1 - x1, y_ba.get()) def test_sparse_mul(app_inst: ArrayApplication): @@ -138,7 +152,7 @@ def test_neq(app_inst: ArrayApplication): def test_tensordot(app_inst: ArrayApplication): - x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) + x1 = np.array([[0, 0, 1, 2], [0, 0, 3, 4], [5, 6, 0, 0], [7, 8, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x2_sp = sparse.COO.from_numpy(x2, fill_value=0) @@ -147,12 +161,15 @@ def test_tensordot(app_inst: ArrayApplication): x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=0) x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=0) y_sp = sparse.tensordot(x1_sp, x2_sp, axes=1) - y_sba = x1_sba.tensordot(x2_sba, axes=1) + y_sba = x1_sba @ x2_sba assert y_sba.fill_value == y_sp.fill_value # 0 assert y_sba.nnz == y_sp.nnz # y_ba = y_sba.to_ba() assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + y_ba = x1_ba @ x2_sba # __rmatmul__ + assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) + def test_sdtp(app_inst: ArrayApplication): shape = 50, 50, 50 diff --git a/tests/experimental/optimizer/test_copy.py b/tests/experimental/optimizer/test_copy.py index 5de7b42a..ea31aac6 100644 --- a/tests/experimental/optimizer/test_copy.py +++ b/tests/experimental/optimizer/test_copy.py @@ -88,7 +88,7 @@ def tensordot( cluster_state = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - return lhs_ga.tensordot(rhs_ga, axes=axes) + return GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) def optimized_tensordot( diff --git a/tests/experimental/optimizer/test_tensordot.py b/tests/experimental/optimizer/test_tensordot.py index 7657b898..c06ae1ae 100644 --- a/tests/experimental/optimizer/test_tensordot.py +++ b/tests/experimental/optimizer/test_tensordot.py @@ -41,7 +41,7 @@ def optimized_tensordot( cluster_state: ClusterState = ClusterState(lhs.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state, copy_on_op=copy_on_op) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state, copy_on_op=copy_on_op) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) global random_state print("*" * 50) print("op grid shape", tensordot_ga.grid.grid_shape) @@ -119,7 +119,7 @@ def test_load_sqr(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) mem_diff = max(cluster_state.resources[0]) - min(cluster_state.resources[0]) net_in_diff = 
max(cluster_state.resources[1]) - min(cluster_state.resources[1]) @@ -163,7 +163,7 @@ def test_load_single_block_rhs(app_inst_mock_big): cluster_state: ClusterState = ClusterState(app.km.devices()) lhs_ga: GraphArray = GraphArray.from_ba(lhs, cluster_state) rhs_ga: GraphArray = GraphArray.from_ba(rhs, cluster_state) - tensordot_ga = lhs_ga.tensordot(rhs_ga, axes=axes) + tensordot_ga = GraphArray.tensordot(lhs_ga, rhs_ga, axes=axes) print("memory", cluster_state.resources[0]) print("net_in", cluster_state.resources[1]) From 359e3958c3a25346b2373f95f420c4d81d7b4e40 Mon Sep 17 00:00:00 2001 From: Daniel Zou Date: Wed, 10 Aug 2022 23:12:17 -0700 Subject: [PATCH 2/5] Relocated selections and reshape, but not working for SBA --- nums/core/array/base.py | 616 ++++++++++++++++++- nums/core/array/blockarray.py | 583 +----------------- nums/core/array/sparse.py | 23 + nums/core/array/view.py | 18 +- nums/core/kernel/kernel_interface.py | 3 + nums/core/kernel/numpy_kernel.py | 10 + nums/experimental/optimizer/fusion.py | 1 - nums/experimental/optimizer/graph.py | 38 +- nums/experimental/optimizer/grapharray.py | 6 +- nums/experimental/optimizer/reduction_ops.py | 14 +- tests/core/array/test_sparse.py | 11 + 11 files changed, 701 insertions(+), 622 deletions(-) diff --git a/nums/core/array/base.py b/nums/core/array/base.py index 04b3a534..648a4fc5 100644 --- a/nums/core/array/base.py +++ b/nums/core/array/base.py @@ -14,7 +14,9 @@ # pylint: disable = protected-access +# pylint: disable=too-many-lines +import warnings import numpy as np @@ -466,6 +468,19 @@ def is_dense(self): def __repr__(self): return "BlockArray(" + str(self.blocks) + ")" + def __getattr__(self, item): + if item == "__array_priority__" or item == "__array_struct__": + # This is triggered by a numpy array on the LHS. + raise TypeError( + "Unexpected conversion attempt from BlockArrayBase to ndarray." + ) + elif item == "ndim": + return len(self.shape) + elif item == "T": + return self.transpose() + else: + raise NotImplementedError(item) + def get(self) -> np.ndarray: result: np.ndarray = np.zeros(shape=self.grid.shape, dtype=self.grid.dtype) block_shape: np.ndarray = np.array(self.grid.block_shape, dtype=np.int64) @@ -521,6 +536,17 @@ def flattened_oids(self): oids.append(oid) return oids + @classmethod + def empty(cls, shape, block_shape, dtype, km: KernelManager): + raise NotImplementedError() + + @staticmethod + def to_block_array(obj, km: KernelManager, block_shape=None): + raise NotImplementedError() + + def transpose(self, defer=False, redistribute=False): + raise NotImplementedError() + def broadcast_to(self, shape): b = array_utils.broadcast(self.shape, shape) result_block_shape = array_utils.broadcast_block_shape( @@ -544,6 +570,391 @@ def broadcast_to(self, shape): result.blocks = broadcast return result + def reshape(self, *shape, **kwargs): + block_shape = kwargs.get("block_shape", None) + if array_utils.is_int(shape): + shape = (shape,) + elif len(shape) == 0: + shape = self.shape + elif isinstance(shape[0], (tuple, list)): + assert len(shape) == 1 + shape = shape[0] + else: + assert all(np.issubdtype(type(n), int) for n in shape) + shape = Reshape.compute_shape(self.shape, shape) + if block_shape is None: + if shape == self.shape: + # This is a noop. 
+ block_shape = self.block_shape + else: + block_shape = self.km.get_block_shape(shape, self.dtype) + return Reshape()(self, shape, block_shape) + + def _preprocess_subscript(self, item): + if not isinstance(item, tuple): + ss = (item,) + else: + ss = item + # We need to fetch any block arrays. + tmp = [] + for entry in ss: + if isinstance(entry, BlockArrayBase): + val = entry.get() + else: + val = entry + if isinstance(val, list): + val = np.array(val) + if isinstance(val, np.ndarray): + # If this is a Boolean mask, convert it to integers. + if array_utils.is_bool(val.dtype, type_test=True): + val = np.arange(len(val))[val] + if val.shape == (): + val = val.item() + tmp.append(val) + ss = tuple(tmp) + is_handled_advanced = False + array_encountered = False + axis = None + + # Check if this is a supported advanced indexing operation. + for i, entry in enumerate(ss): + if isinstance(entry, slice) and entry.start is None and entry.stop is None: + continue + elif array_utils.is_int(entry) or array_utils.is_uint(entry): + continue + elif array_utils.is_array_like(entry): + if array_encountered: + raise NotImplementedError( + "Advanced indexing is only supported along a single axis." + ) + is_handled_advanced = True + array_encountered = True + axis = i + if not (np.all(0 <= entry) and np.all(entry < self.shape[axis])): + raise IndexError( + "Advanced indexing array along axis %s is out of bounds." % axis + ) + else: + if array_encountered: + raise NotImplementedError( + "Advanced indexing is only supported " + "with full slices and integers along other axes." + ) + is_handled_advanced = False + break + + return ss, is_handled_advanced, axis + + def __getitem__(self, item): + ss, is_handled_advanced, axis = self._preprocess_subscript(item) + + if is_handled_advanced: + # Treat this as a shuffle. + return self._advanced_single_array_select(ss, axis=axis) + + # This is to deal with circular imports. Little overhead since this happens once per call. + # However, would be better to rearrange modules in the future. + from nums.core.array.view import ArrayView + + av: ArrayView = ArrayView.from_block_array(self) + # TODO (hme): We don't have to create, but do so for now until we need to optimize. + return av[ss].create() + + def _advanced_single_array_select(self, ss: tuple, axis: int = 0): + # Create output array along the axis of the selection operation. + # We don't allocate zeros for output array. Instead, we let the update kernel + # create the initial set of zeros to save some memory. + array = ss[axis] + assert len(array.shape) == 1 + + # TODO: We may encounter block shape incompatability due to this. + block_size = self.block_shape[axis] + self.km.update_block_shape_map(array.shape[0], block_size) + + dst_axis = None + shape = [] + block_shape = [] + for i in range(len(self.shape)): + if i == axis: + dst_axis = len(shape) + shape.append(array.shape[0]) + block_shape.append(block_size) + elif i < len(ss): + if isinstance(ss[i], slice): + shape.append(self.shape[i]) + block_shape.append(self.block_shape[i]) + else: + # It's an index. We drop the indices. 
+ continue + else: + shape.append(self.shape[i]) + block_shape.append(self.block_shape[i]) + + dst_arr = type(self)( + ArrayGrid( + shape=tuple(shape), + block_shape=tuple(block_shape), + dtype=self.dtype.__name__, + ), + km=self.km, + ) + + src_arr = self + np_ss = ss + ss = self.km.put( + ss, + syskwargs={ + "grid_entry": (0,), + "grid_shape": (1,), + }, + ) + for src_grid_entry in src_arr.grid.get_entry_iterator(): + src_coord: tuple = src_arr.grid.get_entry_coordinates(src_grid_entry) + src_block: Block = src_arr.blocks[src_grid_entry] + + # Make sure index values in subscript are within bounds of src_arr. + # We also prepare dst_grid_entry here. + dst_grid_entry_list = [] + skip = False + for curr_axis in range(len(np_ss)): + if curr_axis == axis: + dst_grid_entry_list.append(None) + elif isinstance(np_ss[curr_axis], slice): + dst_grid_entry_list.append(src_grid_entry[curr_axis]) + elif not ( + src_coord[curr_axis] + <= np_ss[curr_axis] + < src_coord[curr_axis] + src_block.shape[curr_axis] + ): + skip = True + break + if skip: + continue + for curr_axis in range(len(np_ss), len(src_grid_entry)): + dst_grid_entry_list.append(src_grid_entry[curr_axis]) + + for j in range(dst_arr.grid.grid_shape[dst_axis]): + dst_grid_entry_list[dst_axis] = j + dst_grid_entry = tuple(dst_grid_entry_list) + dst_block: Block = dst_arr.blocks[dst_grid_entry] + dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) + + if dst_block.oid is None: + dst_arg = (dst_block.shape, dst_block.dtype) + else: + dst_arg = dst_block.oid + dst_block.oid = self.km.advanced_select_block_along_axis( + dst_arg, + src_block.oid, + ss, + dst_axis, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + return dst_arr + + def __setitem__(self, key, value): + value: BlockArrayBase = self.to_block_array(value, self.km) + ss, is_handled_advanced, axis = self._preprocess_subscript(key) + if is_handled_advanced: + return self._advanced_single_array_assign(ss, value, axis) + + # This is to deal with circular imports. Little overhead since this happens once per call. + # However, would be better to rearrange modules in the future. + from nums.core.array.view import ArrayView + + av: ArrayView = ArrayView.from_block_array(self) + av[key] = value + + def _advanced_single_array_assign( + self, + ss: tuple, + value, + axis: int, + ): + array = ss[axis] + assert len(array.shape) == 1 + + # The subscript contains a single array. We therefore know one of two things is true: + # 1. value is the same shape as self along axes != axis. + # 2. value is scalar or 1-dimensional. + # We currently don't support the case where value may broadcasted if it has more dims. + # This should be a straight-forward future task. + value: BlockArrayBase = value + mode = None + if len(value.shape) == 0: + # subscripted value per block will broadcast to other dimensions. + mode = "scalar" + elif len(value.shape) == 1: + # assert len(value.shape) == len(ss) + mode = "single-dim" + # Can broadcast if trailing dim matches. + assert len(ss[axis]) == value.shape[0] + + for i in range(len(self.shape)): + if i == axis: + assert len(ss[i]) == value.shape[0] + elif i < axis: + # Nothing to check here. + # These entries are : or integer. + pass + else: + if i < len(ss): + if not isinstance(ss[i], slice): + # ss[i] is an integer. + continue + # If we're here, then the rest of the subscript operator + # will resolve to :, which is not broadcastable. 
+ raise ValueError( + "Cannot broadcast input array " + "from shape %s into shape %s" + % (value.shape, tuple([value.shape[0]] + list(self.shape[i:]))) + ) + elif len(value.shape) == len(self.shape): + mode = "multi-dim" + new_block_shape = [] + for i in range(len(self.shape)): + if i == axis: + new_block_shape.append(value.block_shape[i]) + elif i < len(ss) and ( + array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) + ): + # These entries are : or integer. + # assert array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) + assert value.shape[i] == 1 + new_block_shape.append(1) + else: + assert value.shape[i] == self.shape[i], "Shape mismatch." + new_block_shape.append(self.block_shape[i]) + new_block_shape = tuple(new_block_shape) + if new_block_shape != value.block_shape: + # TODO: This message occurs on X[idx[:n]] = X[idx[n:]] + 0.5, + # even when n is a multiple of block_shape[0]. + warnings.warn( + ("Assigned value block shape %s " % str(value.block_shape)) + + ( + "does not match block shape %s of assignee. " + % str(new_block_shape) + ) + + "Applying reshape to assigned value." + ) + value = value.reshape(block_shape=new_block_shape) + + # Like select, iterate over destination blocks along the axis being updated. + # e.g. if self is 2-dim and axis=0, then fix the row and iterate over the columns. + # If value has the same shape as self, then for each destination block, + # iterate over the blocks in value along axis. + # e.g. if self is 2-dim and axis=0, then for the given column, iterate over the rows + # of value. + # If value is scalar, then attempt to assign it to every destination block. + # If value is 1-dim, the just iterate over the dim and assign accordingly. + + dst_arr = self + src_arr = value + src_grid_shape = src_arr.grid.grid_shape + np_ss = ss + ss = self.km.put( + ss, + syskwargs={ + "grid_entry": (0,), + "grid_shape": (1,), + }, + ) + for dst_grid_entry in dst_arr.grid.get_entry_iterator(): + dst_block: BlockBase = dst_arr.blocks[dst_grid_entry] + dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) + + # Make sure index values in subscript are within bounds of dst_arr. + # We don't need to check src_arr: + # 1) The block shapes of dst_arr and src_arr are the same except along axis + # and indices in ss. We are not concerned with axes the indices in ss correspond to, + # because they are of size 1 in src_arr => we only need to check that indices + # fall within bounds of dst_arr. + # 2) For each dst_arr, we test the values + # to assign to dst_arr by traverse the src_arr along axis. + # Thus, size along all other axes are equal or broadcasted. 
+ skip = False + for curr_axis in range(len(np_ss)): + if curr_axis == axis or isinstance(np_ss[curr_axis], slice): + continue + if not ( + dst_coord[curr_axis] + <= np_ss[curr_axis] + < dst_coord[curr_axis] + dst_block.shape[curr_axis] + ): + skip = True + break + if skip: + continue + + if mode == "scalar": + src_block: BlockBase = src_arr.blocks.item() + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_block.grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + elif mode == "single-dim": + for src_grid_entry in src_arr.grid.get_entry_iterator(): + src_block: BlockBase = src_arr.blocks[src_grid_entry] + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + elif mode == "multi-dim": + for j in range(src_grid_shape[axis]): + # Apply sel from each block along axis of src_arr. + # e.g. for 2 dim array, we fix the column blocks + # given by dst_grid_entry, and iterate over the rows. + src_grid_entry = tuple( + list(dst_grid_entry[:axis]) + + [j] + + list(dst_grid_entry[axis + 1 :]) + ) + src_block: BlockBase = src_arr.blocks[src_grid_entry] + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + return dst_arr + def tree_reduce( self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape ): @@ -787,7 +1198,9 @@ def __rrshift__(self, other): # Linear Algebra ################# - def _compute_tensordot_syskwargs(self, self_block: Block, other_block: Block): + def _compute_tensordot_syskwargs( + self, self_block: BlockBase, other_block: BlockBase + ): # Schedule on larger block. if np.product(self_block.shape) >= np.product(other_block.shape): return self_block.true_grid_entry(), self_block.true_grid_shape() @@ -864,3 +1277,204 @@ def __ne__(self, other): def __rne__(self, other): other = self.check_or_convert_other(other) return other.__inequality__("ne", self) + + +class Reshape: + @staticmethod + def compute_shape(shape, input_shape): + size = np.product(shape) + if -1 in input_shape: + new_shape = [] + other_dim_prod = 1 + negative_one_seen = False + for dim in input_shape: + if dim == -1: + if negative_one_seen: + raise Exception("Only one -1 permitted in reshape.") + negative_one_seen = True + continue + other_dim_prod *= dim + if size % other_dim_prod != 0: + raise Exception("Invalid shape.") + for dim in input_shape: + if dim == -1: + new_shape.append(size // other_dim_prod) + else: + new_shape.append(dim) + else: + new_shape = input_shape + assert size == np.product(new_shape) + return new_shape + + def _group_index_lists_by_block( + self, dst_slice_tuples, src_grid: ArrayGrid, dst_index_list, src_index_list + ): + # TODO(hme): Keep this function here until it's needed for greater support of + # selection/assignment operations. + # Block grid entries needed to write to given dst_slice_selection. 
+ src_blocks = {} + dst_slice_np = np.array(dst_slice_tuples).T + dst_index_arr = np.array(dst_index_list) + src_index_arr = np.array(src_index_list) + # Pick the smallest type to represent indices. + # A set of these indices may be transmitted over the network, + # so we want to pick the smallest encoding possible. + index_types = [ + (2**8, np.uint8), + (2**16, np.uint16), + (2**32, np.uint32), + (2**64, np.uint64), + ] + index_type = None + for bound, curr_index_type in index_types: + if np.all(np.array(src_grid.block_shape) < bound) and np.all( + dst_slice_np[1] < bound + ): + index_type = curr_index_type + break + if index_type is None: + raise Exception("Unable to encode block indices, blocks are too large.") + for grid_entry in src_grid.get_entry_iterator(): + src_slice_np = np.array(src_grid.get_slice_tuples(grid_entry)).T + index_pairs = [] + for i in range(src_index_arr.shape[0]): + src_index = src_index_arr[i] + dst_index = dst_index_arr[i] + if np.all( + (src_slice_np[0] <= src_index) & (src_index < src_slice_np[1]) + ): + index_pair = ( + (dst_index - dst_slice_np[0]).astype(index_type), + (src_index - src_slice_np[0]).astype(index_type), + ) + index_pairs.append(index_pair) + if len(index_pairs) > 0: + src_blocks[grid_entry] = index_pairs + return src_blocks + + def _arbitrary_reshape( + self, arr: BlockArrayBase, shape, block_shape + ) -> BlockArrayBase: + # This is the worst-case scenario. + # Generate index mappings per block, and group source indices to minimize + # RPCs and generation of new objects. + km = arr.km + dst_arr = type(arr).empty( + shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km + ) + for dst_grid_entry in dst_arr.grid.get_entry_iterator(): + dst_block: BlockBase = dst_arr.blocks[dst_grid_entry] + dst_slice_selection = dst_arr.grid.get_slice(dst_grid_entry) + dst_index_list = array_utils.slice_sel_to_index_list(dst_slice_selection) + src_index_list = array_utils.translate_index_list( + dst_index_list, shape, arr.shape + ) + src_blocks = self._group_index_lists_by_block( + dst_arr.grid.get_slice_tuples(dst_grid_entry), + arr.grid, + dst_index_list, + src_index_list, + ) + for src_grid_entry in src_blocks: + src_block: BlockBase = arr.blocks[src_grid_entry] + index_pairs = src_blocks[src_grid_entry] + syskwargs = { + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + } + dst_block.oid = km.update_block_by_index( + dst_block.oid, src_block.oid, index_pairs, syskwargs=syskwargs + ) + return dst_arr + + def _block_shape_reshape(self, arr, block_shape): + rarr: BlockArrayBase = type(arr).empty( + arr.shape, block_shape, arr.dtype, arr.km + ) + for grid_entry in rarr.grid.get_entry_iterator(): + grid_entry_slice = rarr.grid.get_slice(grid_entry) + # TODO (hme): This could be less costly. + rarr[grid_entry_slice] = arr[grid_entry_slice] + return rarr + + def _strip_ones(self, shape): + return tuple(filter(lambda x: x != 1, shape)) + + def _check_positions_ones(self, shape, block_shape): + # If a position in the shape is 1, then the corresponding + # position in block_shape should also be 1. + for i in range(len(shape)): + if shape[i] == 1: + if shape[i] != block_shape[i]: + return False + return True + + def _is_simple_reshape(self, arr: BlockArrayBase, shape, block_shape): + # Is the reshape a difference of factors of 1? + # Strip out 1s and compare. + # If a position in the shape is 1, then the corresponding + # position in block_shape should also be 1. 
+ + # If source shape and dest shape are the same or source block_shape and dest block_shape + # are same, this is not a simple reshape. + if shape == arr.shape or block_shape == arr.block_shape: + return False + + # Checks if source shape and dest shape are same & source block_shape and dest + # block_shape are same after stripping ones. + if not ( + self._strip_ones(shape) == self._strip_ones(arr.shape) + and self._strip_ones(block_shape) == self._strip_ones(arr.block_shape) + ): + return False + if not self._check_positions_ones(shape, block_shape): + return False + return True + + def _simple_reshape(self, arr, shape, block_shape): + # Reshape the array of blocks only. + # This is only used when the difference in shape are factors of 1s, + # and the ordering of other factors are maintained. + + # Check assumptions. + assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape)) + + # Create new grid, and perform reshape on blocks + # to simplify access to source blocks. + grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__) + src_blocks = arr.blocks.reshape(grid.grid_shape) + if arr.is_dense: + rarr = type(arr)(grid, arr.km) + else: + rarr = type(arr)(grid, arr.km, arr.fill_value) + for grid_entry in grid.get_entry_iterator(): + src_block: BlockBase = src_blocks[grid_entry] + dst_block: BlockBase = rarr.blocks[grid_entry] + syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape} + dst_block.oid = arr.km.reshape( + src_block.oid, dst_block.shape, syskwargs=syskwargs + ) + return rarr + + def _validate(self, arr, shape, block_shape): + assert -1 not in shape + assert -1 not in block_shape + assert len(shape) == len(block_shape) + assert np.product(arr.shape) == np.product(shape) + + def __call__(self, arr: BlockArrayBase, shape, block_shape): + self._validate(arr, shape, block_shape) + if arr.shape == shape and arr.block_shape == block_shape: + return arr + elif self._is_simple_reshape(arr, shape, block_shape): + return self._simple_reshape(arr, shape, block_shape) + elif arr.shape == shape and arr.block_shape != block_shape: + return self._block_shape_reshape(arr, block_shape) + elif arr.shape != shape and arr.block_shape == block_shape: + # Just do full reshape for this case as well. + # Though there may be a better solution, we generally expect + # the block shape to change with array shape. 
+ return self._arbitrary_reshape(arr, shape, block_shape) + else: + assert arr.shape != shape and arr.block_shape != block_shape + return self._arbitrary_reshape(arr, shape, block_shape) diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py index 9fccb971..041747b2 100644 --- a/nums/core/array/blockarray.py +++ b/nums/core/array/blockarray.py @@ -19,7 +19,7 @@ import numpy as np from nums.core.array import utils as array_utils -from nums.core.array.base import BlockArrayBase, Block +from nums.core.array.base import Block, BlockArrayBase from nums.core.array.view import ArrayView from nums.core.grid.grid import ArrayGrid from nums.core.kernel.kernel_manager import KernelManager @@ -161,26 +161,6 @@ def to_single_block(self, replicate=False): ) return res - def reshape(self, *shape, **kwargs): - block_shape = kwargs.get("block_shape", None) - if array_utils.is_int(shape): - shape = (shape,) - elif len(shape) == 0: - shape = self.shape - elif isinstance(shape[0], (tuple, list)): - assert len(shape) == 1 - shape = shape[0] - else: - assert all(np.issubdtype(type(n), int) for n in shape) - shape = Reshape.compute_shape(self.shape, shape) - if block_shape is None: - if shape == self.shape: - # This is a noop. - block_shape = self.block_shape - else: - block_shape = self.km.get_block_shape(shape, self.dtype) - return Reshape()(self, shape, block_shape) - def expand_dims(self, axis): """ This function refers to the numpy implementation of expand_dims. @@ -257,373 +237,6 @@ def transpose(self, defer=False, redistribute=False): ) return rarrT - def __getattr__(self, item): - if item == "__array_priority__" or item == "__array_struct__": - # This is triggered by a numpy array on the LHS. - raise TypeError("Unexpected conversion attempt from BlockArray to ndarray.") - elif item == "ndim": - return len(self.shape) - elif item == "T": - return self.transpose() - else: - raise NotImplementedError(item) - - def _preprocess_subscript(self, item): - if not isinstance(item, tuple): - ss = (item,) - else: - ss = item - # We need to fetch any block arrays. - tmp = [] - for entry in ss: - if isinstance(entry, BlockArray): - val = entry.get() - else: - val = entry - if isinstance(val, list): - val = np.array(val) - if isinstance(val, np.ndarray): - # If this is a Boolean mask, convert it to integers. - if array_utils.is_bool(val.dtype, type_test=True): - val = np.arange(len(val))[val] - if val.shape == (): - val = val.item() - tmp.append(val) - ss = tuple(tmp) - is_handled_advanced = False - array_encountered = False - axis = None - - # Check if this is a supported advanced indexing operation. - for i, entry in enumerate(ss): - if isinstance(entry, slice) and entry.start is None and entry.stop is None: - continue - elif array_utils.is_int(entry) or array_utils.is_uint(entry): - continue - elif array_utils.is_array_like(entry): - if array_encountered: - raise NotImplementedError( - "Advanced indexing is only supported along a single axis." - ) - is_handled_advanced = True - array_encountered = True - axis = i - if not (np.all(0 <= entry) and np.all(entry < self.shape[axis])): - raise IndexError( - "Advanced indexing array along axis %s is out of bounds." % axis - ) - else: - if array_encountered: - raise NotImplementedError( - "Advanced indexing is only supported " - "with full slices and integers along other axes." 
- ) - is_handled_advanced = False - break - - return ss, is_handled_advanced, axis - - def __getitem__(self, item): - ss, is_handled_advanced, axis = self._preprocess_subscript(item) - - if is_handled_advanced: - # Treat this as a shuffle. - return self._advanced_single_array_select(ss, axis=axis) - - av: ArrayView = ArrayView.from_block_array(self) - # TODO (hme): We don't have to create, but do so for now until we need to optimize. - return av[ss].create() - - def _advanced_single_array_select(self, ss: tuple, axis: int = 0): - # Create output array along the axis of the selection operation. - # We don't allocate zeros for output array. Instead, we let the update kernel - # create the initial set of zeros to save some memory. - array = ss[axis] - assert len(array.shape) == 1 - - # TODO: We may encounter block shape incompatability due to this. - block_size = self.block_shape[axis] - self.km.update_block_shape_map(array.shape[0], block_size) - - dst_axis = None - shape = [] - block_shape = [] - for i in range(len(self.shape)): - if i == axis: - dst_axis = len(shape) - shape.append(array.shape[0]) - block_shape.append(block_size) - elif i < len(ss): - if isinstance(ss[i], slice): - shape.append(self.shape[i]) - block_shape.append(self.block_shape[i]) - else: - # It's an index. We drop the indices. - continue - else: - shape.append(self.shape[i]) - block_shape.append(self.block_shape[i]) - - dst_arr = BlockArray( - ArrayGrid( - shape=tuple(shape), - block_shape=tuple(block_shape), - dtype=self.dtype.__name__, - ), - km=self.km, - ) - - src_arr = self - np_ss = ss - ss = self.km.put( - ss, - syskwargs={ - "grid_entry": (0,), - "grid_shape": (1,), - }, - ) - for src_grid_entry in src_arr.grid.get_entry_iterator(): - src_coord: tuple = src_arr.grid.get_entry_coordinates(src_grid_entry) - src_block: Block = src_arr.blocks[src_grid_entry] - - # Make sure index values in subscript are within bounds of src_arr. - # We also prepare dst_grid_entry here. 
- dst_grid_entry_list = [] - skip = False - for curr_axis in range(len(np_ss)): - if curr_axis == axis: - dst_grid_entry_list.append(None) - elif isinstance(np_ss[curr_axis], slice): - dst_grid_entry_list.append(src_grid_entry[curr_axis]) - elif not ( - src_coord[curr_axis] - <= np_ss[curr_axis] - < src_coord[curr_axis] + src_block.shape[curr_axis] - ): - skip = True - break - if skip: - continue - for curr_axis in range(len(np_ss), len(src_grid_entry)): - dst_grid_entry_list.append(src_grid_entry[curr_axis]) - - for j in range(dst_arr.grid.grid_shape[dst_axis]): - dst_grid_entry_list[dst_axis] = j - dst_grid_entry = tuple(dst_grid_entry_list) - dst_block: Block = dst_arr.blocks[dst_grid_entry] - dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) - - if dst_block.oid is None: - dst_arg = (dst_block.shape, dst_block.dtype) - else: - dst_arg = dst_block.oid - dst_block.oid = self.km.advanced_select_block_along_axis( - dst_arg, - src_block.oid, - ss, - dst_axis, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - return dst_arr - - def __setitem__(self, key, value): - value: BlockArray = BlockArray.to_block_array(value, self.km) - ss, is_handled_advanced, axis = self._preprocess_subscript(key) - if is_handled_advanced: - return self._advanced_single_array_assign(ss, value, axis) - av: ArrayView = ArrayView.from_block_array(self) - av[key] = value - - def _advanced_single_array_assign( - self, - ss: tuple, - value, - axis: int, - ): - array = ss[axis] - assert len(array.shape) == 1 - - # The subscript contains a single array. We therefore know one of two things is true: - # 1. value is the same shape as self along axes != axis. - # 2. value is scalar or 1-dimensional. - # We currently don't support the case where value may broadcasted if it has more dims. - # This should be a straight-forward future task. - value: BlockArray = value - mode = None - if len(value.shape) == 0: - # subscripted value per block will broadcast to other dimensions. - mode = "scalar" - elif len(value.shape) == 1: - # assert len(value.shape) == len(ss) - mode = "single-dim" - # Can broadcast if trailing dim matches. - assert len(ss[axis]) == value.shape[0] - - for i in range(len(self.shape)): - if i == axis: - assert len(ss[i]) == value.shape[0] - elif i < axis: - # Nothing to check here. - # These entries are : or integer. - pass - else: - if i < len(ss): - if not isinstance(ss[i], slice): - # ss[i] is an integer. - continue - # If we're here, then the rest of the subscript operator - # will resolve to :, which is not broadcastable. - raise ValueError( - "Cannot broadcast input array " - "from shape %s into shape %s" - % (value.shape, tuple([value.shape[0]] + list(self.shape[i:]))) - ) - elif len(value.shape) == len(self.shape): - mode = "multi-dim" - new_block_shape = [] - for i in range(len(self.shape)): - if i == axis: - new_block_shape.append(value.block_shape[i]) - elif i < len(ss) and ( - array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) - ): - # These entries are : or integer. - # assert array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) - assert value.shape[i] == 1 - new_block_shape.append(1) - else: - assert value.shape[i] == self.shape[i], "Shape mismatch." - new_block_shape.append(self.block_shape[i]) - new_block_shape = tuple(new_block_shape) - if new_block_shape != value.block_shape: - # TODO: This message occurs on X[idx[:n]] = X[idx[n:]] + 0.5, - # even when n is a multiple of block_shape[0]. 
- warnings.warn( - ("Assigned value block shape %s " % str(value.block_shape)) - + ( - "does not match block shape %s of assignee. " - % str(new_block_shape) - ) - + "Applying reshape to assigned value." - ) - value = value.reshape(block_shape=new_block_shape) - - # Like select, iterate over destination blocks along the axis being updated. - # e.g. if self is 2-dim and axis=0, then fix the row and iterate over the columns. - # If value has the same shape as self, then for each destination block, - # iterate over the blocks in value along axis. - # e.g. if self is 2-dim and axis=0, then for the given column, iterate over the rows - # of value. - # If value is scalar, then attempt to assign it to every destination block. - # If value is 1-dim, the just iterate over the dim and assign accordingly. - - dst_arr = self - src_arr = value - src_grid_shape = src_arr.grid.grid_shape - np_ss = ss - ss = self.km.put( - ss, - syskwargs={ - "grid_entry": (0,), - "grid_shape": (1,), - }, - ) - for dst_grid_entry in dst_arr.grid.get_entry_iterator(): - dst_block: Block = dst_arr.blocks[dst_grid_entry] - dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) - - # Make sure index values in subscript are within bounds of dst_arr. - # We don't need to check src_arr: - # 1) The block shapes of dst_arr and src_arr are the same except along axis - # and indices in ss. We are not concerned with axes the indices in ss correspond to, - # because they are of size 1 in src_arr => we only need to check that indices - # fall within bounds of dst_arr. - # 2) For each dst_arr, we test the values - # to assign to dst_arr by traverse the src_arr along axis. - # Thus, size along all other axes are equal or broadcasted. - skip = False - for curr_axis in range(len(np_ss)): - if curr_axis == axis or isinstance(np_ss[curr_axis], slice): - continue - if not ( - dst_coord[curr_axis] - <= np_ss[curr_axis] - < dst_coord[curr_axis] + dst_block.shape[curr_axis] - ): - skip = True - break - if skip: - continue - - if mode == "scalar": - src_block: Block = src_arr.blocks.item() - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_block.grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - elif mode == "single-dim": - for src_grid_entry in src_arr.grid.get_entry_iterator(): - src_block: Block = src_arr.blocks[src_grid_entry] - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - elif mode == "multi-dim": - for j in range(src_grid_shape[axis]): - # Apply sel from each block along axis of src_arr. - # e.g. for 2 dim array, we fix the column blocks - # given by dst_grid_entry, and iterate over the rows. 
- src_grid_entry = tuple( - list(dst_grid_entry[:axis]) - + [j] - + list(dst_grid_entry[axis + 1 :]) - ) - src_block: Block = src_arr.blocks[src_grid_entry] - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - return dst_arr - @staticmethod def to_block_array(obj, km: KernelManager, block_shape=None): if isinstance(obj, BlockArray): @@ -898,197 +511,3 @@ def __inequality__(self, op, other): op, other_block, args={} ) return result - - -class Reshape: - @staticmethod - def compute_shape(shape, input_shape): - size = np.product(shape) - if -1 in input_shape: - new_shape = [] - other_dim_prod = 1 - negative_one_seen = False - for dim in input_shape: - if dim == -1: - if negative_one_seen: - raise Exception("Only one -1 permitted in reshape.") - negative_one_seen = True - continue - other_dim_prod *= dim - if size % other_dim_prod != 0: - raise Exception("Invalid shape.") - for dim in input_shape: - if dim == -1: - new_shape.append(size // other_dim_prod) - else: - new_shape.append(dim) - else: - new_shape = input_shape - assert size == np.product(new_shape) - return new_shape - - def _group_index_lists_by_block( - self, dst_slice_tuples, src_grid: ArrayGrid, dst_index_list, src_index_list - ): - # TODO(hme): Keep this function here until it's needed for greater support of - # selection/assignment operations. - # Block grid entries needed to write to given dst_slice_selection. - src_blocks = {} - dst_slice_np = np.array(dst_slice_tuples).T - dst_index_arr = np.array(dst_index_list) - src_index_arr = np.array(src_index_list) - # Pick the smallest type to represent indices. - # A set of these indices may be transmitted over the network, - # so we want to pick the smallest encoding possible. - index_types = [ - (2**8, np.uint8), - (2**16, np.uint16), - (2**32, np.uint32), - (2**64, np.uint64), - ] - index_type = None - for bound, curr_index_type in index_types: - if np.all(np.array(src_grid.block_shape) < bound) and np.all( - dst_slice_np[1] < bound - ): - index_type = curr_index_type - break - if index_type is None: - raise Exception("Unable to encode block indices, blocks are too large.") - for grid_entry in src_grid.get_entry_iterator(): - src_slice_np = np.array(src_grid.get_slice_tuples(grid_entry)).T - index_pairs = [] - for i in range(src_index_arr.shape[0]): - src_index = src_index_arr[i] - dst_index = dst_index_arr[i] - if np.all( - (src_slice_np[0] <= src_index) & (src_index < src_slice_np[1]) - ): - index_pair = ( - (dst_index - dst_slice_np[0]).astype(index_type), - (src_index - src_slice_np[0]).astype(index_type), - ) - index_pairs.append(index_pair) - if len(index_pairs) > 0: - src_blocks[grid_entry] = index_pairs - return src_blocks - - def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray: - # This is the worst-case scenario. - # Generate index mappings per block, and group source indices to minimize - # RPCs and generation of new objects. 
- km = arr.km - dst_arr = BlockArray.empty( - shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km - ) - for dst_grid_entry in dst_arr.grid.get_entry_iterator(): - dst_block: Block = dst_arr.blocks[dst_grid_entry] - dst_slice_selection = dst_arr.grid.get_slice(dst_grid_entry) - dst_index_list = array_utils.slice_sel_to_index_list(dst_slice_selection) - src_index_list = array_utils.translate_index_list( - dst_index_list, shape, arr.shape - ) - src_blocks = self._group_index_lists_by_block( - dst_arr.grid.get_slice_tuples(dst_grid_entry), - arr.grid, - dst_index_list, - src_index_list, - ) - for src_grid_entry in src_blocks: - src_block: Block = arr.blocks[src_grid_entry] - index_pairs = src_blocks[src_grid_entry] - syskwargs = { - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - } - dst_block.oid = km.update_block_by_index( - dst_block.oid, src_block.oid, index_pairs, syskwargs=syskwargs - ) - return dst_arr - - def _block_shape_reshape(self, arr, block_shape): - rarr: BlockArray = BlockArray.empty(arr.shape, block_shape, arr.dtype, arr.km) - for grid_entry in rarr.grid.get_entry_iterator(): - grid_entry_slice = rarr.grid.get_slice(grid_entry) - # TODO (hme): This could be less costly. - rarr[grid_entry_slice] = arr[grid_entry_slice] - return rarr - - def _strip_ones(self, shape): - return tuple(filter(lambda x: x != 1, shape)) - - def _check_positions_ones(self, shape, block_shape): - # If a position in the shape is 1, then the corresponding - # position in block_shape should also be 1. - for i in range(len(shape)): - if shape[i] == 1: - if shape[i] != block_shape[i]: - return False - return True - - def _is_simple_reshape(self, arr: BlockArray, shape, block_shape): - # Is the reshape a difference of factors of 1? - # Strip out 1s and compare. - # If a position in the shape is 1, then the corresponding - # position in block_shape should also be 1. - - # If source shape and dest shape are the same or source block_shape and dest block_shape - # are same, this is not a simple reshape. - if shape == arr.shape or block_shape == arr.block_shape: - return False - - # Checks if source shape and dest shape are same & source block_shape and dest - # block_shape are same after stripping ones. - if not ( - self._strip_ones(shape) == self._strip_ones(arr.shape) - and self._strip_ones(block_shape) == self._strip_ones(arr.block_shape) - ): - return False - if not self._check_positions_ones(shape, block_shape): - return False - return True - - def _simple_reshape(self, arr, shape, block_shape): - # Reshape the array of blocks only. - # This is only used when the difference in shape are factors of 1s, - # and the ordering of other factors are maintained. - - # Check assumptions. - assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape)) - - # Create new grid, and perform reshape on blocks - # to simplify access to source blocks. 
- grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__) - src_blocks = arr.blocks.reshape(grid.grid_shape) - rarr = BlockArray(grid, arr.km) - for grid_entry in grid.get_entry_iterator(): - src_block: Block = src_blocks[grid_entry] - dst_block: Block = rarr.blocks[grid_entry] - syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape} - dst_block.oid = arr.km.reshape( - src_block.oid, dst_block.shape, syskwargs=syskwargs - ) - return rarr - - def _validate(self, arr, shape, block_shape): - assert -1 not in shape - assert -1 not in block_shape - assert len(shape) == len(block_shape) - assert np.product(arr.shape) == np.product(shape) - - def __call__(self, arr: BlockArray, shape, block_shape): - self._validate(arr, shape, block_shape) - if arr.shape == shape and arr.block_shape == block_shape: - return arr - elif self._is_simple_reshape(arr, shape, block_shape): - return self._simple_reshape(arr, shape, block_shape) - elif arr.shape == shape and arr.block_shape != block_shape: - return self._block_shape_reshape(arr, block_shape) - elif arr.shape != shape and arr.block_shape == block_shape: - # Just do full reshape for this case as well. - # Though there may be a better solution, we generally expect - # the block shape to change with array shape. - return self._arbitrary_reshape(arr, shape, block_shape) - else: - assert arr.shape != shape and arr.block_shape != block_shape - return self._arbitrary_reshape(arr, shape, block_shape) diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py index c3e99c6c..0a51947e 100644 --- a/nums/core/array/sparse.py +++ b/nums/core/array/sparse.py @@ -268,6 +268,28 @@ def _get_nbytes(self): def __repr__(self): return f"SparseBlockArray({self.blocks})" + @classmethod + def empty(cls, shape, block_shape, dtype, km: KernelManager): + return SparseBlockArray.create( + "empty", shape, block_shape, dtype, km, fill_value=0 + ) + + @classmethod + def create( + cls, create_op_name, shape, block_shape, dtype, km: KernelManager, fill_value=0 + ): + grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__) + grid_meta = grid.to_meta() + arr = SparseBlockArray(grid, km, fill_value) + for grid_entry in grid.get_entry_iterator(): + arr.blocks[grid_entry].oid = km.new_sparse_block( + create_op_name, + grid_entry, + grid_meta, + syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, + ) + return arr + @classmethod def from_np(cls, arr, block_shape, copy, km, fill_value=0): dtype_str = str(arr.dtype) @@ -423,6 +445,7 @@ def to_block_array(obj, km: KernelManager, block_shape=None): raise Exception("Unsupported type %s" % type(obj)) if block_shape is None: block_shape = km.get_block_shape(np_array.shape, np_array.dtype) + # Assume object is dense. 
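+        # Inputs converted here are wrapped as dense BlockArrays via
+        # BlockArray.from_np, even when called from the sparse module.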
return BlockArray.from_np(np_array, block_shape, False, km) def check_or_convert_other(self, other, compute_block_shape=False): diff --git a/nums/core/array/view.py b/nums/core/array/view.py index f2e20ccb..88e79267 100644 --- a/nums/core/array/view.py +++ b/nums/core/array/view.py @@ -19,7 +19,7 @@ from nums.core.array import selection from nums.core.array import utils as array_utils -from nums.core.array.base import Block, BlockArrayBase +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.selection import BasicSelection from nums.core.kernel.kernel_manager import KernelManager from nums.core.grid.grid import ArrayGrid @@ -175,7 +175,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: ] if src_dst_intersection_block.is_empty(): continue - src_block: Block = self._source.blocks[src_grid_entry] + src_block: BlockBase = self._source.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block: BasicSelection = src_sel_arr[src_grid_entry] src_dep_sel_loc = src_dst_intersection_block - src_sel_block.position() @@ -184,7 +184,7 @@ def create_basic_single_step(self, concrete_cls) -> BlockArrayBase: src_dst_intersection_block - dst_sel_offset_block.position() ) dst_params.append((dst_block_sel_loc.selector(), False)) - dst_block: Block = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ + dst_block: BlockBase = dst_ba.blocks.reshape(dst_grid_bc.grid_shape)[ dst_grid_entry_bc ] dst_block.oid = km.create_block( @@ -300,8 +300,8 @@ def assign_references(self, dst_sel: BasicSelection, value): ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. - dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = value._source.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = value._source.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -331,8 +331,8 @@ def assign_references(self, dst_sel: BasicSelection, value): ) # This is a reference assignment, and the grid properties between the # two blocks may differ, so retain those properties in the copy. 
- dst_block: Block = self._source.blocks[dst_grid_entry] - src_block_copy: Block = src_ba.blocks[src_grid_entry].copy() + dst_block: BlockBase = self._source.blocks[dst_grid_entry] + src_block_copy: BlockBase = src_ba.blocks[src_grid_entry].copy() src_block_copy.grid_entry = dst_block.grid_entry src_block_copy.grid_shape = dst_block.grid_shape self._source.blocks[dst_grid_entry] = src_block_copy @@ -393,7 +393,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): src_oids = [] src_params = [] dst_params = [] - dst_block: Block = dst_ba.blocks[dst_grid_entry] + dst_block: BlockBase = dst_ba.blocks[dst_grid_entry] for src_index, src_grid_entry_bc in enumerate( src_inflated_grid.get_entry_iterator() ): @@ -404,7 +404,7 @@ def basic_assign_single_step(self, dst_sel: BasicSelection, value): continue src_grid_entry = src_grid_entry_iterator[src_index] - src_block: Block = src_ba_bc.blocks[src_grid_entry] + src_block: BlockBase = src_ba_bc.blocks[src_grid_entry] src_oids.append(src_block.oid) src_sel_block_offset: BasicSelection = src_sel_offset[src_grid_entry_bc] diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index 79c2be92..76631799 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -179,6 +179,9 @@ def sparse_nnz(self, arr, syskwargs: Dict): def sparse_nbytes(self, arr, syskwargs: Dict): raise NotImplementedError() + def new_sparse_block(self, op_name, grid_entry, grid_meta, syskwargs: Dict): + raise NotImplementedError() + def sparse_random_block( self, rng_params, diff --git a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py index ef671a5e..f5fee593 100644 --- a/nums/core/kernel/numpy_kernel.py +++ b/nums/core/kernel/numpy_kernel.py @@ -540,6 +540,16 @@ def sparse_nnz(self, arr): def sparse_nbytes(self, arr): return arr.nbytes + def new_sparse_block(self, op_name, grid_entry, grid_meta): + op_func = sparse.__getattribute__(op_name) + grid = ArrayGrid.from_meta(grid_meta) + block_shape = grid.get_block_shape(grid_entry) + if op_name == "eye": + assert np.all(np.diff(grid_entry) == 0) + return op_func(*block_shape, dtype=grid.dtype) + else: + return op_func(block_shape, dtype=grid.dtype) + def sparse_random_block( self, rng_params, diff --git a/nums/experimental/optimizer/fusion.py b/nums/experimental/optimizer/fusion.py index e20b04e7..1729f4d7 100644 --- a/nums/experimental/optimizer/fusion.py +++ b/nums/experimental/optimizer/fusion.py @@ -5,7 +5,6 @@ import numpy as np from nums.core.array import utils as array_utils -from nums.core.array.base import Block from nums.core.grid.grid import Device from nums.experimental.optimizer.clusterstate import ClusterState diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 5c505340..361e740f 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -21,7 +21,7 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import Block +from nums.core.array.base import BlockBase, Block from nums.core.grid.grid import Device from nums.core.kernel.kernel_manager import KernelManager from nums.experimental.optimizer.clusterstate import ClusterState @@ -313,7 +313,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.child, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] 
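         # Record the op's memory cost and the new block's placement in the cluster state.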
self.cluster_state.commit_uop(self._mem_cost(), self.child.block.id, device) # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( @@ -330,12 +330,12 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block op_name, args = self.op_name, {} if op_name == "transpose": - block: Block = block.transpose(defer=True) + block: BlockBase = block.transpose(defer=True) else: - block: Block = block.ufunc(op_name, device=device) + block: BlockBase = block.ufunc(op_name, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block leaf.tree_node_size = self.child.tree_node_size.uop(op_name) @@ -344,7 +344,7 @@ def _collapse(self, device: Device): def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(block.shape) if sync_nnz > 1: @@ -434,10 +434,10 @@ def copy(self, cluster_state, parent=None, new_ids=False): def _collapse(self, device: Device): assert isinstance(self.child, Leaf) - child_block: Block = self.child.block + child_block: BlockBase = self.child.block op_name, args = self.op_name, {} - block = Block( + block: BlockBase = Block( grid_entry=self.grid_entry(), grid_shape=self.grid_shape(), shape=self.shape(), @@ -467,7 +467,7 @@ def _collapse(self, device: Device): def _mem_cost(self): assert isinstance(self.child, Leaf) - block: Block = self.child.block + block: BlockBase = self.child.block if block.is_dense: return np.product(self.shape()) if sync_nnz > 1: @@ -661,7 +661,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -690,8 +690,8 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: def _collapse(self, device: Device): assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) - lblock: Block = self.left.block - rblock: Block = self.right.block + lblock: BlockBase = self.left.block + rblock: BlockBase = self.right.block if self.op_name == "matmul": op_name, args = "tensordot", {"axes": 1} elif self.op_name == "tensordot": @@ -699,7 +699,7 @@ def _collapse(self, device: Device): else: op_name, args = self.op_name, {} assert array_utils.can_broadcast_shapes(lblock.shape, rblock.shape) - block: Block = lblock.bop(op_name, rblock, args=args, device=device) + block: BlockBase = lblock.bop(op_name, rblock, args=args, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block leaf.tree_node_size = self.left.tree_node_size.bop( @@ -714,8 +714,8 @@ def _mem_cost(self): # Computes the memory required to perform this operation. # We approximate by just computing the memory required to store the result. 
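         # For example, broadcasting a (1000, 1) block against a (1000, 1000) block yields
         # a (1000, 1000) result, i.e. roughly 10**6 elements to store.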
assert isinstance(self.left, Leaf) and isinstance(self.right, Leaf) - lblock: Block = self.left.block - rblock: Block = self.right.block + lblock: BlockBase = self.left.block + rblock: BlockBase = self.right.block if self.op_name == "matmul": op_name, args = "tensordot", {"axes": 1} elif self.op_name == "tensordot": @@ -981,7 +981,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1007,7 +1007,7 @@ def _collapse(self, device: Device): block_oids.append(child.block.oid) if km is None: km = child.block.km - block: Block = Block( + block: BlockBase = Block( self._grid_entry, self._grid_shape, self._shape, self._dtype, False, km ) block._device = device @@ -1175,7 +1175,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: block_ids = [child.block.id for child in self.children] result = self._collapse(device) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # This updates load on nodes and channels. # This also updates block states to indicate that they now reside on the provided nodes. # Update the cluster state after computing the leaf, so that transfer costs are properly @@ -1204,7 +1204,7 @@ def _collapse(self, device: Device): block_oids.append(child.block.oid) if km is None: km = child.block.km - block: Block = Block( + block: BlockBase = Block( self.grid_entry(), self.grid_shape(), self.shape(), self.dtype(), False, km ) block._device = device diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index f03a4163..f4a2300b 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -23,7 +23,7 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils -from nums.core.array.base import BlockArrayBase, Block +from nums.core.array.base import BlockBase, BlockArrayBase from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.kernel.kernel_manager import KernelManager @@ -50,7 +50,7 @@ def graphs_from_ba( ) -> np.ndarray: graphs = np.empty(shape=ba.grid.grid_shape, dtype=np.object) for grid_entry in ba.grid.get_entry_iterator(): - block: Block = ba.blocks[grid_entry] + block: BlockBase = ba.blocks[grid_entry] # Allocate the block to the node on which it's created. 
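         # Placement follows the kernel manager's device grid for this grid entry.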
km: KernelManager = ba.km device: Device = km.device_grid.get_device( @@ -166,7 +166,7 @@ def iterator(self): yield node def to_blocks(self) -> np.ndarray: - blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=Block) + blocks: np.ndarray = np.empty(self.grid.grid_shape, dtype=BlockBase) for grid_entry in self.grid.get_entry_iterator(): leaf: TreeNode = self.graphs[grid_entry] assert isinstance(leaf, Leaf), "%s,%s" % (str(leaf), type(leaf)) diff --git a/nums/experimental/optimizer/reduction_ops.py b/nums/experimental/optimizer/reduction_ops.py index 1a363513..5ef10f03 100644 --- a/nums/experimental/optimizer/reduction_ops.py +++ b/nums/experimental/optimizer/reduction_ops.py @@ -20,7 +20,7 @@ import numpy as np from nums.core.settings import sync_nnz -from nums.core.array.base import Block +from nums.core.array.base import BlockBase from nums.core.grid.grid import Device from nums.core.kernel.kernel_manager import KernelManager from nums.experimental.optimizer.clusterstate import ClusterState @@ -215,7 +215,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode: assert isinstance(left, Leaf) and isinstance(right, Leaf) result = self._collapse(device, left, right) new_leaf: Leaf = result[0] - new_block: Block = result[1] + new_block: BlockBase = result[1] # Update action leaf queue. assert set(leaf_ids) == {self.action_leaf_q.pop(0), self.action_leaf_q.pop(0)} @@ -257,13 +257,13 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode: return self def _collapse(self, device: Device, left: Leaf, right: Leaf): - lblock: Block = left.block - rblock: Block = right.block + lblock: BlockBase = left.block + rblock: BlockBase = right.block if self.op_name == "matmul": raise ValueError("matmul is not a supported reduction operator.") op_name, args = self.op_name, {} assert lblock.shape == rblock.shape - block: Block = lblock.copy() + block: BlockBase = lblock.copy() block.transposed = False block.dtype = array_utils.get_reduce_output_type(self.op_name, lblock.dtype) block.oid = lblock.km.bop_reduce( @@ -294,12 +294,12 @@ def _mem_cost(self, leafs): shape = None for leaf in leafs: assert leaf.tree_node_id in self.leafs_dict - leaf_block: Block = leaf.block + leaf_block: BlockBase = leaf.block if shape is None: shape = leaf_block.shape else: assert leaf_block.shape == shape - leaf_block: Block = leafs[0].block + leaf_block: BlockBase = leafs[0].block if leaf_block.is_dense: return leaf_block.size() if sync_nnz > 1: diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 85f0254b..302bd082 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -171,6 +171,17 @@ def test_tensordot(app_inst: ArrayApplication): assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) +# def test_getitem(app_inst: ArrayApplication): +# x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) +# x1_sp = sparse.COO.from_numpy(x1, fill_value=2) +# x1_ba = app_inst.array(x1, block_shape=(2, 2)) +# x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) +# y_sp = x1_sp[0:2, 1:3] +# y_sba = x1_sba[0:2, 1:3] +# assert y_sba.fill_value == y_sp.fill_value # 2 +# assert np.array_equal(y_sp.todense(), y_sba.to_ba().get()) + + def test_sdtp(app_inst: ArrayApplication): shape = 50, 50, 50 block_shape = 10, 10, 10 From 7bb4470123b70155cce8afc11d8f7dc609e3e2f3 Mon Sep 17 00:00:00 2001 From: Daniel Zou Date: Tue, 16 Aug 2022 08:46:06 -0700 Subject: [PATCH 3/5] Remove fill_value from API --- nums/core/array/base.py | 674 
++-------------------- nums/core/array/blockarray.py | 606 ++++++++++++++++++- nums/core/array/random.py | 10 +- nums/core/array/sparse.py | 191 ++---- nums/core/array/utils.py | 27 +- nums/core/kernel/kernel_interface.py | 3 +- nums/core/kernel/numpy_kernel.py | 14 +- nums/experimental/optimizer/graph.py | 2 +- nums/experimental/optimizer/grapharray.py | 6 +- nums/experimental/optimizer/size.py | 98 ++-- nums/numpy/random.py | 11 +- tests/core/array/test_sparse.py | 54 +- tests/experimental/optimizer/test_size.py | 25 +- 13 files changed, 783 insertions(+), 938 deletions(-) diff --git a/nums/core/array/base.py b/nums/core/array/base.py index 648a4fc5..bec634fe 100644 --- a/nums/core/array/base.py +++ b/nums/core/array/base.py @@ -17,6 +17,7 @@ # pylint: disable=too-many-lines import warnings +from numba.core.errors import NumbaNotImplementedError import numpy as np @@ -52,11 +53,10 @@ def __init__( self.id = Block.block_id_counter # Set if a device id was used to compute this block. self._device = None - self.fill_value = None @property def is_dense(self): - return self.fill_value is None + raise NotImplementedError() @property def nbytes(self): @@ -95,7 +95,31 @@ def true_grid_shape(self): return self.grid_shape def transpose(self, defer=False, redistribute=False): - raise NotImplementedError() + # If defer is True, this operation does not modify the remote object. + # If defer is True and redistribute is False, + # this operation does not move the remote object. + grid_entryT = tuple(reversed(self.grid_entry)) + grid_shapeT = tuple(reversed(self.grid_shape)) + blockT = type(self)( + grid_entry=grid_entryT, + grid_shape=grid_shapeT, + shape=tuple(reversed(self.shape)), + dtype=self.dtype, + transposed=not self.transposed, + km=self.km, + ) + blockT.oid = self.oid + if not defer: + blockT.transposed = False + if redistribute: + syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT} + else: + syskwargs = { + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + } + blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs) + return blockT def device(self): if self._device is not None: @@ -299,6 +323,10 @@ def __init__( ): super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id) + @property + def is_dense(self): + return True + @property def nbytes(self): return np.prod(self.shape) * np.dtype(self.dtype).itemsize @@ -319,33 +347,6 @@ def copy(self, shallow=True): block.oid = self.oid return block - def transpose(self, defer=False, redistribute=False): - # If defer is True, this operation does not modify the remote object. - # If defer is True and redistribute is False, - # this operation does not move the remote object. - grid_entryT = tuple(reversed(self.grid_entry)) - grid_shapeT = tuple(reversed(self.grid_shape)) - blockT = Block( - grid_entry=grid_entryT, - grid_shape=grid_shapeT, - shape=tuple(reversed(self.shape)), - dtype=self.dtype, - transposed=not self.transposed, - km=self.km, - ) - blockT.oid = self.oid - if not defer: - blockT.transposed = False - if redistribute: - syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT} - else: - syskwargs = { - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - } - blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs) - return blockT - def map_uop(self, op_name, args=None, kwargs=None, device=None): # This retains transpose. 
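         # The copy below keeps the block's transposed flag, so a deferred transpose
         # still applies to the op's output.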
block = self.copy() @@ -459,11 +460,10 @@ def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None self.ndim = len(self.shape) self.dtype = self.grid.dtype self.blocks = blocks - self.fill_value = None @property def is_dense(self): - return self.fill_value is None + raise NotImplementedError() def __repr__(self): return "BlockArray(" + str(self.blocks) + ")" @@ -545,7 +545,29 @@ def to_block_array(obj, km: KernelManager, block_shape=None): raise NotImplementedError() def transpose(self, defer=False, redistribute=False): - raise NotImplementedError() + """ + Transpose this matrix. Only use defer with arithmetic operations. + Setting redistribute to True may significantly impact performance. + :param defer: When true, the transpose operation will be applied + with the next arithmetic operation. + :param redistribute: If defer is false, setting this to true will + redistribute the data according to the device grid (data placement policy). + This parameter has no effect when defer is true. + :return: The transposed matrix. + """ + if defer and redistribute: + warnings.warn("defer is True, redistribute=True will be ignored.") + metaT = self.grid.to_meta() + metaT["shape"] = tuple(reversed(metaT["shape"])) + metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) + gridT = ArrayGrid.from_meta(metaT) + rarrT = type(self)(gridT, self.km) + rarrT.blocks = np.copy(self.blocks.T) + for grid_entry in rarrT.grid.get_entry_iterator(): + rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( + defer, redistribute + ) + return rarrT def broadcast_to(self, shape): b = array_utils.broadcast(self.shape, shape) @@ -570,391 +592,6 @@ def broadcast_to(self, shape): result.blocks = broadcast return result - def reshape(self, *shape, **kwargs): - block_shape = kwargs.get("block_shape", None) - if array_utils.is_int(shape): - shape = (shape,) - elif len(shape) == 0: - shape = self.shape - elif isinstance(shape[0], (tuple, list)): - assert len(shape) == 1 - shape = shape[0] - else: - assert all(np.issubdtype(type(n), int) for n in shape) - shape = Reshape.compute_shape(self.shape, shape) - if block_shape is None: - if shape == self.shape: - # This is a noop. - block_shape = self.block_shape - else: - block_shape = self.km.get_block_shape(shape, self.dtype) - return Reshape()(self, shape, block_shape) - - def _preprocess_subscript(self, item): - if not isinstance(item, tuple): - ss = (item,) - else: - ss = item - # We need to fetch any block arrays. - tmp = [] - for entry in ss: - if isinstance(entry, BlockArrayBase): - val = entry.get() - else: - val = entry - if isinstance(val, list): - val = np.array(val) - if isinstance(val, np.ndarray): - # If this is a Boolean mask, convert it to integers. - if array_utils.is_bool(val.dtype, type_test=True): - val = np.arange(len(val))[val] - if val.shape == (): - val = val.item() - tmp.append(val) - ss = tuple(tmp) - is_handled_advanced = False - array_encountered = False - axis = None - - # Check if this is a supported advanced indexing operation. - for i, entry in enumerate(ss): - if isinstance(entry, slice) and entry.start is None and entry.stop is None: - continue - elif array_utils.is_int(entry) or array_utils.is_uint(entry): - continue - elif array_utils.is_array_like(entry): - if array_encountered: - raise NotImplementedError( - "Advanced indexing is only supported along a single axis." 
- ) - is_handled_advanced = True - array_encountered = True - axis = i - if not (np.all(0 <= entry) and np.all(entry < self.shape[axis])): - raise IndexError( - "Advanced indexing array along axis %s is out of bounds." % axis - ) - else: - if array_encountered: - raise NotImplementedError( - "Advanced indexing is only supported " - "with full slices and integers along other axes." - ) - is_handled_advanced = False - break - - return ss, is_handled_advanced, axis - - def __getitem__(self, item): - ss, is_handled_advanced, axis = self._preprocess_subscript(item) - - if is_handled_advanced: - # Treat this as a shuffle. - return self._advanced_single_array_select(ss, axis=axis) - - # This is to deal with circular imports. Little overhead since this happens once per call. - # However, would be better to rearrange modules in the future. - from nums.core.array.view import ArrayView - - av: ArrayView = ArrayView.from_block_array(self) - # TODO (hme): We don't have to create, but do so for now until we need to optimize. - return av[ss].create() - - def _advanced_single_array_select(self, ss: tuple, axis: int = 0): - # Create output array along the axis of the selection operation. - # We don't allocate zeros for output array. Instead, we let the update kernel - # create the initial set of zeros to save some memory. - array = ss[axis] - assert len(array.shape) == 1 - - # TODO: We may encounter block shape incompatability due to this. - block_size = self.block_shape[axis] - self.km.update_block_shape_map(array.shape[0], block_size) - - dst_axis = None - shape = [] - block_shape = [] - for i in range(len(self.shape)): - if i == axis: - dst_axis = len(shape) - shape.append(array.shape[0]) - block_shape.append(block_size) - elif i < len(ss): - if isinstance(ss[i], slice): - shape.append(self.shape[i]) - block_shape.append(self.block_shape[i]) - else: - # It's an index. We drop the indices. - continue - else: - shape.append(self.shape[i]) - block_shape.append(self.block_shape[i]) - - dst_arr = type(self)( - ArrayGrid( - shape=tuple(shape), - block_shape=tuple(block_shape), - dtype=self.dtype.__name__, - ), - km=self.km, - ) - - src_arr = self - np_ss = ss - ss = self.km.put( - ss, - syskwargs={ - "grid_entry": (0,), - "grid_shape": (1,), - }, - ) - for src_grid_entry in src_arr.grid.get_entry_iterator(): - src_coord: tuple = src_arr.grid.get_entry_coordinates(src_grid_entry) - src_block: Block = src_arr.blocks[src_grid_entry] - - # Make sure index values in subscript are within bounds of src_arr. - # We also prepare dst_grid_entry here. 
- dst_grid_entry_list = [] - skip = False - for curr_axis in range(len(np_ss)): - if curr_axis == axis: - dst_grid_entry_list.append(None) - elif isinstance(np_ss[curr_axis], slice): - dst_grid_entry_list.append(src_grid_entry[curr_axis]) - elif not ( - src_coord[curr_axis] - <= np_ss[curr_axis] - < src_coord[curr_axis] + src_block.shape[curr_axis] - ): - skip = True - break - if skip: - continue - for curr_axis in range(len(np_ss), len(src_grid_entry)): - dst_grid_entry_list.append(src_grid_entry[curr_axis]) - - for j in range(dst_arr.grid.grid_shape[dst_axis]): - dst_grid_entry_list[dst_axis] = j - dst_grid_entry = tuple(dst_grid_entry_list) - dst_block: Block = dst_arr.blocks[dst_grid_entry] - dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) - - if dst_block.oid is None: - dst_arg = (dst_block.shape, dst_block.dtype) - else: - dst_arg = dst_block.oid - dst_block.oid = self.km.advanced_select_block_along_axis( - dst_arg, - src_block.oid, - ss, - dst_axis, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - return dst_arr - - def __setitem__(self, key, value): - value: BlockArrayBase = self.to_block_array(value, self.km) - ss, is_handled_advanced, axis = self._preprocess_subscript(key) - if is_handled_advanced: - return self._advanced_single_array_assign(ss, value, axis) - - # This is to deal with circular imports. Little overhead since this happens once per call. - # However, would be better to rearrange modules in the future. - from nums.core.array.view import ArrayView - - av: ArrayView = ArrayView.from_block_array(self) - av[key] = value - - def _advanced_single_array_assign( - self, - ss: tuple, - value, - axis: int, - ): - array = ss[axis] - assert len(array.shape) == 1 - - # The subscript contains a single array. We therefore know one of two things is true: - # 1. value is the same shape as self along axes != axis. - # 2. value is scalar or 1-dimensional. - # We currently don't support the case where value may broadcasted if it has more dims. - # This should be a straight-forward future task. - value: BlockArrayBase = value - mode = None - if len(value.shape) == 0: - # subscripted value per block will broadcast to other dimensions. - mode = "scalar" - elif len(value.shape) == 1: - # assert len(value.shape) == len(ss) - mode = "single-dim" - # Can broadcast if trailing dim matches. - assert len(ss[axis]) == value.shape[0] - - for i in range(len(self.shape)): - if i == axis: - assert len(ss[i]) == value.shape[0] - elif i < axis: - # Nothing to check here. - # These entries are : or integer. - pass - else: - if i < len(ss): - if not isinstance(ss[i], slice): - # ss[i] is an integer. - continue - # If we're here, then the rest of the subscript operator - # will resolve to :, which is not broadcastable. - raise ValueError( - "Cannot broadcast input array " - "from shape %s into shape %s" - % (value.shape, tuple([value.shape[0]] + list(self.shape[i:]))) - ) - elif len(value.shape) == len(self.shape): - mode = "multi-dim" - new_block_shape = [] - for i in range(len(self.shape)): - if i == axis: - new_block_shape.append(value.block_shape[i]) - elif i < len(ss) and ( - array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) - ): - # These entries are : or integer. - # assert array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) - assert value.shape[i] == 1 - new_block_shape.append(1) - else: - assert value.shape[i] == self.shape[i], "Shape mismatch." 
- new_block_shape.append(self.block_shape[i]) - new_block_shape = tuple(new_block_shape) - if new_block_shape != value.block_shape: - # TODO: This message occurs on X[idx[:n]] = X[idx[n:]] + 0.5, - # even when n is a multiple of block_shape[0]. - warnings.warn( - ("Assigned value block shape %s " % str(value.block_shape)) - + ( - "does not match block shape %s of assignee. " - % str(new_block_shape) - ) - + "Applying reshape to assigned value." - ) - value = value.reshape(block_shape=new_block_shape) - - # Like select, iterate over destination blocks along the axis being updated. - # e.g. if self is 2-dim and axis=0, then fix the row and iterate over the columns. - # If value has the same shape as self, then for each destination block, - # iterate over the blocks in value along axis. - # e.g. if self is 2-dim and axis=0, then for the given column, iterate over the rows - # of value. - # If value is scalar, then attempt to assign it to every destination block. - # If value is 1-dim, the just iterate over the dim and assign accordingly. - - dst_arr = self - src_arr = value - src_grid_shape = src_arr.grid.grid_shape - np_ss = ss - ss = self.km.put( - ss, - syskwargs={ - "grid_entry": (0,), - "grid_shape": (1,), - }, - ) - for dst_grid_entry in dst_arr.grid.get_entry_iterator(): - dst_block: BlockBase = dst_arr.blocks[dst_grid_entry] - dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) - - # Make sure index values in subscript are within bounds of dst_arr. - # We don't need to check src_arr: - # 1) The block shapes of dst_arr and src_arr are the same except along axis - # and indices in ss. We are not concerned with axes the indices in ss correspond to, - # because they are of size 1 in src_arr => we only need to check that indices - # fall within bounds of dst_arr. - # 2) For each dst_arr, we test the values - # to assign to dst_arr by traverse the src_arr along axis. - # Thus, size along all other axes are equal or broadcasted. - skip = False - for curr_axis in range(len(np_ss)): - if curr_axis == axis or isinstance(np_ss[curr_axis], slice): - continue - if not ( - dst_coord[curr_axis] - <= np_ss[curr_axis] - < dst_coord[curr_axis] + dst_block.shape[curr_axis] - ): - skip = True - break - if skip: - continue - - if mode == "scalar": - src_block: BlockBase = src_arr.blocks.item() - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_block.grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - elif mode == "single-dim": - for src_grid_entry in src_arr.grid.get_entry_iterator(): - src_block: BlockBase = src_arr.blocks[src_grid_entry] - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - elif mode == "multi-dim": - for j in range(src_grid_shape[axis]): - # Apply sel from each block along axis of src_arr. - # e.g. for 2 dim array, we fix the column blocks - # given by dst_grid_entry, and iterate over the rows. 
- src_grid_entry = tuple( - list(dst_grid_entry[:axis]) - + [j] - + list(dst_grid_entry[axis + 1 :]) - ) - src_block: BlockBase = src_arr.blocks[src_grid_entry] - src_coord: tuple = src_arr.grid.get_entry_coordinates( - src_grid_entry - ) - dst_block.oid = self.km.advanced_assign_block_along_axis( - dst_block.oid, - src_block.oid, - ss, - axis, - dst_coord, - src_coord, - syskwargs={ - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - }, - ) - return dst_arr - def tree_reduce( self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape ): @@ -1277,204 +914,3 @@ def __ne__(self, other): def __rne__(self, other): other = self.check_or_convert_other(other) return other.__inequality__("ne", self) - - -class Reshape: - @staticmethod - def compute_shape(shape, input_shape): - size = np.product(shape) - if -1 in input_shape: - new_shape = [] - other_dim_prod = 1 - negative_one_seen = False - for dim in input_shape: - if dim == -1: - if negative_one_seen: - raise Exception("Only one -1 permitted in reshape.") - negative_one_seen = True - continue - other_dim_prod *= dim - if size % other_dim_prod != 0: - raise Exception("Invalid shape.") - for dim in input_shape: - if dim == -1: - new_shape.append(size // other_dim_prod) - else: - new_shape.append(dim) - else: - new_shape = input_shape - assert size == np.product(new_shape) - return new_shape - - def _group_index_lists_by_block( - self, dst_slice_tuples, src_grid: ArrayGrid, dst_index_list, src_index_list - ): - # TODO(hme): Keep this function here until it's needed for greater support of - # selection/assignment operations. - # Block grid entries needed to write to given dst_slice_selection. - src_blocks = {} - dst_slice_np = np.array(dst_slice_tuples).T - dst_index_arr = np.array(dst_index_list) - src_index_arr = np.array(src_index_list) - # Pick the smallest type to represent indices. - # A set of these indices may be transmitted over the network, - # so we want to pick the smallest encoding possible. - index_types = [ - (2**8, np.uint8), - (2**16, np.uint16), - (2**32, np.uint32), - (2**64, np.uint64), - ] - index_type = None - for bound, curr_index_type in index_types: - if np.all(np.array(src_grid.block_shape) < bound) and np.all( - dst_slice_np[1] < bound - ): - index_type = curr_index_type - break - if index_type is None: - raise Exception("Unable to encode block indices, blocks are too large.") - for grid_entry in src_grid.get_entry_iterator(): - src_slice_np = np.array(src_grid.get_slice_tuples(grid_entry)).T - index_pairs = [] - for i in range(src_index_arr.shape[0]): - src_index = src_index_arr[i] - dst_index = dst_index_arr[i] - if np.all( - (src_slice_np[0] <= src_index) & (src_index < src_slice_np[1]) - ): - index_pair = ( - (dst_index - dst_slice_np[0]).astype(index_type), - (src_index - src_slice_np[0]).astype(index_type), - ) - index_pairs.append(index_pair) - if len(index_pairs) > 0: - src_blocks[grid_entry] = index_pairs - return src_blocks - - def _arbitrary_reshape( - self, arr: BlockArrayBase, shape, block_shape - ) -> BlockArrayBase: - # This is the worst-case scenario. - # Generate index mappings per block, and group source indices to minimize - # RPCs and generation of new objects. 
- km = arr.km - dst_arr = type(arr).empty( - shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km - ) - for dst_grid_entry in dst_arr.grid.get_entry_iterator(): - dst_block: BlockBase = dst_arr.blocks[dst_grid_entry] - dst_slice_selection = dst_arr.grid.get_slice(dst_grid_entry) - dst_index_list = array_utils.slice_sel_to_index_list(dst_slice_selection) - src_index_list = array_utils.translate_index_list( - dst_index_list, shape, arr.shape - ) - src_blocks = self._group_index_lists_by_block( - dst_arr.grid.get_slice_tuples(dst_grid_entry), - arr.grid, - dst_index_list, - src_index_list, - ) - for src_grid_entry in src_blocks: - src_block: BlockBase = arr.blocks[src_grid_entry] - index_pairs = src_blocks[src_grid_entry] - syskwargs = { - "grid_entry": dst_grid_entry, - "grid_shape": dst_arr.grid.grid_shape, - } - dst_block.oid = km.update_block_by_index( - dst_block.oid, src_block.oid, index_pairs, syskwargs=syskwargs - ) - return dst_arr - - def _block_shape_reshape(self, arr, block_shape): - rarr: BlockArrayBase = type(arr).empty( - arr.shape, block_shape, arr.dtype, arr.km - ) - for grid_entry in rarr.grid.get_entry_iterator(): - grid_entry_slice = rarr.grid.get_slice(grid_entry) - # TODO (hme): This could be less costly. - rarr[grid_entry_slice] = arr[grid_entry_slice] - return rarr - - def _strip_ones(self, shape): - return tuple(filter(lambda x: x != 1, shape)) - - def _check_positions_ones(self, shape, block_shape): - # If a position in the shape is 1, then the corresponding - # position in block_shape should also be 1. - for i in range(len(shape)): - if shape[i] == 1: - if shape[i] != block_shape[i]: - return False - return True - - def _is_simple_reshape(self, arr: BlockArrayBase, shape, block_shape): - # Is the reshape a difference of factors of 1? - # Strip out 1s and compare. - # If a position in the shape is 1, then the corresponding - # position in block_shape should also be 1. - - # If source shape and dest shape are the same or source block_shape and dest block_shape - # are same, this is not a simple reshape. - if shape == arr.shape or block_shape == arr.block_shape: - return False - - # Checks if source shape and dest shape are same & source block_shape and dest - # block_shape are same after stripping ones. - if not ( - self._strip_ones(shape) == self._strip_ones(arr.shape) - and self._strip_ones(block_shape) == self._strip_ones(arr.block_shape) - ): - return False - if not self._check_positions_ones(shape, block_shape): - return False - return True - - def _simple_reshape(self, arr, shape, block_shape): - # Reshape the array of blocks only. - # This is only used when the difference in shape are factors of 1s, - # and the ordering of other factors are maintained. - - # Check assumptions. - assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape)) - - # Create new grid, and perform reshape on blocks - # to simplify access to source blocks. 
- grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__) - src_blocks = arr.blocks.reshape(grid.grid_shape) - if arr.is_dense: - rarr = type(arr)(grid, arr.km) - else: - rarr = type(arr)(grid, arr.km, arr.fill_value) - for grid_entry in grid.get_entry_iterator(): - src_block: BlockBase = src_blocks[grid_entry] - dst_block: BlockBase = rarr.blocks[grid_entry] - syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape} - dst_block.oid = arr.km.reshape( - src_block.oid, dst_block.shape, syskwargs=syskwargs - ) - return rarr - - def _validate(self, arr, shape, block_shape): - assert -1 not in shape - assert -1 not in block_shape - assert len(shape) == len(block_shape) - assert np.product(arr.shape) == np.product(shape) - - def __call__(self, arr: BlockArrayBase, shape, block_shape): - self._validate(arr, shape, block_shape) - if arr.shape == shape and arr.block_shape == block_shape: - return arr - elif self._is_simple_reshape(arr, shape, block_shape): - return self._simple_reshape(arr, shape, block_shape) - elif arr.shape == shape and arr.block_shape != block_shape: - return self._block_shape_reshape(arr, block_shape) - elif arr.shape != shape and arr.block_shape == block_shape: - # Just do full reshape for this case as well. - # Though there may be a better solution, we generally expect - # the block shape to change with array shape. - return self._arbitrary_reshape(arr, shape, block_shape) - else: - assert arr.shape != shape and arr.block_shape != block_shape - return self._arbitrary_reshape(arr, shape, block_shape) diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py index 041747b2..db0eb20d 100644 --- a/nums/core/array/blockarray.py +++ b/nums/core/array/blockarray.py @@ -51,6 +51,10 @@ def __init__(self, grid: ArrayGrid, km: KernelManager, blocks: np.ndarray = None km=self.km, ) + @property + def is_dense(self): + return True + @classmethod def empty(cls, shape, block_shape, dtype, km: KernelManager): return BlockArray.create("empty", shape, block_shape, dtype, km) @@ -161,6 +165,26 @@ def to_single_block(self, replicate=False): ) return res + def reshape(self, *shape, **kwargs): + block_shape = kwargs.get("block_shape", None) + if array_utils.is_int(shape): + shape = (shape,) + elif len(shape) == 0: + shape = self.shape + elif isinstance(shape[0], (tuple, list)): + assert len(shape) == 1 + shape = shape[0] + else: + assert all(np.issubdtype(type(n), int) for n in shape) + shape = Reshape.compute_shape(self.shape, shape) + if block_shape is None: + if shape == self.shape: + # This is a noop. + block_shape = self.block_shape + else: + block_shape = self.km.get_block_shape(shape, self.dtype) + return Reshape()(self, shape, block_shape) + def expand_dims(self, axis): """ This function refers to the numpy implementation of expand_dims. @@ -212,30 +236,370 @@ def swapaxes(self, axis1, axis2): rarr_swap = BlockArray(grid_swap, self.km, rarr_src) return rarr_swap - def transpose(self, defer=False, redistribute=False): - """ - Transpose this matrix. Only use defer with arithmetic operations. - Setting redistribute to True may significantly impact performance. - :param defer: When true, the transpose operation will be applied - with the next arithmetic operation. - :param redistribute: If defer is false, setting this to true will - redistribute the data according to the device grid (data placement policy). - This parameter has no effect when defer is true. - :return: The transposed matrix. 
- """ - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = BlockArray(gridT, self.km) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT + def _preprocess_subscript(self, item): + if not isinstance(item, tuple): + ss = (item,) + else: + ss = item + # We need to fetch any block arrays. + tmp = [] + for entry in ss: + if isinstance(entry, BlockArrayBase): + val = entry.get() + else: + val = entry + if isinstance(val, list): + val = np.array(val) + if isinstance(val, np.ndarray): + # If this is a Boolean mask, convert it to integers. + if array_utils.is_bool(val.dtype, type_test=True): + val = np.arange(len(val))[val] + if val.shape == (): + val = val.item() + tmp.append(val) + ss = tuple(tmp) + is_handled_advanced = False + array_encountered = False + axis = None + + # Check if this is a supported advanced indexing operation. + for i, entry in enumerate(ss): + if isinstance(entry, slice) and entry.start is None and entry.stop is None: + continue + elif array_utils.is_int(entry) or array_utils.is_uint(entry): + continue + elif array_utils.is_array_like(entry): + if array_encountered: + raise NotImplementedError( + "Advanced indexing is only supported along a single axis." + ) + is_handled_advanced = True + array_encountered = True + axis = i + if not (np.all(0 <= entry) and np.all(entry < self.shape[axis])): + raise IndexError( + "Advanced indexing array along axis %s is out of bounds." % axis + ) + else: + if array_encountered: + raise NotImplementedError( + "Advanced indexing is only supported " + "with full slices and integers along other axes." + ) + is_handled_advanced = False + break + + return ss, is_handled_advanced, axis + + def __getitem__(self, item): + ss, is_handled_advanced, axis = self._preprocess_subscript(item) + + if is_handled_advanced: + # Treat this as a shuffle. + return self._advanced_single_array_select(ss, axis=axis) + + # This is to deal with circular imports. Little overhead since this happens once per call. + # However, would be better to rearrange modules in the future. + from nums.core.array.view import ArrayView + + av: ArrayView = ArrayView.from_block_array(self) + # TODO (hme): We don't have to create, but do so for now until we need to optimize. + return av[ss].create() + + def _advanced_single_array_select(self, ss: tuple, axis: int = 0): + # Create output array along the axis of the selection operation. + # We don't allocate zeros for output array. Instead, we let the update kernel + # create the initial set of zeros to save some memory. + array = ss[axis] + assert len(array.shape) == 1 + + # TODO: We may encounter block shape incompatability due to this. + block_size = self.block_shape[axis] + self.km.update_block_shape_map(array.shape[0], block_size) + + dst_axis = None + shape = [] + block_shape = [] + for i in range(len(self.shape)): + if i == axis: + dst_axis = len(shape) + shape.append(array.shape[0]) + block_shape.append(block_size) + elif i < len(ss): + if isinstance(ss[i], slice): + shape.append(self.shape[i]) + block_shape.append(self.block_shape[i]) + else: + # It's an index. We drop the indices. 
+ continue + else: + shape.append(self.shape[i]) + block_shape.append(self.block_shape[i]) + + dst_arr = type(self)( + ArrayGrid( + shape=tuple(shape), + block_shape=tuple(block_shape), + dtype=self.dtype.__name__, + ), + km=self.km, + ) + + src_arr = self + np_ss = ss + ss = self.km.put( + ss, + syskwargs={ + "grid_entry": (0,), + "grid_shape": (1,), + }, + ) + for src_grid_entry in src_arr.grid.get_entry_iterator(): + src_coord: tuple = src_arr.grid.get_entry_coordinates(src_grid_entry) + src_block: Block = src_arr.blocks[src_grid_entry] + + # Make sure index values in subscript are within bounds of src_arr. + # We also prepare dst_grid_entry here. + dst_grid_entry_list = [] + skip = False + for curr_axis in range(len(np_ss)): + if curr_axis == axis: + dst_grid_entry_list.append(None) + elif isinstance(np_ss[curr_axis], slice): + dst_grid_entry_list.append(src_grid_entry[curr_axis]) + elif not ( + src_coord[curr_axis] + <= np_ss[curr_axis] + < src_coord[curr_axis] + src_block.shape[curr_axis] + ): + skip = True + break + if skip: + continue + for curr_axis in range(len(np_ss), len(src_grid_entry)): + dst_grid_entry_list.append(src_grid_entry[curr_axis]) + + for j in range(dst_arr.grid.grid_shape[dst_axis]): + dst_grid_entry_list[dst_axis] = j + dst_grid_entry = tuple(dst_grid_entry_list) + dst_block: Block = dst_arr.blocks[dst_grid_entry] + dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) + + if dst_block.oid is None: + dst_arg = (dst_block.shape, dst_block.dtype) + else: + dst_arg = dst_block.oid + dst_block.oid = self.km.advanced_select_block_along_axis( + dst_arg, + src_block.oid, + ss, + dst_axis, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + return dst_arr + + def __setitem__(self, key, value): + value: BlockArrayBase = self.to_block_array(value, self.km) + ss, is_handled_advanced, axis = self._preprocess_subscript(key) + if is_handled_advanced: + return self._advanced_single_array_assign(ss, value, axis) + + # This is to deal with circular imports. Little overhead since this happens once per call. + # However, would be better to rearrange modules in the future. + from nums.core.array.view import ArrayView + + av: ArrayView = ArrayView.from_block_array(self) + av[key] = value + + def _advanced_single_array_assign( + self, + ss: tuple, + value, + axis: int, + ): + array = ss[axis] + assert len(array.shape) == 1 + + # The subscript contains a single array. We therefore know one of two things is true: + # 1. value is the same shape as self along axes != axis. + # 2. value is scalar or 1-dimensional. + # We currently don't support the case where value may broadcasted if it has more dims. + # This should be a straight-forward future task. + value: BlockArrayBase = value + mode = None + if len(value.shape) == 0: + # subscripted value per block will broadcast to other dimensions. + mode = "scalar" + elif len(value.shape) == 1: + # assert len(value.shape) == len(ss) + mode = "single-dim" + # Can broadcast if trailing dim matches. + assert len(ss[axis]) == value.shape[0] + + for i in range(len(self.shape)): + if i == axis: + assert len(ss[i]) == value.shape[0] + elif i < axis: + # Nothing to check here. + # These entries are : or integer. + pass + else: + if i < len(ss): + if not isinstance(ss[i], slice): + # ss[i] is an integer. + continue + # If we're here, then the rest of the subscript operator + # will resolve to :, which is not broadcastable. 
+ raise ValueError( + "Cannot broadcast input array " + "from shape %s into shape %s" + % (value.shape, tuple([value.shape[0]] + list(self.shape[i:]))) + ) + elif len(value.shape) == len(self.shape): + mode = "multi-dim" + new_block_shape = [] + for i in range(len(self.shape)): + if i == axis: + new_block_shape.append(value.block_shape[i]) + elif i < len(ss) and ( + array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) + ): + # These entries are : or integer. + # assert array_utils.is_int(ss[i]) or array_utils.is_uint(ss[i]) + assert value.shape[i] == 1 + new_block_shape.append(1) + else: + assert value.shape[i] == self.shape[i], "Shape mismatch." + new_block_shape.append(self.block_shape[i]) + new_block_shape = tuple(new_block_shape) + if new_block_shape != value.block_shape: + # TODO: This message occurs on X[idx[:n]] = X[idx[n:]] + 0.5, + # even when n is a multiple of block_shape[0]. + warnings.warn( + ("Assigned value block shape %s " % str(value.block_shape)) + + ( + "does not match block shape %s of assignee. " + % str(new_block_shape) + ) + + "Applying reshape to assigned value." + ) + value = value.reshape(block_shape=new_block_shape) + + # Like select, iterate over destination blocks along the axis being updated. + # e.g. if self is 2-dim and axis=0, then fix the row and iterate over the columns. + # If value has the same shape as self, then for each destination block, + # iterate over the blocks in value along axis. + # e.g. if self is 2-dim and axis=0, then for the given column, iterate over the rows + # of value. + # If value is scalar, then attempt to assign it to every destination block. + # If value is 1-dim, the just iterate over the dim and assign accordingly. + + dst_arr = self + src_arr = value + src_grid_shape = src_arr.grid.grid_shape + np_ss = ss + ss = self.km.put( + ss, + syskwargs={ + "grid_entry": (0,), + "grid_shape": (1,), + }, + ) + for dst_grid_entry in dst_arr.grid.get_entry_iterator(): + dst_block: BlockBase = dst_arr.blocks[dst_grid_entry] + dst_coord: tuple = dst_arr.grid.get_entry_coordinates(dst_grid_entry) + + # Make sure index values in subscript are within bounds of dst_arr. + # We don't need to check src_arr: + # 1) The block shapes of dst_arr and src_arr are the same except along axis + # and indices in ss. We are not concerned with axes the indices in ss correspond to, + # because they are of size 1 in src_arr => we only need to check that indices + # fall within bounds of dst_arr. + # 2) For each dst_arr, we test the values + # to assign to dst_arr by traverse the src_arr along axis. + # Thus, size along all other axes are equal or broadcasted. 
+ skip = False + for curr_axis in range(len(np_ss)): + if curr_axis == axis or isinstance(np_ss[curr_axis], slice): + continue + if not ( + dst_coord[curr_axis] + <= np_ss[curr_axis] + < dst_coord[curr_axis] + dst_block.shape[curr_axis] + ): + skip = True + break + if skip: + continue + + if mode == "scalar": + src_block: BlockBase = src_arr.blocks.item() + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_block.grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + elif mode == "single-dim": + for src_grid_entry in src_arr.grid.get_entry_iterator(): + src_block: BlockBase = src_arr.blocks[src_grid_entry] + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + elif mode == "multi-dim": + for j in range(src_grid_shape[axis]): + # Apply sel from each block along axis of src_arr. + # e.g. for 2 dim array, we fix the column blocks + # given by dst_grid_entry, and iterate over the rows. + src_grid_entry = tuple( + list(dst_grid_entry[:axis]) + + [j] + + list(dst_grid_entry[axis + 1 :]) + ) + src_block: BlockBase = src_arr.blocks[src_grid_entry] + src_coord: tuple = src_arr.grid.get_entry_coordinates( + src_grid_entry + ) + dst_block.oid = self.km.advanced_assign_block_along_axis( + dst_block.oid, + src_block.oid, + ss, + axis, + dst_coord, + src_coord, + syskwargs={ + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + }, + ) + return dst_arr @staticmethod def to_block_array(obj, km: KernelManager, block_shape=None): @@ -511,3 +875,197 @@ def __inequality__(self, op, other): op, other_block, args={} ) return result + + +class Reshape: + @staticmethod + def compute_shape(shape, input_shape): + size = np.product(shape) + if -1 in input_shape: + new_shape = [] + other_dim_prod = 1 + negative_one_seen = False + for dim in input_shape: + if dim == -1: + if negative_one_seen: + raise Exception("Only one -1 permitted in reshape.") + negative_one_seen = True + continue + other_dim_prod *= dim + if size % other_dim_prod != 0: + raise Exception("Invalid shape.") + for dim in input_shape: + if dim == -1: + new_shape.append(size // other_dim_prod) + else: + new_shape.append(dim) + else: + new_shape = input_shape + assert size == np.product(new_shape) + return new_shape + + def _group_index_lists_by_block( + self, dst_slice_tuples, src_grid: ArrayGrid, dst_index_list, src_index_list + ): + # TODO(hme): Keep this function here until it's needed for greater support of + # selection/assignment operations. + # Block grid entries needed to write to given dst_slice_selection. + src_blocks = {} + dst_slice_np = np.array(dst_slice_tuples).T + dst_index_arr = np.array(dst_index_list) + src_index_arr = np.array(src_index_list) + # Pick the smallest type to represent indices. + # A set of these indices may be transmitted over the network, + # so we want to pick the smallest encoding possible. 
+ index_types = [ + (2**8, np.uint8), + (2**16, np.uint16), + (2**32, np.uint32), + (2**64, np.uint64), + ] + index_type = None + for bound, curr_index_type in index_types: + if np.all(np.array(src_grid.block_shape) < bound) and np.all( + dst_slice_np[1] < bound + ): + index_type = curr_index_type + break + if index_type is None: + raise Exception("Unable to encode block indices, blocks are too large.") + for grid_entry in src_grid.get_entry_iterator(): + src_slice_np = np.array(src_grid.get_slice_tuples(grid_entry)).T + index_pairs = [] + for i in range(src_index_arr.shape[0]): + src_index = src_index_arr[i] + dst_index = dst_index_arr[i] + if np.all( + (src_slice_np[0] <= src_index) & (src_index < src_slice_np[1]) + ): + index_pair = ( + (dst_index - dst_slice_np[0]).astype(index_type), + (src_index - src_slice_np[0]).astype(index_type), + ) + index_pairs.append(index_pair) + if len(index_pairs) > 0: + src_blocks[grid_entry] = index_pairs + return src_blocks + + def _arbitrary_reshape(self, arr: BlockArray, shape, block_shape) -> BlockArray: + # This is the worst-case scenario. + # Generate index mappings per block, and group source indices to minimize + # RPCs and generation of new objects. + km = arr.km + dst_arr = type(arr).empty( + shape=shape, block_shape=block_shape, dtype=arr.dtype, km=km + ) + for dst_grid_entry in dst_arr.grid.get_entry_iterator(): + dst_block: Block = dst_arr.blocks[dst_grid_entry] + dst_slice_selection = dst_arr.grid.get_slice(dst_grid_entry) + dst_index_list = array_utils.slice_sel_to_index_list(dst_slice_selection) + src_index_list = array_utils.translate_index_list( + dst_index_list, shape, arr.shape + ) + src_blocks = self._group_index_lists_by_block( + dst_arr.grid.get_slice_tuples(dst_grid_entry), + arr.grid, + dst_index_list, + src_index_list, + ) + for src_grid_entry in src_blocks: + src_block: Block = arr.blocks[src_grid_entry] + index_pairs = src_blocks[src_grid_entry] + syskwargs = { + "grid_entry": dst_grid_entry, + "grid_shape": dst_arr.grid.grid_shape, + } + dst_block.oid = km.update_block_by_index( + dst_block.oid, src_block.oid, index_pairs, syskwargs=syskwargs + ) + return dst_arr + + def _block_shape_reshape(self, arr, block_shape): + rarr: BlockArray = type(arr).empty(arr.shape, block_shape, arr.dtype, arr.km) + for grid_entry in rarr.grid.get_entry_iterator(): + grid_entry_slice = rarr.grid.get_slice(grid_entry) + # TODO (hme): This could be less costly. + rarr[grid_entry_slice] = arr[grid_entry_slice] + return rarr + + def _strip_ones(self, shape): + return tuple(filter(lambda x: x != 1, shape)) + + def _check_positions_ones(self, shape, block_shape): + # If a position in the shape is 1, then the corresponding + # position in block_shape should also be 1. + for i in range(len(shape)): + if shape[i] == 1: + if shape[i] != block_shape[i]: + return False + return True + + def _is_simple_reshape(self, arr: BlockArray, shape, block_shape): + # Is the reshape a difference of factors of 1? + # Strip out 1s and compare. + # If a position in the shape is 1, then the corresponding + # position in block_shape should also be 1. + + # If source shape and dest shape are the same or source block_shape and dest block_shape + # are same, this is not a simple reshape. + if shape == arr.shape or block_shape == arr.block_shape: + return False + + # Checks if source shape and dest shape are same & source block_shape and dest + # block_shape are same after stripping ones. 
+ if not ( + self._strip_ones(shape) == self._strip_ones(arr.shape) + and self._strip_ones(block_shape) == self._strip_ones(arr.block_shape) + ): + return False + if not self._check_positions_ones(shape, block_shape): + return False + return True + + def _simple_reshape(self, arr, shape, block_shape): + # Reshape the array of blocks only. + # This is only used when the difference in shape are factors of 1s, + # and the ordering of other factors are maintained. + + # Check assumptions. + assert len(self._strip_ones(arr.shape)) == len(self._strip_ones(shape)) + + # Create new grid, and perform reshape on blocks + # to simplify access to source blocks. + grid = ArrayGrid(shape, block_shape, dtype=arr.dtype.__name__) + src_blocks = arr.blocks.reshape(grid.grid_shape) + rarr = BlockArray(grid, arr.km) + for grid_entry in grid.get_entry_iterator(): + src_block: Block = src_blocks[grid_entry] + dst_block: Block = rarr.blocks[grid_entry] + syskwargs = {"grid_entry": grid_entry, "grid_shape": grid.grid_shape} + dst_block.oid = arr.km.reshape( + src_block.oid, dst_block.shape, syskwargs=syskwargs + ) + return rarr + + def _validate(self, arr, shape, block_shape): + assert -1 not in shape + assert -1 not in block_shape + assert len(shape) == len(block_shape) + assert np.product(arr.shape) == np.product(shape) + + def __call__(self, arr: BlockArray, shape, block_shape): + self._validate(arr, shape, block_shape) + if arr.shape == shape and arr.block_shape == block_shape: + return arr + elif self._is_simple_reshape(arr, shape, block_shape): + return self._simple_reshape(arr, shape, block_shape) + elif arr.shape == shape and arr.block_shape != block_shape: + return self._block_shape_reshape(arr, block_shape) + elif arr.shape != shape and arr.block_shape == block_shape: + # Just do full reshape for this case as well. + # Though there may be a better solution, we generally expect + # the block shape to change with array shape. 
+ return self._arbitrary_reshape(arr, shape, block_shape) + else: + assert arr.shape != shape and arr.block_shape != block_shape + return self._arbitrary_reshape(arr, shape, block_shape) diff --git a/nums/core/array/random.py b/nums/core/array/random.py index fb17e1d8..45196888 100644 --- a/nums/core/array/random.py +++ b/nums/core/array/random.py @@ -224,7 +224,6 @@ def sparse_randint( shape=None, block_shape=None, p=0.01, - fill_value=0, ): if dtype is None: dtype = np.int64 @@ -236,7 +235,6 @@ def sparse_randint( block_shape, dtype, p, - fill_value, ) def sparse_uniform( @@ -247,7 +245,6 @@ def sparse_uniform( block_shape=None, dtype=None, p=0.01, - fill_value=0, ): return self._sparse_sample_basic( "uniform", @@ -256,7 +253,6 @@ def sparse_uniform( block_shape, dtype, p, - fill_value, ) def sparse_normal( @@ -267,7 +263,6 @@ def sparse_normal( block_shape=None, dtype=None, p=0.01, - fill_value=0, ): return self._sparse_sample_basic( "normal", @@ -276,7 +271,6 @@ def sparse_normal( block_shape, dtype, p, - fill_value, ) def _sparse_sample_basic( @@ -287,7 +281,6 @@ def _sparse_sample_basic( block_shape, dtype, p, - fill_value, ) -> SparseBlockArray: if shape is None: assert block_shape is None @@ -300,7 +293,7 @@ def _sparse_sample_basic( assert isinstance(dtype, type) assert "size" not in rfunc_args grid: ArrayGrid = ArrayGrid(shape, block_shape, dtype=dtype.__name__) - sba: SparseBlockArray = SparseBlockArray(grid, self._km, fill_value) + sba: SparseBlockArray = SparseBlockArray(grid, self._km) for grid_entry in sba.grid.get_entry_iterator(): rng_params = list(self._rng.new_block_rng_params()) this_block_shape = grid.get_block_shape(grid_entry) @@ -316,7 +309,6 @@ def _sparse_sample_basic( this_block_shape, dtype, p, - fill_value, syskwargs=syskwargs, ) block._nnz = sba.km.sparse_nnz(block.oid, syskwargs=syskwargs) diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py index 0a51947e..f7dc2e2d 100644 --- a/nums/core/array/sparse.py +++ b/nums/core/array/sparse.py @@ -19,17 +19,19 @@ def __init__( dtype, transposed, km: KernelManager, - fill_value=0, id=None, index_dtype=np.int64, ): super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id) - self.fill_value = fill_value self.index_dtype = index_dtype self.oid = None self._nnz: object = None self._nbytes: object = None + @property + def is_dense(self): + return False + @property def nnz(self): if self._nnz is None: @@ -74,30 +76,6 @@ def copy(self, shallow=True): block.oid = self.oid return block - def transpose(self, defer=False, redistribute=False): - grid_entryT = tuple(reversed(self.grid_entry)) - grid_shapeT = tuple(reversed(self.grid_shape)) - blockT = SparseBlock( - grid_entry=grid_entryT, - grid_shape=grid_shapeT, - shape=tuple(reversed(self.shape)), - dtype=self.dtype, - transposed=not self.transposed, - km=self.km, - ) - blockT.oid = self.oid - if not defer: - blockT.transposed = False - if redistribute: - syskwargs = {"grid_entry": grid_entryT, "grid_shape": grid_shapeT} - else: - syskwargs = { - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - } - blockT.oid = self.km.transpose(self.oid, syskwargs=syskwargs) - return blockT - def map_uop(self, op_name, args=None, kwargs=None, device=None): block = self.copy() block.dtype = array_utils.get_uop_output_type(op_name, self.dtype) @@ -113,27 +91,43 @@ def map_uop(self, op_name, args=None, kwargs=None, device=None): ) block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs) block._nbytes = self.km.sparse_nbytes(block.oid, 
syskwargs=syskwargs) - block.fill_value = np.__getattribute__(op_name)(self.fill_value) return block def block_from_scalar(self, other): assert array_utils.is_scalar(other) - block = SparseBlock( - self.grid_entry, - self.grid_shape, - (1,), - self.dtype, - False, - self.km, - fill_value=other, - ) - block.oid = self.km.sparse_block_from_scalar( - other, - syskwargs={ - "grid_entry": self.grid_entry, - "grid_shape": self.grid_shape, - }, - ) + # Construct sparse block only if value is 0. + if np.isclose(other, 0): + block = SparseBlock( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.sparse_block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) + else: + block = Block( + self.grid_entry, + self.grid_shape, + (1,), + self.dtype, + False, + self.km, + ) + block.oid = self.km.block_from_scalar( + other, + syskwargs={ + "grid_entry": self.grid_entry, + "grid_shape": self.grid_shape, + }, + ) return block @staticmethod @@ -144,9 +138,6 @@ def init_block(op_name, block1, block2, args, device=None): result_shape, dtype, ) = BlockBase.block_meta(op_name, block1, block2, args) - fill_value = array_utils.get_bop_fill_value( - op_name, block1.fill_value, block2.fill_value - ) # TODO: what happens when different index_dtype? block = SparseBlock( grid_entry=result_grid_entry, @@ -154,7 +145,6 @@ def init_block(op_name, block1, block2, args, device=None): shape=result_shape, dtype=dtype, transposed=False, - fill_value=fill_value, km=block1.km, ) block._device = device @@ -174,9 +164,7 @@ def binary_op(op_name, a: BlockBase, b: BlockBase, args: dict, device=None): if not isinstance(a, BlockBase) or not isinstance(b, BlockBase): raise NotImplementedError() - densify = array_utils.get_sparse_bop_return_type( - op_name, a.fill_value, b.fill_value - ) + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) if densify: block = Block.init_block(op_name, a, b, args, device) else: @@ -206,11 +194,7 @@ def binary_op(op_name, a: BlockBase, b: BlockBase, args: dict, device=None): def bop(self, op_name, other, args: dict, device=None): return self.binary_op(op_name, self, other, args, device) - # TODO: densify when fill_value != 0 def tensordot(self, other, axes): - assert self.fill_value == 0 - if not other.is_dense: - assert other.fill_value == 0 return self.binary_op("tensordot", self, other, args={"axes": axes}) @@ -219,7 +203,6 @@ def __init__( self, grid: ArrayGrid, km: KernelManager, - fill_value=0, blocks: np.ndarray = None, ): if blocks is not None: @@ -237,12 +220,14 @@ def __init__( self.dtype, False, self.km, - fill_value, ) - self.fill_value = fill_value self._nnz = -1 self._nbytes = -1 + @property + def is_dense(self): + return False + @property def nnz(self): return self._get_nnz() @@ -270,17 +255,13 @@ def __repr__(self): @classmethod def empty(cls, shape, block_shape, dtype, km: KernelManager): - return SparseBlockArray.create( - "empty", shape, block_shape, dtype, km, fill_value=0 - ) + return SparseBlockArray.create("empty", shape, block_shape, dtype, km) @classmethod - def create( - cls, create_op_name, shape, block_shape, dtype, km: KernelManager, fill_value=0 - ): + def create(cls, create_op_name, shape, block_shape, dtype, km: KernelManager): grid = ArrayGrid(shape=shape, block_shape=block_shape, dtype=dtype.__name__) grid_meta = grid.to_meta() - arr = SparseBlockArray(grid, km, fill_value) + arr = SparseBlockArray(grid, km) for grid_entry in 
grid.get_entry_iterator(): arr.blocks[grid_entry].oid = km.new_sparse_block( create_op_name, @@ -291,10 +272,10 @@ def create( return arr @classmethod - def from_np(cls, arr, block_shape, copy, km, fill_value=0): + def from_np(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) @@ -302,7 +283,7 @@ def from_np(cls, arr, block_shape, copy, km, fill_value=0): if copy: block = np.copy(block) # TODO: generalize for different kernels - block = sparse.COO.from_numpy(block, fill_value) + block = sparse.COO.from_numpy(block, fill_value=0) rarr.blocks[grid_entry].oid = km.put( block, syskwargs={"grid_entry": grid_entry, "grid_shape": grid.grid_shape}, @@ -333,12 +314,10 @@ def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): def from_scalar(cls, val, km): if not array_utils.is_scalar(val): raise ValueError("%s is not a scalar." % val) - return SparseBlockArray.from_np( - np.array(val), (), copy=False, km=km, fill_value=val - ) + return SparseBlockArray.from_np(np.array(val), (), copy=False, km=km) @classmethod - def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): + def from_blocks(cls, arr: np.ndarray, result_shape, km): sample_idx = tuple(0 for _ in arr.shape) if isinstance(arr, SparseBlock): sample_block = arr @@ -353,7 +332,7 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): shape=result_shape, block_shape=result_block_shape, dtype=result_dtype_str ) assert arr.shape == result_grid.grid_shape - result = SparseBlockArray(result_grid, km, fill_value) + result = SparseBlockArray(result_grid, km) for grid_entry in result_grid.get_entry_iterator(): if isinstance(arr, SparseBlock): block: SparseBlock = arr @@ -363,12 +342,12 @@ def from_blocks(cls, arr: np.ndarray, result_shape, km, fill_value): return result @classmethod - def from_ba(cls, ba: BlockArrayBase, fill_value=0): + def from_ba(cls, ba: BlockArrayBase): assert ( ba.shape != () ), "from_ba does not support scalar BlockArray. Use from_scalar." 
grid = ArrayGrid(ba.shape, ba.block_shape, ba.dtype.__name__) - sba = SparseBlockArray(grid, ba.km, fill_value) + sba = SparseBlockArray(grid, ba.km) for grid_entry in grid.get_entry_iterator(): block: Block = ba.blocks[grid_entry] sblock: SparseBlock = sba.blocks[grid_entry] @@ -378,12 +357,10 @@ def from_ba(cls, ba: BlockArrayBase, fill_value=0): } sblock.oid = sba.km.dense_to_sparse( block.oid, - fill_value, syskwargs=syskwargs, ) sblock._nnz = sba.km.sparse_nnz(sblock.oid, syskwargs=syskwargs) sblock._nbytes = sba.km.sparse_nbytes(sblock.oid, syskwargs=syskwargs) - sba.fill_value = fill_value return sba def to_ba(self): @@ -404,7 +381,7 @@ def todense(self) -> BlockArray: def copy(self): grid_copy = self.grid.from_meta(self.grid.to_meta()) - rarr_copy = SparseBlockArray(grid_copy, self.km, self.fill_value) + rarr_copy = SparseBlockArray(grid_copy, self.km) for grid_entry in grid_copy.get_entry_iterator(): rarr_copy.blocks[grid_entry] = self.blocks[grid_entry].copy() return rarr_copy @@ -416,21 +393,6 @@ def astype(self, dtype): result.blocks[grid_entry] = self.blocks[grid_entry].astype(dtype) return result - def transpose(self, defer=False, redistribute=False): - if defer and redistribute: - warnings.warn("defer is True, redistribute=True will be ignored.") - metaT = self.grid.to_meta() - metaT["shape"] = tuple(reversed(metaT["shape"])) - metaT["block_shape"] = tuple(reversed(metaT["block_shape"])) - gridT = ArrayGrid.from_meta(metaT) - rarrT = SparseBlockArray(gridT, self.km, self.fill_value) - rarrT.blocks = np.copy(self.blocks.T) - for grid_entry in rarrT.grid.get_entry_iterator(): - rarrT.blocks[grid_entry] = rarrT.blocks[grid_entry].transpose( - defer, redistribute - ) - return rarrT - @staticmethod def to_block_array(obj, km: KernelManager, block_shape=None): if isinstance(obj, (BlockArray, SparseBlockArray)): @@ -463,8 +425,6 @@ def ufunc(self, op_name): result = self.copy() for grid_entry in self.grid.get_entry_iterator(): result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name) - func = np.__getattribute__(op_name) - result.fill_value = func(self.fill_value) result._nnz = -1 result._nbytes = -1 return result @@ -482,11 +442,7 @@ def elementwise(op_name, a, b): else: raise NotImplementedError() - densify = array_utils.get_sparse_bop_return_type( - op_name, - a.fill_value, - b.fill_value, - ) + densify = array_utils.get_sparse_bop_densify(op_name, a.is_dense, b.is_dense) if a.shape == b.shape and a.block_shape == b.block_shape: return SparseBlockArray._fast_elementwise(op_name, a, b, densify) else: @@ -498,13 +454,9 @@ def elementwise(op_name, a, b): km=a.km, ) else: - fill_value = array_utils.get_bop_fill_value( - op_name, a.fill_value, b.fill_value - ) result = SparseBlockArray.from_blocks( blocks_op(b.blocks), result_shape=None, - fill_value=fill_value, km=a.km, ) return result @@ -515,12 +467,8 @@ def _fast_elementwise(op_name, a, b, densify): dtype = array_utils.get_bop_output_type(op_name, a.dtype, b.dtype) if densify: block_type = Block - fill_value = None else: block_type = SparseBlock - fill_value = array_utils.get_bop_fill_value( - op_name, a.fill_value, b.fill_value - ) blocks = np.empty(shape=a.grid_shape, dtype=block_type) for grid_entry in a.grid.get_entry_iterator(): a_block: BlockBase = a.blocks[grid_entry] @@ -550,7 +498,7 @@ def _fast_elementwise(op_name, a, b, densify): if densify: return BlockArray(grid, a.km, blocks=blocks) else: - return SparseBlockArray(grid, a.km, fill_value, blocks=blocks) + return SparseBlockArray(grid, a.km, blocks=blocks) 
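The elementwise and tensordot paths above all key off a single densify decision. Below is a minimal standalone sketch of that decision, assuming PyData/Sparse and mirroring the `get_sparse_bop_densify` helper this patch adds to `nums/core/array/utils.py`; the name `probe_densify` is illustrative only, and op names are passed here as plain NumPy ufunc names, without the aliasing the real helper applies.

```python
import numpy as np
import sparse


def probe_densify(ufunc_name: str, a_dense: bool, b_dense: bool) -> bool:
    """Return True if a binary op on operands with these density flags
    should produce a dense result, judged by probing PyData/Sparse."""

    def sample(is_dense: bool):
        # Tiny 2x2 probe: a plain ndarray if dense, else a COO with fill_value=0.
        s = np.eye(2)
        return s if is_dense else sparse.COO.from_numpy(s, fill_value=0)

    sa, sb = sample(a_dense), sample(b_dense)
    if ufunc_name == "tensordot":
        result = sparse.tensordot(sa, sb)
    else:
        result = getattr(np, ufunc_name)(sa, sb)
    # Densify only if PyData/Sparse itself did not keep the result sparse.
    return not isinstance(result, sparse.SparseArray)


# Sparse * sparse stays sparse, so the sparse code path is kept.
print(probe_densify("multiply", a_dense=False, b_dense=False))  # expected: False
# Sparse tensordot against a dense operand comes back dense, so densify.
print(probe_densify("tensordot", a_dense=False, b_dense=True))  # expected: True
```

Probing a tiny 2x2 operand pair keeps the decision in sync with whatever promotion rules the installed PyData/Sparse version implements, rather than hard-coding a table of ops per density combination.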
################# # Linear Algebra @@ -572,20 +520,11 @@ def tensordot(a, b, axes=2): else: raise NotImplementedError() - # PyData/Sparse only works with fill_value == 0 - # TODO: densify when fill_value != 0 - if not (a.is_dense or b.is_dense): - assert ( - a.fill_value == 0 and b.fill_value == 0 - ), "Sparse-sparse tensordot with non-zero fill value is not supported." - if array_utils.np_tensordot_param_test(a.shape, a.ndim, b.shape, b.ndim, axes): raise ValueError("shape-mismatch for sum") - densify = array_utils.get_sparse_bop_return_type( - "tensordot", - a.fill_value, - b.fill_value, + densify = array_utils.get_sparse_bop_densify( + "tensordot", a.is_dense, b.is_dense ) if axes > 0: @@ -614,7 +553,7 @@ def tensordot(a, b, axes=2): if densify: result = BlockArray(result_grid, a.km) else: - result = SparseBlockArray(result_grid, a.km, a.fill_value) + result = SparseBlockArray(result_grid, a.km) a_dims = list(itertools.product(*map(range, a_axes))) b_dims = list(itertools.product(*map(range, b_axes))) sum_dims = list(itertools.product(*map(range, a_sum_axes))) @@ -663,13 +602,12 @@ def sdtp(self, *block_arrays: List[BlockArray]): """ Perform a sampled tensor product among an arbitrary number of block arrays. """ - assert np.allclose(self.fill_value, 0) for i, ba in enumerate(block_arrays): assert len(ba.shape) == 1 assert ba.shape[0] == self.shape[i] assert ba.block_shape[0] == self.block_shape[i] # Sparsity of result is same as self. - result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value) + result: SparseBlockArray = SparseBlockArray(self.grid, self.km) for grid_entry in self.grid.get_entry_iterator(): dense_oids = [ ba.blocks[grid_entry[i]].oid for i, ba in enumerate(block_arrays) @@ -690,8 +628,6 @@ def sdtd(self, x: BlockArray, y: BlockArray, axes: int): is being performed are not partitioned. The last constraint can be eliminated if we add a sampled element-wise kernel. 
""" - assert np.allclose(self.fill_value, 0) - if array_utils.np_tensordot_param_test(x.shape, x.ndim, y.shape, y.ndim, axes): raise ValueError("shape-mismatch for sum") @@ -716,7 +652,7 @@ def sdtd(self, x: BlockArray, y: BlockArray, axes: int): assert result_grid.grid_shape == tuple(x_axes + y_axes) assert result_grid.grid_shape == self.grid_shape assert result_grid.block_shape == self.block_shape - result: SparseBlockArray = SparseBlockArray(self.grid, self.km, self.fill_value) + result: SparseBlockArray = SparseBlockArray(self.grid, self.km) x_dims = list(itertools.product(*map(range, x_axes))) y_dims = list(itertools.product(*map(range, y_axes))) sum_dims = tuple([0] * axes) @@ -752,10 +688,7 @@ def __inequality__(self, op_name, other): ) dtype = bool.__name__ grid = ArrayGrid(shape, block_shape, dtype) - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - result = SparseBlockArray(grid, self.km, fill_value) + result = SparseBlockArray(grid, self.km) for grid_entry in result.grid.get_entry_iterator(): other_block: Block = other.blocks.item() result.blocks[grid_entry] = self.blocks[grid_entry].bop( diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py index 872f5d9d..ed01190e 100644 --- a/nums/core/array/utils.py +++ b/nums/core/array/utils.py @@ -453,15 +453,15 @@ def normalize_axis_index(axis, ndim): return axis % ndim -def get_sparse_bop_return_type(op_name, a_fv, b_fv): - def sample_array(fv): +def get_sparse_bop_densify(op_name, a_dense, b_dense): + def sample_array(is_dense): s = np.eye(2) - if fv is not None: - return sparse.COO.from_numpy(s, fill_value=fv) + if not is_dense: + return sparse.COO.from_numpy(s, fill_value=0) return s - sa = sample_array(a_fv) - sb = sample_array(b_fv) + sa = sample_array(a_dense) + sb = sample_array(b_dense) if op_name == "tensordot": result = sparse.tensordot(sa, sb) else: @@ -474,18 +474,3 @@ def sample_array(fv): if isinstance(result, sparse.SparseArray): return False return True - - -def get_bop_fill_value(op_name, a_fv, b_fv): - if a_fv is None and b_fv is None: - return None - if a_fv is None: - return b_fv - if b_fv is None: - return a_fv - op_name = np_ufunc_map.get(op_name, op_name) - try: - func = np.__getattribute__(op_name) - except Exception as _: - func = scipy.special.__getattribute__(op_name) - return func(a_fv, b_fv) diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index 76631799..f1f16218 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -167,7 +167,7 @@ def any(self, arr, syskwargs: Dict): # Sparse - def dense_to_sparse(self, arr, fill_value, syskwargs: Dict): + def dense_to_sparse(self, arr, syskwargs: Dict): raise NotImplementedError() def sparse_to_dense(self, arr, syskwargs: Dict): @@ -190,7 +190,6 @@ def sparse_random_block( shape, dtype, p, - fill_value, syskwargs: Dict, ): raise NotImplementedError() diff --git a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py index f5fee593..20523e0a 100644 --- a/nums/core/kernel/numpy_kernel.py +++ b/nums/core/kernel/numpy_kernel.py @@ -527,9 +527,8 @@ def any(self, arr): # Sparse - def dense_to_sparse(self, arr, fill_value): - result = COO.from_numpy(arr, fill_value=fill_value) - return result + def dense_to_sparse(self, arr): + return COO.from_numpy(arr, fill_value=0) def sparse_to_dense(self, arr): return arr.todense() @@ -558,7 +557,6 @@ def sparse_random_block( shape, dtype, p, - fill_value, ): rng = 
block_rng_legacy(*rng_params) rfunc = rng.__getattribute__(rfunc_name) @@ -570,7 +568,7 @@ def sparse_random_block( random_state=rng, data_rvs=lambda s: rfunc(**rfunc_args, size=s), format="coo", - fill_value=fill_value, + fill_value=0, ) return result.astype(dtype) @@ -615,14 +613,14 @@ def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify): def sparse_block_from_scalar(self, x): assert np.isscalar(x) - return sparse.COO.from_numpy(np.array(x), fill_value=x) + return sparse.COO.from_numpy(np.array(x), fill_value=0) def sdtp(self, s: sparse.COO, *dense_arrays): data = np.copy(s.data) for position in range(s.nnz): for axis in range(len(s.shape)): data[position] *= dense_arrays[axis][s.coords[axis][position]] - return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value) + return sparse.COO(s.coords, data, shape=s.shape, fill_value=0) def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int): # Check some things. @@ -647,4 +645,4 @@ def sdtd(self, s: sparse.COO, x: np.ndarray, y: np.ndarray, axes: int): sx = x[tuple(x_coords)] sy = y[tuple(y_coords)] data[position] *= np.tensordot(sx, sy, axes=axes) - return sparse.COO(s.coords, data, shape=s.shape, fill_value=s.fill_value) + return sparse.COO(s.coords, data, shape=s.shape, fill_value=0) diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 361e740f..2440e622 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -1215,7 +1215,7 @@ def _collapse(self, device: Device): leaf.block = block # Assume dense for simplicity. leaf.tree_node_size = TreeNodeSize( - self.shape(), np.prod(self.shape()), block.dtype, block.fill_value + self.shape(), np.prod(self.shape()), block.dtype, block.is_dense ) leaf.copy_on_op = self.copy_on_op return leaf, block diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index f4a2300b..4755a574 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -70,7 +70,7 @@ def graphs_from_ba( block.shape, nnz, block.dtype, - block.fill_value, + block.is_dense, ) leaf.copy_on_op = copy_on_op graphs[grid_entry] = leaf @@ -110,9 +110,7 @@ def to_ba(self): ), "Cannot convert unsolved GraphArray to BlockArray." if sample_node.block.is_dense: return BlockArray(self.grid.copy(), self.km, self.to_blocks()) - return SparseBlockArray( - self.grid.copy(), self.km, sample_node.block.fill_value, self.to_blocks() - ) + return SparseBlockArray(self.grid.copy(), self.km, self.to_blocks()) def __init__( self, diff --git a/nums/experimental/optimizer/size.py b/nums/experimental/optimizer/size.py index e813259b..089d97d4 100644 --- a/nums/experimental/optimizer/size.py +++ b/nums/experimental/optimizer/size.py @@ -10,11 +10,11 @@ class TreeNodeSize: Sparse binary operations use B(n, p) to estimate nnz as needed. 
""" - def __init__(self, shape, nnz, dtype, fill_value=None, index_dtype=np.int64): + def __init__(self, shape, nnz, dtype, is_dense, index_dtype=np.int64): self.shape = shape self.nnz = nnz self.dtype = dtype - self.fill_value = fill_value + self.is_dense = is_dense self.index_dtype = index_dtype if self.is_dense: assert nnz == np.prod(shape) @@ -44,14 +44,10 @@ def copy(self): self.shape, self.nnz, self.dtype, - self.fill_value, + self.is_dense, self.index_dtype, ) - @property - def is_dense(self): - return self.fill_value is None - @property def nbytes(self): if self.is_dense: @@ -69,17 +65,13 @@ def uop(self, op_name): shape=tuple(reversed(self.shape)), nnz=self.nnz, dtype=self.dtype, - fill_value=self.fill_value, + is_dense=self.is_dense, ) - if self.is_dense: - fill_value = None - else: - fill_value = np.__getattribute__(op_name)(self.fill_value) return TreeNodeSize( shape=self.shape, nnz=self.nnz, dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=fill_value, + is_dense=self.is_dense, ) def reduce_axis(self, op_name, axis, keepdims, transposed): @@ -97,7 +89,7 @@ def reduce_axis(self, op_name, axis, keepdims, transposed): shape=tuple(shape), nnz=np.prod(shape), dtype=array_utils.get_uop_output_type(op_name, self.dtype), - fill_value=None, + is_dense=True, ) def _nnz_disjunction(self, other, shape): @@ -118,7 +110,6 @@ def _nnz_conjunction(self, other, shape): def _nnz_selection(self, other, shape): # If self element is nonzero, result is nonzero. - assert self.fill_value == 0 n1 = np.prod(self.shape) p1 = self.nnz / n1 return int(p1 * np.prod(shape)) @@ -126,17 +117,17 @@ def _nnz_selection(self, other, shape): def add(self, other): shape = array_utils.broadcast_shape(self.shape, other.shape) dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype) - is_dense = array_utils.get_sparse_bop_return_type( + is_dense = array_utils.get_sparse_bop_densify( "add", - self.fill_value, - other.fill_value, + self.is_dense, + other.is_dense, ) if is_dense: return TreeNodeSize( shape=shape, nnz=np.prod(shape), dtype=dtype, - fill_value=None, + is_dense=True, ) if self.is_dense or other.is_dense: raise ValueError( @@ -146,9 +137,7 @@ def add(self, other): shape=shape, nnz=self._nnz_disjunction(other, shape), dtype=dtype, - fill_value=array_utils.get_bop_fill_value( - "add", self.fill_value, other.fill_value - ), + is_dense=False, ) __add__ = add @@ -156,30 +145,23 @@ def add(self, other): def mul(self, other): shape = array_utils.broadcast_shape(self.shape, other.shape) dtype = array_utils.get_bop_output_type("mul", self.dtype, other.dtype) - is_dense = array_utils.get_sparse_bop_return_type( + is_dense = array_utils.get_sparse_bop_densify( "mul", - self.fill_value, - other.fill_value, + self.is_dense, + other.is_dense, ) if is_dense: return TreeNodeSize( shape=shape, nnz=np.prod(shape), dtype=dtype, - fill_value=None, + is_dense=True, ) if not self.is_dense and not other.is_dense: - if self.fill_value == 0 and other.fill_value == 0: - nnz = self._nnz_conjunction(other, shape) - elif self.fill_value == 0: - nnz = self._nnz_selection(other, shape) - elif other.fill_value == 0: - nnz = other._nnz_selection(self, shape) - else: - nnz = self._nnz_disjunction(other, shape) - elif self.fill_value == 0 and other.is_dense: + nnz = self._nnz_conjunction(other, shape) + elif not self.is_dense and other.is_dense: nnz = self._nnz_selection(other, shape) - elif self.is_dense and other.fill_value == 0: + elif self.is_dense and not other.is_dense: nnz = 
other._nnz_selection(self, shape) else: raise ValueError( @@ -189,9 +171,7 @@ def mul(self, other): shape=shape, nnz=nnz, dtype=dtype, - fill_value=array_utils.get_bop_fill_value( - "mul", self.fill_value, other.fill_value - ), + is_dense=False, ) __mul__ = mul @@ -199,25 +179,25 @@ def mul(self, other): def truediv(self, other): shape = array_utils.broadcast_shape(self.shape, other.shape) dtype = array_utils.get_bop_output_type("truediv", self.dtype, other.dtype) - is_dense = array_utils.get_sparse_bop_return_type( + is_dense = array_utils.get_sparse_bop_densify( "truediv", - self.fill_value, - other.fill_value, + self.is_dense, + other.is_dense, ) if is_dense: return TreeNodeSize( shape=shape, nnz=np.prod(shape), dtype=dtype, - fill_value=None, + is_dense=True, ) if self.is_dense or other.is_dense: raise ValueError( "TreeNodeSize.__add__ is inconsistent with sparse bop rules." ) - if other.fill_value == 0: # nan + if not other.is_dense: # nan nnz = other._nnz_selection(self, shape) - elif self.fill_value == 0: + elif not self.is_dense: nnz = self._nnz_selection(other, shape) else: nnz = self._nnz_disjunction(other, shape) @@ -225,9 +205,7 @@ def truediv(self, other): shape=shape, nnz=nnz, dtype=dtype, - fill_value=array_utils.get_bop_fill_value( - "truediv", self.fill_value, other.fill_value - ), + is_dense=False, ) __truediv__ = truediv @@ -248,22 +226,17 @@ def tensordot(self, other, axes): shape = tuple(self.shape + other.shape) sum_shape = (1,) dtype = array_utils.get_bop_output_type("tensordot", self.dtype, other.dtype) - is_dense = array_utils.get_sparse_bop_return_type( - "tensordot", self.fill_value, other.fill_value + is_dense = array_utils.get_sparse_bop_densify( + "tensordot", self.is_dense, other.is_dense ) if is_dense: return TreeNodeSize( shape=shape, nnz=np.prod(shape), dtype=dtype, - fill_value=None, + is_dense=True, ) - if ( - self.is_dense - or other.is_dense - or self.fill_value != 0 - or other.fill_value != 0 - ): + if self.is_dense or other.is_dense: raise ValueError( "TreeNodeSize.tensordot is inconsistent with sparse bop rules." 
) @@ -277,20 +250,17 @@ def tensordot(self, other, axes): shape=shape, nnz=int((1 - (1 - p1 * p2) ** k) * m), dtype=dtype, - fill_value=0, + is_dense=False, ) def inequality(self, op_name, other): assert other.shape == () dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) - fill_value = array_utils.get_bop_fill_value( - op_name, self.fill_value, other.fill_value - ) - return TreeNode( + return TreeNodeSize( shape=self.shape, nnz=self.nnz, dtype=dtype, - fill_value=fill_value, + is_dense=self.is_dense, ) def bop_dense(self, other): @@ -301,7 +271,7 @@ def bop_dense(self, other): shape=shape, nnz=np.prod(shape), dtype=dtype, - fill_value=None, + is_dense=False, ) def bop(self, op_name, other, **kwargs): diff --git a/nums/numpy/random.py b/nums/numpy/random.py index 4f031df5..edd7d6a4 100644 --- a/nums/numpy/random.py +++ b/nums/numpy/random.py @@ -79,25 +79,20 @@ def permutation(self, x): arr_perm = self.rs().permutation(shape[0], block_shape[0]).get() return x[arr_perm] - def sparse_randn(self, density, fill_value, *shape): + def sparse_randn(self, density, *shape): shape, block_shape = self._get_shapes(shape, _np.float64) return self.rs().sparse_normal( shape=shape, block_shape=block_shape, p=density, - fill_value=fill_value, ) - def sparse_randint( - self, low, high=None, size=None, dtype=int, density=0.01, fill_value=0 - ): + def sparse_randint(self, low, high=None, size=None, dtype=int, density=0.01): if high is None: high = low low = 0 shape, block_shape = self._get_shapes(size, dtype) - return self.rs().sparse_randint( - low, high, dtype, shape, block_shape, density, fill_value - ) + return self.rs().sparse_randint(low, high, dtype, shape, block_shape, density) # Default imp. diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 302bd082..4563771e 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -11,7 +11,7 @@ def test_sparse_init(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x_ba = app_inst.array(x1, block_shape=(2, 2)) - x_sba = SparseBlockArray.from_ba(x_ba, fill_value=0) + x_sba = SparseBlockArray.from_ba(x_ba) assert x_sba.nnz == 8 assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() @@ -40,7 +40,6 @@ def test_sparse_random(app_inst: ArrayApplication): shape=(100, 50), block_shape=(50, 50), p=0.1, - fill_value=0, ) x_ba = x_sba.to_ba() x_np = x_ba.get() @@ -50,12 +49,11 @@ def test_sparse_random(app_inst: ArrayApplication): def test_sparse_uop(app_inst: ArrayApplication): x = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x_sp = sparse.COO.from_numpy(x, fill_value=1) + x_sp = sparse.COO.from_numpy(x, fill_value=0) x_sp = sparse.elemwise(np.negative, x_sp) x_ba = app_inst.array(x, block_shape=(2, 2)) - x_sba = SparseBlockArray.from_ba(x_ba, fill_value=1) + x_sba = SparseBlockArray.from_ba(x_ba) y_sba = -x_sba - assert y_sba.fill_value == x_sp.fill_value # -1 assert y_sba.nnz == x_sp.nnz # 12 y_ba = y_sba.to_ba() assert np.array_equal(np.negative(x), y_ba.get()) @@ -64,15 +62,14 @@ def test_sparse_uop(app_inst: ArrayApplication): def test_sparse_add(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = x1 * 2 - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba 
= app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp + x2_sp y_sba = x1_sba + x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 + x2, y_ba.get()) @@ -88,13 +85,11 @@ def test_sparse_add(app_inst: ArrayApplication): # Test sparse-scalar. y_sp = x1_sp - 1 # __sub__ y_sba = x1_sba - 1 - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 - 1, y_ba.get()) y_sp = 1 - x1_sp # __rsub__ y_sba = 1 - x1_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(1 - x1, y_ba.get()) @@ -103,15 +98,14 @@ def test_sparse_add(app_inst: ArrayApplication): def test_sparse_mul(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) x2 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) - x2_sp = sparse.COO.from_numpy(x2, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) + x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = x1_sp * x2_sp y_sba = x1_sba * x2_sba - assert y_sba.fill_value == y_sp.fill_value # 4 assert y_sba.nnz == y_sp.nnz # 16 y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -119,10 +113,10 @@ def test_sparse_mul(app_inst: ArrayApplication): # Test sparse-dense. rs: NumsRandomState = app_inst.random_state(1337) x1_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(100, 50), block_shape=(5, 5), p=0.1 ) x2_sba = rs.sparse_randint( - 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1, fill_value=0 + 1, high=5, dtype=int, shape=(1, 50), block_shape=(5, 5), p=0.1 ) x1_ba = x1_sba.to_ba() x2_ba = x2_sba.to_ba() @@ -132,7 +126,6 @@ def test_sparse_mul(app_inst: ArrayApplication): x2_sp = sparse.COO.from_numpy(x2, fill_value=0) y_sp = x1_sp * x2 y_sba = x2_ba * x1_sba # __rmul__ - assert y_sba.fill_value == y_sp.fill_value assert y_sba.nnz == y_sp.nnz y_ba = y_sba.to_ba() assert np.array_equal(x1 * x2, y_ba.get()) @@ -140,12 +133,11 @@ def test_sparse_mul(app_inst: ArrayApplication): def test_neq(app_inst: ArrayApplication): x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 0, 0], [2, 2, 0, 0]]) - x1_sp = sparse.COO.from_numpy(x1, fill_value=2) + x1_sp = sparse.COO.from_numpy(x1, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) + x1_sba = SparseBlockArray.from_ba(x1_ba) y_sp = x1_sp > 0 y_sba = x1_sba > 0 - assert y_sba.fill_value == y_sp.fill_value # True assert y_sba.nnz == y_sp.nnz # 8 (nnz changes!) 
y_ba = y_sba.to_ba() assert np.array_equal(x1 > 0, y_ba.get()) @@ -158,11 +150,10 @@ def test_tensordot(app_inst: ArrayApplication): x2_sp = sparse.COO.from_numpy(x2, fill_value=0) x1_ba = app_inst.array(x1, block_shape=(2, 2)) x2_ba = app_inst.array(x2, block_shape=(2, 2)) - x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=0) - x2_sba = SparseBlockArray.from_ba(x2_ba, fill_value=0) + x1_sba = SparseBlockArray.from_ba(x1_ba) + x2_sba = SparseBlockArray.from_ba(x2_ba) y_sp = sparse.tensordot(x1_sp, x2_sp, axes=1) y_sba = x1_sba @ x2_sba - assert y_sba.fill_value == y_sp.fill_value # 0 assert y_sba.nnz == y_sp.nnz # y_ba = y_sba.to_ba() assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) @@ -171,17 +162,6 @@ def test_tensordot(app_inst: ArrayApplication): assert np.array_equal(np.tensordot(x1, x2, axes=1), y_ba.get()) -# def test_getitem(app_inst: ArrayApplication): -# x1 = np.array([[0, 0, 1, 1], [0, 0, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]) -# x1_sp = sparse.COO.from_numpy(x1, fill_value=2) -# x1_ba = app_inst.array(x1, block_shape=(2, 2)) -# x1_sba = SparseBlockArray.from_ba(x1_ba, fill_value=2) -# y_sp = x1_sp[0:2, 1:3] -# y_sba = x1_sba[0:2, 1:3] -# assert y_sba.fill_value == y_sp.fill_value # 2 -# assert np.array_equal(y_sp.todense(), y_sba.to_ba().get()) - - def test_sdtp(app_inst: ArrayApplication): shape = 50, 50, 50 block_shape = 10, 10, 10 diff --git a/tests/experimental/optimizer/test_size.py b/tests/experimental/optimizer/test_size.py index a5606c6b..340ae163 100644 --- a/tests/experimental/optimizer/test_size.py +++ b/tests/experimental/optimizer/test_size.py @@ -11,7 +11,7 @@ def test_nbytes(): (10, 10), 10, np.int64, - 0, + False, ) assert x1_sp.nbytes == x1_ts.nbytes @@ -21,11 +21,11 @@ def test_uop(): (10, 10), 10, np.int64, - 1, + False, ) y_ts = x1_ts.uop("negative") assert np.allclose(y_ts.nnz, x1_ts.nnz) - assert y_ts.fill_value == -1 + assert not y_ts.is_dense def test_add(): @@ -33,17 +33,17 @@ def test_add(): (10, 10), 10, np.int64, - 1, + False, ) x2_ts = TreeNodeSize( (10, 10), 20, np.int64, - 2, + False, ) y_ts = x1_ts + x2_ts assert np.allclose(y_ts.nnz, int((1 - 0.9 * 0.8) * 100)) - assert y_ts.fill_value == 3 + assert not y_ts.is_dense def test_mul(): @@ -51,13 +51,13 @@ def test_mul(): (10, 10), 10, np.int64, - 0, + False, ) x2_ts = TreeNodeSize( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 0.2 * 100)) @@ -66,7 +66,7 @@ def test_mul(): (10, 1), 10, np.int64, - 1, + False, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) @@ -75,10 +75,11 @@ def test_mul(): (10, 1), 10, np.int64, - None, + True, ) y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) + assert not y_ts.is_dense def test_tensordot(): @@ -86,13 +87,13 @@ def test_tensordot(): (10, 10), 10, np.int64, - 0, + False, ) x2_ts = TreeNodeSize( (10, 10), 20, np.int64, - 0, + False, ) y_ts = x1_ts.tensordot(x2_ts, axes=1) assert np.allclose(y_ts.nnz, int((1 - (1 - 0.1 * 0.2) ** 10) * 100)) From 865e16d966bf864a643c3f319638717ffb5cbad9 Mon Sep 17 00:00:00 2001 From: Daniel Zou Date: Sun, 4 Sep 2022 10:22:11 -0700 Subject: [PATCH 4/5] Integrate sparsity into GraphArray and fusion --- CONTRIBUTING_MAC.md | 17 ++ nums/core/array/base.py | 79 +----- nums/core/array/blockarray.py | 63 ++++- nums/core/array/random.py | 1 + nums/core/array/sparse.py | 208 +++++++++++++++- nums/core/array/utils.py | 40 +-- nums/core/array/view.py | 2 +- nums/core/kernel/kernel_interface.py | 10 +- nums/core/kernel/numpy_kernel.py | 36 
++- nums/experimental/optimizer/fusion.py | 8 +- nums/experimental/optimizer/fusion_utils.py | 18 ++ nums/experimental/optimizer/graph.py | 229 +++++++++++++----- nums/experimental/optimizer/grapharray.py | 41 +++- .../optimizer/{size.py => node_meta.py} | 151 ++++++++---- nums/experimental/optimizer/reduction_ops.py | 60 +++-- tests/core/array/test_sparse.py | 18 +- tests/experimental/optimizer/test_fusion.py | 163 ++++++++++--- .../optimizer/{test_size.py => test_meta.py} | 22 +- tests/experimental/optimizer/test_ops.py | 21 +- 19 files changed, 862 insertions(+), 325 deletions(-) create mode 100644 CONTRIBUTING_MAC.md rename nums/experimental/optimizer/{size.py => node_meta.py} (74%) rename tests/experimental/optimizer/{test_size.py => test_meta.py} (82%) diff --git a/CONTRIBUTING_MAC.md b/CONTRIBUTING_MAC.md new file mode 100644 index 00000000..1fcf6f38 --- /dev/null +++ b/CONTRIBUTING_MAC.md @@ -0,0 +1,17 @@ +# Contributing As a Mac User + +## Installation + +For M1 Macs (Apple Silicon), various [online sources](https://github.com/scipy/scipy/issues/13409) recommend using OpenBLAS as a native (not Rosetta) linear algebra backend for dependencies like NumPy. +```sh +brew install openblas +export OPENBLAS=$(/opt/homebrew/bin/brew --prefix openblas) +``` + +As [documented by Ray](https://docs.ray.io/en/latest/ray-overview/installation.html#m1-mac-apple-silicon-support), one must install miniforge, a community-driven Conda installer, for packages that support `arm64`. In particular, the GRPCIO package from pip will not work with Ray, so it must be uninstalled (`pip uninstall grpcio`) and replaced with the one from miniforge (`conda install grpcio`). + + +## Testing + +`tests/core/backend/test_backend_init.py::test_head_detection` +As [documented by Ray](https://github.com/ray-project/ray/issues/24130), there is different default behavior for fetching node IP addresses. On Linux, a private address (e.g. `192.168.0.1`) is returned, while on Windows and Mac, `127.0.0.1` is returned. As NumS is primarily released for Linux, this test will fail for Mac contributors, but it shouldn't affect development. diff --git a/nums/core/array/base.py b/nums/core/array/base.py index bec634fe..a09bbbb1 100644 --- a/nums/core/array/base.py +++ b/nums/core/array/base.py @@ -13,12 +13,7 @@ # limitations under the License. 
-# pylint: disable = protected-access -# pylint: disable=too-many-lines - import warnings -from numba.core.errors import NumbaNotImplementedError - import numpy as np from nums.core.array import utils as array_utils @@ -26,6 +21,10 @@ from nums.core.grid.grid import ArrayGrid +# pylint: disable=protected-access, redefined-builtin +# pylint: disable=too-many-lines + + class BlockBase: block_id_counter = -1 @@ -68,7 +67,7 @@ def __repr__(self): def size(self): return np.product(self.shape) - def copy(self): + def copy(self, shallow=True): raise NotImplementedError() def get(self): @@ -209,7 +208,7 @@ def _check_bop_implemented(self, other): def binary_op(op_name, a, b, args: dict, device=None): raise NotImplementedError() - def bop(self, op_name, other, args: dict, device=None, **kwargs): + def bop(self, op_name, other, args: dict, device=None): raise NotImplementedError() def tensordot(self, other, axes): @@ -253,7 +252,7 @@ def __truediv__(self, other): def __rtruediv__(self, other): if not self._check_bop_implemented(other): return NotImplemented - return self.binary_op("trudiv", other, self, args={}) + return self.binary_op("truediv", other, self, args={}) def __pow__(self, other): if not self._check_bop_implemented(other): @@ -311,18 +310,6 @@ def __ne__(self, other): class Block(BlockBase): # pylint: disable=redefined-builtin, global-statement - def __init__( - self, - grid_entry, - grid_shape, - shape, - dtype, - transposed, - km: KernelManager, - id=None, - ): - super().__init__(grid_entry, grid_shape, shape, dtype, transposed, km, id) - @property def is_dense(self): return True @@ -592,53 +579,13 @@ def broadcast_to(self, shape): result.blocks = broadcast return result + def reduce_axis(self, op_name, axis, keepdims=False): + raise NotImplementedError() + def tree_reduce( - self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args ): - """ - Basic tree reduce imp. - Schedules op on same node as left operand. - :param op_name: The reduction op. - :param blocks_or_oids: A list of type Block or a list of tuples. - Tuples must be of the form - (oid, grid_entry, grid_shape, transposed) - :param result_grid_entry: The grid entry of the result block. This will be used - to compute the final reduction step. - :param result_grid_shape: The grid entry of the result block. This will be used - to compute the final reduction step. - :return: The oid of the result. 
- """ - oid_list = blocks_or_oids - if isinstance(blocks_or_oids[0], Block): - oid_list = [ - (b.oid, b.grid_entry, b.grid_shape, b.transposed) - for b in blocks_or_oids - ] - if len(oid_list) == 1: - return oid_list[0][0] - q = oid_list - while len(q) > 1: - a_oid, a_ge, a_gs, a_T = q.pop(0) - b_oid, _, _, b_T = q.pop(0) - ge, gs = ( - (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) - ) - c_oid = self.km.bop_reduce( - op_name, - a_oid, - b_oid, - a_T, - b_T, - syskwargs={ - "grid_entry": ge, - "grid_shape": gs, - }, - ) - q.append((c_oid, ge, gs, False)) - r_oid, r_ge, r_gs, _ = q.pop(0) - assert r_ge == result_grid_entry - assert r_gs == result_grid_shape - return r_oid + raise NotImplementedError() def check_or_convert_other(self, other, compute_block_shape=False): raise NotImplementedError() @@ -870,7 +817,7 @@ def __rmatmul__(self, other): # Inequalities ################# - def __inequality__(self, op, other): + def __inequality__(self, op_name, other): raise NotImplementedError() def __ge__(self, other): diff --git a/nums/core/array/blockarray.py b/nums/core/array/blockarray.py index db0eb20d..d6dabd6a 100644 --- a/nums/core/array/blockarray.py +++ b/nums/core/array/blockarray.py @@ -19,7 +19,7 @@ import numpy as np from nums.core.array import utils as array_utils -from nums.core.array.base import Block, BlockArrayBase +from nums.core.array.base import BlockBase, Block, BlockArrayBase from nums.core.array.view import ArrayView from nums.core.grid.grid import ArrayGrid from nums.core.kernel.kernel_manager import KernelManager @@ -298,10 +298,6 @@ def __getitem__(self, item): # Treat this as a shuffle. return self._advanced_single_array_select(ss, axis=axis) - # This is to deal with circular imports. Little overhead since this happens once per call. - # However, would be better to rearrange modules in the future. - from nums.core.array.view import ArrayView - av: ArrayView = ArrayView.from_block_array(self) # TODO (hme): We don't have to create, but do so for now until we need to optimize. return av[ss].create() @@ -410,10 +406,6 @@ def __setitem__(self, key, value): if is_handled_advanced: return self._advanced_single_array_assign(ss, value, axis) - # This is to deal with circular imports. Little overhead since this happens once per call. - # However, would be better to rearrange modules in the future. - from nums.core.array.view import ArrayView - av: ArrayView = ArrayView.from_block_array(self) av[key] = value @@ -714,6 +706,54 @@ def reduce_axis(self, op_name, axis, keepdims=False): ) return result + def tree_reduce( + self, op_name, blocks_or_oids, result_grid_entry, result_grid_shape, *args + ): + """ + Basic tree reduce imp. + Schedules op on same node as left operand. + :param op_name: The reduction op. + :param blocks_or_oids: A list of type Block or a list of tuples. + Tuples must be of the form + (oid, grid_entry, grid_shape, transposed) + :param result_grid_entry: The grid entry of the result block. This will be used + to compute the final reduction step. + :param result_grid_shape: The grid entry of the result block. This will be used + to compute the final reduction step. + :return: The oid of the result. 
+ """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) if len(q) == 0 else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + ################# # Arithmetic ################# @@ -827,6 +867,7 @@ def tensordot(a, b, axes=2): for k in sum_dims: a_block: Block = a.blocks[tuple(i + k)] b_block: Block = b.blocks[tuple(k + j)] + # pylint: disable=protected-access dot_grid_args = a._compute_tensordot_syskwargs(a_block, b_block) dotted_oid = a.km.bop( "tensordot", @@ -852,7 +893,7 @@ def tensordot(a, b, axes=2): # Inequalities ################# - def __inequality__(self, op, other): + def __inequality__(self, op_name, other): other = self.check_or_convert_other(other) if other is NotImplemented: return NotImplemented @@ -872,7 +913,7 @@ def __inequality__(self, op, other): else: other_block: Block = other.blocks[grid_entry] result.blocks[grid_entry] = self.blocks[grid_entry].bop( - op, other_block, args={} + op_name, other_block, args={} ) return result diff --git a/nums/core/array/random.py b/nums/core/array/random.py index 45196888..5e417270 100644 --- a/nums/core/array/random.py +++ b/nums/core/array/random.py @@ -311,5 +311,6 @@ def _sparse_sample_basic( p, syskwargs=syskwargs, ) + # pylint: disable=protected-access block._nnz = sba.km.sparse_nnz(block.oid, syskwargs=syskwargs) return sba diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py index f7dc2e2d..37939b68 100644 --- a/nums/core/array/sparse.py +++ b/nums/core/array/sparse.py @@ -1,13 +1,17 @@ from typing import List +import itertools +import warnings +import numpy as np +import sparse + from nums.core.array import utils as array_utils from nums.core.array.base import BlockBase, Block, BlockArrayBase from nums.core.array.blockarray import BlockArray from nums.core.kernel.kernel_manager import KernelManager from nums.core.grid.grid import ArrayGrid -import numpy as np -import itertools -import warnings -import sparse + + +# pylint: disable=protected-access, redefined-builtin class SparseBlock(BlockBase): @@ -77,7 +81,18 @@ def copy(self, shallow=True): return block def map_uop(self, op_name, args=None, kwargs=None, device=None): - block = self.copy() + densify = array_utils.get_sparse_uop_densify(op_name) + if densify: + block = Block( + self.grid_entry, + self.grid_shape, + self.shape, + self.dtype, + self.transposed, + self.km, + ) + else: + block = self.copy() block.dtype = array_utils.get_uop_output_type(op_name, self.dtype) args = () if args is None else args kwargs = {} if kwargs is None else kwargs @@ -87,7 +102,7 @@ def map_uop(self, op_name, args=None, kwargs=None, device=None): syskwargs = {"device": device} block._device = device block.oid = self.km.sparse_map_uop( - op_name, self.oid, args, kwargs, syskwargs=syskwargs + op_name, self.oid, args, kwargs, densify, syskwargs=syskwargs ) block._nnz = self.km.sparse_nnz(block.oid, syskwargs=syskwargs) block._nbytes = self.km.sparse_nbytes(block.oid, syskwargs=syskwargs) @@ -312,8 +327,12 @@ def 
from_sparse(cls, arr, block_shape, copy, km, fill_value=0): @classmethod def from_scalar(cls, val, km): - if not array_utils.is_scalar(val): - raise ValueError("%s is not a scalar." % val) + # Only create SparseBlockArray with 0s. Other scalars should use dense BlockArray. + if not np.isclose(val, 0): + warnings.warn( + "%s cannot fill SparseBlockArray. Converting to BlockArray." % val + ) + return BlockArray.from_np(np.array(val), (), copy=False, km=km) return SparseBlockArray.from_np(np.array(val), (), copy=False, km=km) @classmethod @@ -422,13 +441,178 @@ def _check_bop_implemented(self, other): return False def ufunc(self, op_name): - result = self.copy() + densify = array_utils.get_sparse_uop_densify(op_name) + if densify: + result = BlockArray(self.grid, self.km) + else: + result = self.copy() for grid_entry in self.grid.get_entry_iterator(): result.blocks[grid_entry] = self.blocks[grid_entry].ufunc(op_name) result._nnz = -1 result._nbytes = -1 return result + def reduce_axis(self, op_name, axis, keepdims=False): + if not (axis is None or isinstance(axis, (int, np.int32, np.int64))): + raise NotImplementedError("Only integer axis is currently supported.") + if 0 in self.shape: + return SparseBlockArray.create("zeros", (), (), float, self.km) + block_reduced_oids = np.empty_like(self.blocks, dtype=tuple) + for grid_entry in self.grid.get_entry_iterator(): + block = self.blocks[grid_entry] + block_oid = self.km.sparse_reduce_axis( + op_name=op_name, + arr=block.oid, + axis=axis, + keepdims=keepdims, + transposed=block.transposed, + syskwargs={ + "grid_entry": block.grid_entry, + "grid_shape": block.grid_shape, + }, + ) + block_reduced_oids[grid_entry] = ( + block_oid, + block.grid_entry, + block.grid_shape, + False, + ) + result_shape = [] + result_block_shape = [] + for curr_axis in range(len(self.shape)): + axis_size, axis_block_size = ( + self.shape[curr_axis], + self.block_shape[curr_axis], + ) + if curr_axis == axis or axis is None: + if keepdims: + axis_size, axis_block_size = 1, 1 + else: + continue + result_shape.append(axis_size) + result_block_shape.append(axis_block_size) + result_shape = tuple(result_shape) + result_block_shape = tuple(result_block_shape) + result_dtype = array_utils.get_reduce_output_type(op_name, self.dtype) + result_grid = ArrayGrid( + shape=result_shape, + block_shape=result_block_shape, + dtype=result_dtype.__name__, + ) + result = BlockArray(result_grid, self.km) + + if axis is None: + if result.shape == (): + result_block: Block = result.blocks[()] + else: + result_block: Block = result.blocks[:].item() + result_block.oid = self.tree_reduce( + op_name, + block_reduced_oids.flatten().tolist(), + result_block.grid_entry, + result_block.grid_shape, + False, + ) + else: + for result_grid_entry in result_grid.get_entry_iterator(): + block_reduced_oids_axis = [] + for sum_dim in range(self.grid.grid_shape[axis]): + grid_entry = list(result_grid_entry) + if keepdims: + grid_entry[axis] = sum_dim + else: + grid_entry = grid_entry[:axis] + [sum_dim] + grid_entry[axis:] + grid_entry = tuple(grid_entry) + block_reduced_oids_axis.append(block_reduced_oids[grid_entry]) + result_block: Block = result.blocks[result_grid_entry] + result_block.oid = self.tree_reduce( + op_name, + block_reduced_oids_axis, + result_block.grid_entry, + result_block.grid_shape, + False, + ) + return result + + # pylint: disable=arguments-differ + def tree_reduce( + self, + op_name, + blocks_or_oids, + result_grid_entry, + result_grid_shape, + densify, + *args, + ): + """ + Basic tree 
reduce imp. + Schedules op on same node as left operand. + :param op_name: The reduction op. + :param blocks_or_oids: A list of type Block or a list of tuples. + Tuples must be of the form + (oid, grid_entry, grid_shape, transposed) + :param result_grid_entry: The grid entry of the result block. This will be used + to compute the final reduction step. + :param result_grid_shape: The grid entry of the result block. This will be used + to compute the final reduction step. + :return: The oid of the result. + """ + oid_list = blocks_or_oids + if isinstance(blocks_or_oids[0], Block): + oid_list = [ + (b.oid, b.grid_entry, b.grid_shape, b.transposed) + for b in blocks_or_oids + ] + if len(oid_list) == 1: + return oid_list[0][0] + q = oid_list + if densify: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + else: + while len(q) > 1: + a_oid, a_ge, a_gs, a_T = q.pop(0) + b_oid, _, _, b_T = q.pop(0) + ge, gs = ( + (result_grid_entry, result_grid_shape) + if len(q) == 0 + else (a_ge, a_gs) + ) + c_oid = self.km.sparse_bop_reduce( + op_name, + a_oid, + b_oid, + a_T, + b_T, + syskwargs={ + "grid_entry": ge, + "grid_shape": gs, + }, + ) + q.append((c_oid, ge, gs, False)) + r_oid, r_ge, r_gs, _ = q.pop(0) + assert r_ge == result_grid_entry + assert r_gs == result_grid_shape + return r_oid + ################# # Arithmetic ################# @@ -583,7 +767,11 @@ def tensordot(a, b, axes=2): (dotted_oid, dot_grid_args[0], dot_grid_args[1], False) ) result_block.oid = a.tree_reduce( - "sum", sum_oids, result_block.grid_entry, result_block.grid_shape + "sum", + sum_oids, + result_block.grid_entry, + result_block.grid_shape, + densify, ) if not densify: syskwargs = { diff --git a/nums/core/array/utils.py b/nums/core/array/utils.py index ed01190e..356b78f7 100644 --- a/nums/core/array/utils.py +++ b/nums/core/array/utils.py @@ -18,7 +18,6 @@ import numpy as np import scipy.special -import sparse from nums.core.settings import np_ufunc_map from nums.core.array.errors import AxisError @@ -453,24 +452,25 @@ def normalize_axis_index(axis, ndim): return axis % ndim -def get_sparse_bop_densify(op_name, a_dense, b_dense): - def sample_array(is_dense): - s = np.eye(2) - if not is_dense: - return sparse.COO.from_numpy(s, fill_value=0) - return s - - sa = sample_array(a_dense) - sb = sample_array(b_dense) - if op_name == "tensordot": - result = sparse.tensordot(sa, sb) - else: - op_name = np_ufunc_map.get(op_name, op_name) - try: - ufunc = np.__getattribute__(op_name) - except Exception as _: - ufunc = scipy.special.__getattribute__(op_name) - result = sparse.elemwise(ufunc, sa, sb) - if isinstance(result, sparse.SparseArray): +def get_sparse_uop_densify(op_name): + ufunc = np.__getattribute__(op_name) + if ufunc(0) == 0: return False return True + + +sd_bop_sparse_ops = [ + "mul", +] + + +def get_sparse_bop_densify(op_name, a_dense: bool, b_dense: bool): + if a_dense and b_dense: + return True + + if a_dense or b_dense: + if op_name in sd_bop_sparse_ops: + return False + return True + + return False diff --git a/nums/core/array/view.py b/nums/core/array/view.py index 88e79267..9fa40b7e 100644 --- a/nums/core/array/view.py +++ b/nums/core/array/view.py @@ -13,7 +13,7 @@ # limitations under the License. 
-from typing import Tuple, Type +from typing import Tuple import numpy as np diff --git a/nums/core/kernel/kernel_interface.py b/nums/core/kernel/kernel_interface.py index f1f16218..0314011c 100644 --- a/nums/core/kernel/kernel_interface.py +++ b/nums/core/kernel/kernel_interface.py @@ -194,12 +194,20 @@ def sparse_random_block( ): raise NotImplementedError() - def sparse_map_uop(self, op_name, arr, args, kwargs, syskwargs: Dict): + def sparse_map_uop(self, op_name, arr, args, kwargs, densify, syskwargs: Dict): raise NotImplementedError() def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify, syskwargs: Dict): raise NotImplementedError() + def sparse_reduce_axis( + self, op_name, arr, axis, keepdims, transposed, syskwargs: Dict + ): + raise NotImplementedError() + + def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T, syskwargs: Dict): + raise NotImplementedError() + def sparse_block_from_scalar(self, x, syskwargs: Dict): raise NotImplementedError() diff --git a/nums/core/kernel/numpy_kernel.py b/nums/core/kernel/numpy_kernel.py index 20523e0a..1789622f 100644 --- a/nums/core/kernel/numpy_kernel.py +++ b/nums/core/kernel/numpy_kernel.py @@ -435,7 +435,6 @@ def bop(self, op, a1, a2, a1_T, a2_T, axes): ufunc = scipy.special.__getattribute__(op) return ufunc(a1, a2) - # Works for sparse too. def bop_reduce(self, op, a1, a2, a1_T, a2_T): if a1_T: a1 = a1.T @@ -572,7 +571,7 @@ def sparse_random_block( ) return result.astype(dtype) - def sparse_map_uop(self, op_name, arr, args, kwargs): + def sparse_map_uop(self, op_name, arr, args, kwargs, densify): """ Args: func: types.Callable @@ -583,6 +582,10 @@ def sparse_map_uop(self, op_name, arr, args, kwargs): args = list(args) args.insert(0, arr) result = sparse.elemwise(ufunc, *args, **kwargs) + if densify and isinstance(result, sparse.SparseArray): + result = result.todense() + elif not densify: + assert isinstance(result, sparse.SparseArray) return result def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify): @@ -611,6 +614,35 @@ def sparse_bop(self, op, a1, a2, a1_T, a2_T, axes, densify): assert isinstance(result, sparse.SparseArray) return result + def sparse_reduce_axis(self, op_name, arr, axis, keepdims, transposed): + assert isinstance(arr, sparse.COO) + op_func = np.__getattribute__(op_name) + if transposed: + arr = arr.T + return arr.reduce(op_func, axis=axis, keepdims=keepdims) + + def sparse_bop_reduce(self, op, a1, a2, a1_T, a2_T): + assert isinstance(a1, sparse.COO) and isinstance(a2, sparse.COO) + if a1_T: + a1 = a1.T + if a2_T: + a2 = a2.T + + # These are faster. + if op == "sum": + r = a1 + a2 + elif op == "prod": + r = a1 * a2 + else: + a = sparse.stack([a1, a2], axis=0) + r = a.reduce(np.__getattribute__(op), axis=0, keepdims=False) + + if a1 is np.nan or a2 is np.nan or r is np.nan: + assert np.isscalar(a1) and np.isscalar(a2) and np.isscalar(r) + else: + assert a1.shape == a2.shape == r.shape + return r + def sparse_block_from_scalar(self, x): assert np.isscalar(x) return sparse.COO.from_numpy(np.array(x), fill_value=0) diff --git a/nums/experimental/optimizer/fusion.py b/nums/experimental/optimizer/fusion.py index 1729f4d7..40373351 100644 --- a/nums/experimental/optimizer/fusion.py +++ b/nums/experimental/optimizer/fusion.py @@ -48,7 +48,7 @@ def traverse(self, node: TreeNode, fuseable_nodes): ) if child_leafs is None: # This branch has been pruned. - return None, None, None + return None, None, 0 if len(child_block_id_set) <= self.max_args: # If it's a frontier node, then there's no point in fusing it. 
# If it's a leaf, we still want to count toward node_leafs, @@ -102,11 +102,15 @@ def update_graph(self, root, fuseable_nodes): def fuse_node(self, node: TreeNode): result = FunctionNode(node.cluster_state) - result.op_func, result.children = node.fuse(result, self.km) + result.op_func, result.children, result.tree_node_meta = node.fuse( + result, self.km + ) result.set_shape(node.shape()) result.set_grid_entry(node.grid_entry()) result.set_grid_shape(node.grid_shape()) result.set_dtype(node.dtype()) + assert result.tree_node_meta.shape == result.shape() + assert result.tree_node_meta.dtype == result.dtype() expression_str = node.expression() result.set_expression(expression_str) diff --git a/nums/experimental/optimizer/fusion_utils.py b/nums/experimental/optimizer/fusion_utils.py index cc4e8134..c47aedd1 100644 --- a/nums/experimental/optimizer/fusion_utils.py +++ b/nums/experimental/optimizer/fusion_utils.py @@ -40,3 +40,21 @@ def set_using_marker(node, inputs): node.update_child([child], [inputs[child.marker]]) inputs[child.marker].parent = node return node + + +def print_graph(root: TreeNode): + """ + Recursively print string representations of tree nodes using BFS. + """ + print("print_graph") + nodes = [root] + while len(nodes) > 0: + node = nodes.pop(0) + if hasattr(node, "child"): + nodes.append(node.child) + if hasattr(node, "left"): + nodes.append(node.left) + nodes.append(node.right) + if hasattr(node, "children"): + nodes.extend(node.children) + print(node) diff --git a/nums/experimental/optimizer/graph.py b/nums/experimental/optimizer/graph.py index 2440e622..2ffc7a51 100644 --- a/nums/experimental/optimizer/graph.py +++ b/nums/experimental/optimizer/graph.py @@ -22,10 +22,11 @@ from nums.core.settings import sync_nnz from nums.core.array import utils as array_utils from nums.core.array.base import BlockBase, Block +from nums.core.array.sparse import SparseBlock from nums.core.grid.grid import Device from nums.core.kernel.kernel_manager import KernelManager from nums.experimental.optimizer.clusterstate import ClusterState -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import TreeNodeMeta, LeafMeta def subsample(total_items, max_items, rs: np.random.RandomState): @@ -52,6 +53,13 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self._dtype = None self._expression = None + # Type LeafMeta for leaves: shape, nnz, dtype, and whether output is_dense. + # Type TreeNodeMeta for other nodes: dtype and whether output is_dense. + self.tree_node_meta: TreeNodeMeta = None + # Whether this operation invokes dense or sparse kernel. + # Need to explicitly set is_dense property after initialization. 
+ self.dense_kernel = True + def get_root(self): if self.parent is None: return self @@ -123,6 +131,12 @@ def make_bop(self, op_name, other, args=None): assert self.parent is None and other.parent is None bop.left, bop.right = self, other bop.left.parent, bop.right.parent = bop, bop + bop.tree_node_meta = bop.left.tree_node_meta.bop_partial( + op_name, bop.right.tree_node_meta + ) + bop.dense_kernel = ( + bop.left.tree_node_meta.is_dense and bop.right.tree_node_meta.is_dense + ) return bop def tensordot(self, other, axes): @@ -154,7 +168,6 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): super().__init__(cluster_state, tree_node_id) self.block = None self.marker = -1 - self.tree_node_size = None def get_children(self): return [] @@ -186,8 +199,8 @@ def copy(self, cluster_state, parent=None, new_ids=False): # This property is only used for fusion. leaf.marker = self.marker - # This property is for experimental output size estimation. - leaf.tree_node_size = self.tree_node_size + leaf.tree_node_meta = self.tree_node_meta + leaf.dense_kernel = self.dense_kernel return leaf def get_leafs(self): @@ -229,7 +242,7 @@ def expression(self): def fuse(self, func_node, km: KernelManager): f = km.get_fuseable("identity") - return f, [self.copy(self.cluster_state, func_node)] + return f, [self.copy(self.cluster_state, func_node)], self.tree_node_meta def is_scalar(self): return self.block.size() == 1 @@ -242,10 +255,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.op_name = None def __repr__(self): - return "UnaryOp(name=%s, id=%s, child=%s)" % ( + return "UnaryOp(name=%s, id=%s, child=%s, dense_kernel=%s)" % ( self.op_name, str(self.tree_node_id), str(self.child.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -264,6 +278,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): uop.child = self.child.copy(cluster_state, parent=uop, new_ids=new_ids) uop.op_name = self.op_name uop.copy_on_op = self.copy_on_op + + uop.tree_node_meta = self.tree_node_meta + uop.dense_kernel = self.dense_kernel return uop def update_child(self, old_children, new_children): @@ -317,7 +334,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: self.cluster_state.commit_uop(self._mem_cost(), self.child.block.id, device) # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -338,7 +355,8 @@ def _collapse(self, device: Device): block: BlockBase = block.ufunc(op_name, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.uop(op_name) + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.uop(op_name) leaf.copy_on_op = self.copy_on_op return leaf, block @@ -348,8 +366,9 @@ def _mem_cost(self): if block.is_dense: return np.product(block.shape) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.uop(self.op_name).nbytes + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.uop(self.op_name).nbytes def shape(self): if self._shape is None: @@ -387,24 +406,35 @@ def dtype(self): def expression(self): if self._expression is None: - 
self._expression = "UnaryOp(op=%s, x=%s)" % ( + self._expression = "UnaryOp(op=%s, dense_kernel=%s, x=%s)" % ( self.op_name, + self.dense_kernel, self.child.expression(), ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = self.child.fuse(func_node, km) + child_op, child_args, child_meta = self.child.fuse(func_node, km) if self.op_name == "transpose": self_op = km.get_fuseable("transpose") else: - self_op = km.get_fuseable("map_uop") - self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + if self.dense_kernel: + self_op = km.get_fuseable("map_uop") + self_op = partial(self_op, op_name=self.op_name, args=(), kwargs={}) + else: + self_op = km.get_fuseable("sparse_map_uop") + self_op = partial( + self_op, + op_name=self.op_name, + args=(), + kwargs={}, + densify=self.tree_node_meta.is_dense, + ) def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return fused, child_args, child_meta.uop(self.op_name) class ReduceAxis(UnaryOp): @@ -430,6 +460,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): ra.axis = self.axis ra.keepdims = self.keepdims ra.copy_on_op = self.copy_on_op + + ra.tree_node_meta = self.tree_node_meta + ra.dense_kernel = self.dense_kernel return ra def _collapse(self, device: Device): @@ -445,22 +478,32 @@ def _collapse(self, device: Device): transposed=False, km=child_block.km, ) - block.oid = child_block.km.reduce_axis( - op_name=op_name, - arr=child_block.oid, - axis=self.axis, - keepdims=self.keepdims, - transposed=child_block.transposed, - syskwargs={"device": device}, - ) + if self.dense_kernel: + block.oid = child_block.km.reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) + else: + block.oid = child_block.km.sparse_reduce_axis( + op_name=op_name, + arr=child_block.oid, + axis=self.axis, + keepdims=self.keepdims, + transposed=child_block.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.child.tree_node_size.reduce_axis( + assert isinstance(self.child.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - self.child.block.transposed, ) leaf.copy_on_op = self.copy_on_op return leaf, block @@ -471,12 +514,12 @@ def _mem_cost(self): if block.is_dense: return np.product(self.shape()) if sync_nnz > 1: - self.child.tree_node_size.nnz = block.nnz # Blocking fetch - return self.child.tree_node_size.reduce_axis( + self.child.tree_node_meta.nnz = block.nnz # Blocking fetch + assert isinstance(self.child.tree_node_meta, LeafMeta) + return self.child.tree_node_meta.reduce_axis( self.op_name, self.axis, self.keepdims, - block.transposed, ).nbytes def update_tuple_property(self, val, keep_dim_val: Union[int, tuple] = 1): @@ -519,18 +562,25 @@ def dtype(self): def expression(self): if self._expression is None: - self._expression = "ReduceAxis(op=%s, x=%s, axis=%s, keepdims=%s)" % ( - self.op_name, - self.child.expression(), - str(self.axis), - str(self.keepdims), + self._expression = ( + "ReduceAxis(op=%s, axis=%s, keepdims=%s, dense_kernel=%s, x=%s)" + % ( + self.op_name, + str(self.axis), + str(self.keepdims), + self.dense_kernel, + self.child.expression(), + ) ) return self._expression def fuse(self, func_node, km: KernelManager): - child_op, child_args = self.child.fuse(func_node, 
km) + child_op, child_args, child_meta = self.child.fuse(func_node, km) - self_op = km.get_fuseable("reduce_axis") + if self.dense_kernel: + self_op = km.get_fuseable("reduce_axis") + else: + self_op = km.get_fuseable("sparse_reduce_axis") kwargs = { "op_name": self.op_name, "axis": self.axis, @@ -543,7 +593,11 @@ def fuse(self, func_node, km: KernelManager): def fused(*args): return self_op(arr=child_op(*args)) - return fused, child_args + return ( + fused, + child_args, + child_meta.reduce_axis(self.op_name, self.axis, self.keepdims), + ) class BinaryOp(TreeNode): @@ -563,11 +617,12 @@ def __repr__(self): "matmul": "@", "tensordot": "@", }[self.op_name] - return "BOp(id=%s, op=%s%s%s)" % ( + return "BOp(id=%s, op=%s%s%s, dense_kernel=%s)" % ( self.tree_node_id, str(self.left.tree_node_id), bop_symbol, str(self.right.tree_node_id), + self.dense_kernel, ) def get_children(self): @@ -591,6 +646,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): bop.left = self.left.copy(cluster_state, bop, new_ids=new_ids) bop.right = self.right.copy(cluster_state, bop, new_ids=new_ids) bop.copy_on_op = self.copy_on_op + + bop.tree_node_meta = self.tree_node_meta + bop.dense_kernel = self.dense_kernel return bop def update_child(self, old_children, new_children): @@ -672,7 +730,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. # self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local( @@ -702,9 +760,11 @@ def _collapse(self, device: Device): block: BlockBase = lblock.bop(op_name, rblock, args=args, device=device) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = self.left.tree_node_size.bop( + assert isinstance(self.left.tree_node_meta, LeafMeta) + assert isinstance(self.right.tree_node_meta, LeafMeta) + leaf.tree_node_meta = self.left.tree_node_meta.bop( op_name, - self.right.tree_node_size, + self.right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -726,11 +786,13 @@ def _mem_cost(self): if lblock.is_dense and rblock.is_dense: return np.product(self.shape()) if sync_nnz > 1: - self.left.tree_node_size.nnz = lblock.nnz # Blocking fetch - self.right.tree_node_size.nnz = rblock.nnz # Blocking fetch - return self.left.tree_node_size.bop( + self.left.tree_node_meta.nnz = lblock.nnz # Blocking fetch + self.right.tree_node_meta.nnz = rblock.nnz # Blocking fetch + assert isinstance(self.left.tree_node_meta, LeafMeta) + assert isinstance(self.right.tree_node_meta, LeafMeta) + return self.left.tree_node_meta.bop( op_name, - self.right.tree_node_size, + self.right.tree_node_meta, **args, ).nbytes @@ -801,27 +863,44 @@ def expression(self): if self._expression is None: if self.op_name == "matmul" or self.op_name == "tensordot": axes = self.args.get("axes", 1) - self._expression = "BinaryOp(op=%s, x=%s, y=%s, axes=%s)" % ( - self.op_name, - self.left.expression(), - self.right.expression(), - axes, + self._expression = ( + "BinaryOp(op=%s, axes=%s, dense_kernel=%s, x=%s, y=%s)" + % ( + self.op_name, + axes, + self.dense_kernel, + self.left.expression(), + self.right.expression(), + ) ) - self._expression = "BinaryOp(op=%s, x=%s, y=%s)" % ( + self._expression = "BinaryOp(op=%s, dense_kernel=%s, x=%s, y=%s)" % ( self.op_name, + self.dense_kernel, 
self.left.expression(), self.right.expression(), ) return self._expression def fuse(self, func_node, km: KernelManager): - left_op, left_args = self.left.fuse(func_node, km) - right_op, right_args = self.right.fuse(func_node, km) - - self_op = km.get_fuseable("bop") + left_op, left_args, left_meta = self.left.fuse(func_node, km) + right_op, right_args, right_meta = self.right.fuse(func_node, km) axes = 1 if self.args is None else self.args.get("axes", 1) - self_op = partial(self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes) + if self.dense_kernel: + self_op = km.get_fuseable("bop") + self_op = partial( + self_op, op=self.op_name, a1_T=False, a2_T=False, axes=axes + ) + else: + self_op = km.get_fuseable("sparse_bop") + self_op = partial( + self_op, + op=self.op_name, + a1_T=False, + a2_T=False, + axes=axes, + densify=self.tree_node_meta.is_dense, + ) num_left = len(left_args) # Combine the left and right args. args = left_args + right_args @@ -836,7 +915,21 @@ def fused(*args): args2 = args[num_left:] return self_op(a1=left_op(*args1), a2=right_op(*args2)) - return fused, args + if self.op_name == "matmul": + op_name, extra_args = "tensordot", {"axes": 1} + elif self.op_name == "tensordot": + op_name, extra_args = "tensordot", self.args + else: + op_name, extra_args = self.op_name, {} + return ( + fused, + args, + left_meta.bop( + op_name, + right_meta, + **extra_args, + ), + ) class FunctionNode(TreeNode): @@ -867,7 +960,7 @@ def finalize(self, km: KernelManager): km.register(self.op_hash, self.op_func, {}) def __repr__(self): - return "Function(id=%s, op=%s, args=%s" % ( + return "Function(id=%s, op=%s, args=%s)" % ( self.tree_node_id, self.op_hash, len(self.children), @@ -895,11 +988,13 @@ def copy(self, cluster_state, parent=None, new_ids=False): fnode.parent = parent fnode.op_hash = self.op_hash fnode.op_func = self.op_func - fnode.op_expression = self.op_expression fnode.children = [ child.copy(cluster_state, fnode, new_ids=new_ids) for child in self.children ] fnode.copy_on_op = self.copy_on_op + + fnode.tree_node_meta = self.tree_node_meta + fnode.dense_kernel = self.dense_kernel return fnode def update_child(self, old_children, new_children): @@ -1007,7 +1102,11 @@ def _collapse(self, device: Device): block_oids.append(child.block.oid) if km is None: km = child.block.km - block: BlockBase = Block( + if self.tree_node_meta.is_dense: + block_type = Block + else: + block_type = SparseBlock + block: BlockBase = block_type( self._grid_entry, self._grid_shape, self._shape, self._dtype, False, km ) block._device = device @@ -1016,6 +1115,7 @@ def _collapse(self, device: Device): ) leaf: Leaf = Leaf(self.cluster_state) leaf.block = block + leaf.tree_node_meta = self.tree_node_meta leaf.copy_on_op = self.copy_on_op return leaf, block @@ -1063,7 +1163,7 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None): self.children: List[TreeNode] = None def __repr__(self): - return "Einsum(id=%s, subscript=%s, operands=%s" % ( + return "Einsum(id=%s, subscript=%s, operands=%s)" % ( self.tree_node_id, self.subscript, len(self.children), @@ -1184,7 +1284,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> Leaf: # Update cluster state with new block. 
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: for block_id in block_ids: @@ -1214,8 +1314,11 @@ def _collapse(self, device: Device): leaf: Leaf = Leaf(self.cluster_state) leaf.block = block # Assume dense for simplicity. - leaf.tree_node_size = TreeNodeSize( - self.shape(), np.prod(self.shape()), block.dtype, block.is_dense + leaf.tree_node_meta = LeafMeta( + self.shape(), + np.prod(self.shape()), + block.dtype, + block.is_dense, ) leaf.copy_on_op = self.copy_on_op return leaf, block diff --git a/nums/experimental/optimizer/grapharray.py b/nums/experimental/optimizer/grapharray.py index 4755a574..07de2b6f 100644 --- a/nums/experimental/optimizer/grapharray.py +++ b/nums/experimental/optimizer/grapharray.py @@ -35,12 +35,13 @@ Leaf, UnaryOp, ReduceAxis, + FunctionNode, Einsum, ) from nums.experimental.optimizer.reduction_ops import TreeReductionOp from nums.experimental.optimizer.fusion import FuseGraph from nums.experimental.optimizer.fusion_utils import set_using_marker, traverse_marker -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta class GraphArray(object): @@ -66,17 +67,18 @@ def graphs_from_ba( nnz = block.nnz # Blocking fetch else: nnz = np.prod(block.shape) - leaf.tree_node_size = TreeNodeSize( + leaf.tree_node_meta = LeafMeta( block.shape, nnz, block.dtype, block.is_dense, ) + leaf.dense_kernel = block.is_dense leaf.copy_on_op = copy_on_op graphs[grid_entry] = leaf cluster_state.add_block( - block.id, leaf.tree_node_size.nbytes, devices=[device] + block.id, leaf.tree_node_meta.nbytes, devices=[device] ) cluster_state.init_mem_load(device, block.id) return graphs @@ -224,6 +226,9 @@ def tensordot(a, b, axes=2): # tree structure here is never exposed. dot_node.parent = rop rop.add_child(dot_node) + rop.tree_node_meta = dot_node.tree_node_meta + # Assume children are all dense or all sparse. + rop.dense_kernel = dot_node.tree_node_meta.is_dense result_graphs[grid_entry] = rop return GraphArray( @@ -369,6 +374,8 @@ def _add_uop(self, op_name, grid_entry, old_arr, new_arr): assert old_root.parent is None uop.child = old_root old_root.parent = uop + uop.tree_node_meta = old_root.tree_node_meta.uop_partial(op_name) + uop.dense_kernel = old_root.tree_node_meta.is_dense new_arr[grid_entry] = uop def reduce_axis(self, op_name, axis, keepdims): @@ -388,6 +395,12 @@ def reduce_axis(self, op_name, axis, keepdims): reduced_tnode.op_name = op_name reduced_tnode.axis = axis reduced_tnode.keepdims = keepdims + reduced_tnode.tree_node_meta = tnode.tree_node_meta.reduce_axis_partial( + op_name, + axis, + keepdims, + ) + reduced_tnode.dense_kernel = tnode.tree_node_meta.is_dense reduced_graphs[grid_entry] = reduced_tnode # Compute output GraphArray properties. @@ -442,6 +455,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense if result_grid.shape == (): # keepdims = False. 
result_graphs[()] = rop @@ -469,6 +484,8 @@ def reduce_axis(self, op_name, axis, keepdims): rop.add_child(child) assert child.parent is None child.parent = rop + rop.tree_node_meta = child.tree_node_meta + rop.dense_kernel = child.tree_node_meta.is_dense result_graphs[result_grid_entry] = rop return GraphArray( @@ -485,16 +502,22 @@ def sum(self, axis=None, keepdims=False): def compile(self, max_args: int): result_graphs = np.empty_like(self.graphs, dtype=self.graphs.dtype) counter = 0 + first_grid_entry = (0,) * len(self.grid.shape) for grid_entry in self.grid.get_entry_iterator(): graph = self.graphs[grid_entry] _, leaf_inputs = traverse_marker(graph, 0) - if grid_entry == (0,) or grid_entry == (0, 0): # generic - result_graphs[grid_entry] = FuseGraph( - graph, self.km, max_args=max_args - )() - fused_graph = result_graphs[grid_entry] - fused_graph.op_expression = fused_graph._expression + if grid_entry == first_grid_entry: + fused_graph = FuseGraph(graph, self.km, max_args=max_args)() + if not isinstance(fused_graph, FunctionNode): + # Stopgap as this function currently assumes root is FunctionNode. + return self + result_graphs[grid_entry] = fused_graph + # result_graphs[grid_entry] = FuseGraph( + # graph, self.km, max_args=max_args + # )() + # fused_graph = result_graphs[grid_entry] else: + # TODO: support subtree fusion in this section. FuseGraph already does. fused_graph_copy = fused_graph.copy(self.cluster_state, new_ids=True) fused_graph_copy = set_using_marker(fused_graph_copy, leaf_inputs) fused_graph_copy.set_grid_entry(grid_entry) diff --git a/nums/experimental/optimizer/size.py b/nums/experimental/optimizer/node_meta.py similarity index 74% rename from nums/experimental/optimizer/size.py rename to nums/experimental/optimizer/node_meta.py index 089d97d4..236fe580 100644 --- a/nums/experimental/optimizer/size.py +++ b/nums/experimental/optimizer/node_meta.py @@ -3,44 +3,65 @@ from nums.core.array import utils as array_utils -# TODO: integrate this class with FuseGraph. -class TreeNodeSize: +class TreeNodeMeta: """ - Encapsulated by each TreeNode to keep track of estimated or observed sizes of blocks. + Encapsulated by each TreeNode to keep track of node metadata like shape and output density. + """ + + def __init__(self, dtype, is_dense): + self.dtype = dtype + self.is_dense = is_dense + + def copy(self): + return TreeNodeMeta( + self.dtype, + self.is_dense, + ) + + def uop_partial(self, op_name): + """ + Does not update nnz, as it can't be estimated for non-leaf nodes. + """ + is_dense = self.is_dense or array_utils.get_sparse_uop_densify(op_name) + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=is_dense, + ) + + def reduce_axis_partial(self, op_name, axis, keepdims): + return TreeNodeMeta( + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + + def bop_partial(self, op_name, other, **kwargs): + dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) + is_dense = array_utils.get_sparse_bop_densify( + op_name, + self.is_dense, + other.is_dense, + ) + return TreeNodeMeta(dtype, is_dense) + + +class LeafMeta(TreeNodeMeta): + """ + Encapsulated by each Leaf to keep track of sizes of blocks in addition to TreeNodeMeta. Sparse binary operations use B(n, p) to estimate nnz as needed. 
""" def __init__(self, shape, nnz, dtype, is_dense, index_dtype=np.int64): + super().__init__(dtype, is_dense) self.shape = shape self.nnz = nnz - self.dtype = dtype - self.is_dense = is_dense self.index_dtype = index_dtype if self.is_dense: assert nnz == np.prod(shape) else: assert index_dtype is not None - self.bop_estimation_map = { - "add": self.add, - "sum": self.add, - "sub": self.add, - "mul": self.mul, - "prod": self.mul, - "truediv": self.truediv, - "pow": self.pow, - "matmul": lambda other: self.tensordot(other, axes=1), - "tensordot": self.tensordot, - "lt": lambda other: self.inequality("lt", other), - "le": lambda other: self.inequality("le", other), - "gt": lambda other: self.inequality("gt", other), - "ge": lambda other: self.inequality("ge", other), - "eq": lambda other: self.inequality("eq", other), - "ne": lambda other: self.inequality("ne", other), - } - def copy(self): - return TreeNodeSize( + return LeafMeta( self.shape, self.nnz, self.dtype, @@ -61,36 +82,49 @@ def nbytes(self): def uop(self, op_name): if op_name == "transpose": - return TreeNodeSize( + return LeafMeta( shape=tuple(reversed(self.shape)), nnz=self.nnz, dtype=self.dtype, is_dense=self.is_dense, ) - return TreeNodeSize( + nnz = self.nnz + is_dense = self.is_dense + if not is_dense and array_utils.get_sparse_uop_densify(op_name): + nnz = np.prod(self.shape) + is_dense = True + return LeafMeta( shape=self.shape, - nnz=self.nnz, + nnz=nnz, dtype=array_utils.get_uop_output_type(op_name, self.dtype), - is_dense=self.is_dense, + is_dense=is_dense, ) - def reduce_axis(self, op_name, axis, keepdims, transposed): - # Assume dense for now + def reduce_axis(self, op_name, axis, keepdims): shape = list(self.shape) - if transposed: - shape.reverse() if axis is None: shape = [] elif keepdims: shape[axis] = 1 else: shape.pop(axis) - return TreeNodeSize( - shape=tuple(shape), - nnz=np.prod(shape), - dtype=array_utils.get_uop_output_type(op_name, self.dtype), - is_dense=True, - ) + if self.is_dense: + return LeafMeta( + shape=tuple(shape), + nnz=np.prod(shape), + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=True, + ) + else: + # If any element in reduced axis is nonzero, result is nonzero. + p1 = self.nnz / np.prod(self.shape) + nnz = int((1 - (1 - p1) ** self.shape[axis]) * np.prod(shape)) + return LeafMeta( + shape=tuple(shape), + nnz=nnz, + dtype=array_utils.get_uop_output_type(op_name, self.dtype), + is_dense=False, + ) def _nnz_disjunction(self, other, shape): # If either element is nonzero, result is nonzero. @@ -123,7 +157,7 @@ def add(self, other): other.is_dense, ) if is_dense: - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=np.prod(shape), dtype=dtype, @@ -133,7 +167,7 @@ def add(self, other): raise ValueError( "TreeNodeSize.__add__ is inconsistent with sparse bop rules." ) - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=self._nnz_disjunction(other, shape), dtype=dtype, @@ -151,7 +185,7 @@ def mul(self, other): other.is_dense, ) if is_dense: - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=np.prod(shape), dtype=dtype, @@ -167,7 +201,7 @@ def mul(self, other): raise ValueError( "TreeNodeSize.__mul__ is inconsistent with sparse bop rules." 
) - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=nnz, dtype=dtype, @@ -185,7 +219,7 @@ def truediv(self, other): other.is_dense, ) if is_dense: - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=np.prod(shape), dtype=dtype, @@ -201,7 +235,7 @@ def truediv(self, other): nnz = self._nnz_selection(other, shape) else: nnz = self._nnz_disjunction(other, shape) - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=nnz, dtype=dtype, @@ -230,7 +264,7 @@ def tensordot(self, other, axes): "tensordot", self.is_dense, other.is_dense ) if is_dense: - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=np.prod(shape), dtype=dtype, @@ -246,7 +280,7 @@ def tensordot(self, other, axes): p2 = other.nnz / n2 m = np.prod(shape) k = np.prod(sum_shape) - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=int((1 - (1 - p1 * p2) ** k) * m), dtype=dtype, @@ -256,18 +290,18 @@ def tensordot(self, other, axes): def inequality(self, op_name, other): assert other.shape == () dtype = array_utils.get_bop_output_type(op_name, self.dtype, other.dtype) - return TreeNodeSize( + return LeafMeta( shape=self.shape, nnz=self.nnz, dtype=dtype, is_dense=self.is_dense, ) - def bop_dense(self, other): + def bop_dense(self, other, **kwargs): # NOTE: as catch-all fallback, this may be wrong for unknown non-elementwise binary ops. shape = array_utils.broadcast_shape(self.shape, other.shape) dtype = array_utils.get_bop_output_type("add", self.dtype, other.dtype) - return TreeNodeSize( + return LeafMeta( shape=shape, nnz=np.prod(shape), dtype=dtype, @@ -275,4 +309,21 @@ def bop_dense(self, other): ) def bop(self, op_name, other, **kwargs): - return self.bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs) + bop_estimation_map = { + "add": self.add, + "sum": self.add, + "sub": self.add, + "mul": self.mul, + "prod": self.mul, + "truediv": self.truediv, + "pow": self.pow, + "matmul": lambda other: self.tensordot(other, axes=1), + "tensordot": self.tensordot, + "lt": lambda other: self.inequality("lt", other), + "le": lambda other: self.inequality("le", other), + "gt": lambda other: self.inequality("gt", other), + "ge": lambda other: self.inequality("ge", other), + "eq": lambda other: self.inequality("eq", other), + "ne": lambda other: self.inequality("ne", other), + } + return bop_estimation_map.get(op_name, self.bop_dense)(other, **kwargs) diff --git a/nums/experimental/optimizer/reduction_ops.py b/nums/experimental/optimizer/reduction_ops.py index 5ef10f03..50228ad3 100644 --- a/nums/experimental/optimizer/reduction_ops.py +++ b/nums/experimental/optimizer/reduction_ops.py @@ -41,10 +41,11 @@ def __init__(self, cluster_state: ClusterState, tree_node_id=None, seed=1337): self.action_leaf_q = [] def __repr__(self): - return "Reduc(id=%s, op=%s, in=%d)" % ( + return "Reduc(id=%s, op=%s, in=%d, dense_kernel=%s)" % ( str(self.tree_node_id), self.op_name, len(self.children_dict), + self.dense_kernel, ) def get_children(self): @@ -83,6 +84,9 @@ def copy(self, cluster_state, parent=None, new_ids=False): if child.tree_node_id in self.leafs_dict: rop.leafs_dict[child_copy.tree_node_id] = child_copy # TODO (hme): How do we properly copy random state? + + rop.tree_node_meta = self.tree_node_meta + rop.dense_kernel = self.dense_kernel return rop def add_child(self, child: TreeNode): @@ -231,7 +235,7 @@ def execute_on(self, device: Device, leaf_ids=None) -> TreeNode: # Update cluster state with new block. 
# self.cluster_state.add_block(new_block.id, new_block.size(), [device]) self.cluster_state.add_block( - new_block.id, new_leaf.tree_node_size.nbytes, [device] + new_block.id, new_leaf.tree_node_meta.nbytes, [device] ) if not self.cluster_state.created_on_only: assert self.cluster_state.blocks_local(left.block.id, right.block.id) @@ -266,21 +270,33 @@ def _collapse(self, device: Device, left: Leaf, right: Leaf): block: BlockBase = lblock.copy() block.transposed = False block.dtype = array_utils.get_reduce_output_type(self.op_name, lblock.dtype) - block.oid = lblock.km.bop_reduce( - op_name, - lblock.oid, - rblock.oid, - lblock.transposed, - rblock.transposed, - syskwargs={"device": device}, - ) + if self.dense_kernel: + assert lblock.is_dense and rblock.is_dense + block.oid = lblock.km.bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) + else: + assert not lblock.is_dense or not rblock.is_dense + block.oid = lblock.km.sparse_bop_reduce( + op_name, + lblock.oid, + rblock.oid, + lblock.transposed, + rblock.transposed, + syskwargs={"device": device}, + ) block._device = device leaf: Leaf = Leaf(self.cluster_state) leaf.block = block - leaf.tree_node_size = left.tree_node_size.bop( + leaf.tree_node_meta = left.tree_node_meta.bop( op_name, - right.tree_node_size, + right.tree_node_meta, **args, ) leaf.copy_on_op = self.copy_on_op @@ -303,13 +319,13 @@ def _mem_cost(self, leafs): if leaf_block.is_dense: return leaf_block.size() if sync_nnz > 1: - leafs[0].tree_node_size.nnz = leafs[0].block.nnz # Blocking fetch - leafs[1].tree_node_size.nnz = leafs[1].block.nnz # Blocking fetch + leafs[0].tree_node_meta.nnz = leafs[0].block.nnz # Blocking fetch + leafs[1].tree_node_meta.nnz = leafs[1].block.nnz # Blocking fetch return ( leafs[0] - .tree_node_size.bop( + .tree_node_meta.bop( self.op_name, - leafs[1].tree_node_size, + leafs[1].tree_node_meta, ) .nbytes ) @@ -350,10 +366,14 @@ def expression(self): # This will force a different hash for large fused reductions, # if this is, for whatever reason, needed. size = len(self.children_dict) - self._expression = "TreeReductionOp(op=%s, size=%s, id=%s)" % ( - self.op_name, - str(size), - self.tree_node_id, + self._expression = ( + "TreeReductionOp(op=%s, size=%s, id=%s, dense_kernel=%s)" + % ( + self.op_name, + str(size), + self.tree_node_id, + self.dense_kernel, + ) ) return self._expression diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 4563771e..5a554799 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -1,9 +1,7 @@ import numpy as np import sparse -import pytest from nums.core.array.application import ArrayApplication -from nums.core.array.blockarray import BlockArray from nums.core.array.sparse import SparseBlockArray from nums.core.array.random import NumsRandomState @@ -83,16 +81,12 @@ def test_sparse_add(app_inst: ArrayApplication): assert np.array_equal(x2 - x1_sp, y_ba.get()) # Test sparse-scalar. 
- y_sp = x1_sp - 1 # __sub__ - y_sba = x1_sba - 1 - assert y_sba.nnz == y_sp.nnz # 16 - y_ba = y_sba.to_ba() - assert np.array_equal(x1 - 1, y_ba.get()) - y_sp = 1 - x1_sp # __rsub__ - y_sba = 1 - x1_sba - assert y_sba.nnz == y_sp.nnz # 16 - y_ba = y_sba.to_ba() - assert np.array_equal(1 - x1, y_ba.get()) + y = (x1_sp - 1).todense() # __sub__ + y_ba = x1_sba - 1 + assert np.array_equal(y, y_ba.get()) + y = (1 - x1_sp).todense() # __rsub__ + y_ba = 1 - x1_sba + assert np.array_equal(y, y_ba.get()) def test_sparse_mul(app_inst: ArrayApplication): diff --git a/tests/experimental/optimizer/test_fusion.py b/tests/experimental/optimizer/test_fusion.py index 7661fdae..4ced2e78 100644 --- a/tests/experimental/optimizer/test_fusion.py +++ b/tests/experimental/optimizer/test_fusion.py @@ -26,12 +26,14 @@ from nums.core.array.application import BlockArray from nums.core.array.base import BlockArrayBase +from nums.core.array.sparse import SparseBlockArray from nums.experimental.optimizer.clusterstate import ClusterState from nums.experimental.optimizer.grapharray import ( GraphArray, ) from nums.experimental.optimizer.tree_search import RandomTS from nums.experimental.optimizer.fusion import FuseGraph +from nums.experimental.optimizer.fusion_utils import print_graph from nums.experimental.optimizer.graph import TreeNode, Leaf, FunctionNode import conftest @@ -45,12 +47,8 @@ def fusion2(app, x, y): return x @ y -def fusion3(app, s, q, p): - return s * (q @ p.T) - - def ga_op( - app, func, x: BlockArray, y: BlockArray, copy_on_op=True, max_args=2 + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 ) -> BlockArray: cluster_state: ClusterState = ClusterState(x.km.devices()) x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) @@ -69,27 +67,6 @@ def ga_op( return BlockArray(result_ga.grid, x.km, result_ga.to_blocks()) -def ga_op_sampled_dense_dense( - app, func, s: BlockArray, p: BlockArray, q: BlockArray, copy_on_op=True, max_args=3 -) -> BlockArray: - cluster_state: ClusterState = ClusterState(s.km.devices()) - s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) - p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) - q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) - op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) - start_time = time.time() - fused_ga: GraphArray = op_ga.compile(max_args) - end_time = time.time() - result_ga: GraphArray = RandomTS( - seed=conftest.rs, - max_samples_per_step=1, - max_reduction_pairs=1, - force_final_action=True, - ).solve(fused_ga) - - return BlockArray(result_ga.grid, s.km, result_ga.to_blocks()) - - def test_fusion(app_inst_mock_none): app = app_inst_mock_none x_shape, x_block_shape = (10,), (5,) @@ -140,24 +117,132 @@ def test_tensordot(app_inst_mock_none): assert block.dtype == opt_z.dtype -def test_sparse_array(app_inst_mock_none): +def ga_op_sparse_2( + app, func, x: BlockArrayBase, y: BlockArrayBase, copy_on_op=True, max_args=2 +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(x.km.devices()) + x_ga: GraphArray = GraphArray.from_ba(x, cluster_state, copy_on_op=copy_on_op) + y_ga: GraphArray = GraphArray.from_ba(y, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, x_ga, y_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + 
force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, x.km, result_ga.to_blocks()) + + +def spmm(app, p, q): + return p @ q + + +def test_spmm(app_inst_mock_none): app = app_inst_mock_none - q_shape, q_block_shape = (10, 2), (2, 2) - p_shape, p_block_shape = (20, 2), (2, 2) - s_shape, s_block_shape = (10, 20), (2, 2) - real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (4, 10), (2, 2) + p: SparseBlockArray = app.random.sparse_normal( + shape=p_shape, block_shape=p_block_shape, p=0.1 + ) + q: SparseBlockArray = app.random.sparse_normal( + shape=q_shape, block_shape=q_block_shape, p=0.1 + ) + real_p = p.to_ba().get() + real_q = q.to_ba().get() + z: SparseBlockArray = spmm(app, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_2(app, spmm, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), spmm(np, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def ga_op_sparse_3( + app, + func, + s: BlockArrayBase, + p: BlockArrayBase, + q: BlockArrayBase, + copy_on_op=True, + max_args=3, +) -> SparseBlockArray: + cluster_state: ClusterState = ClusterState(s.km.devices()) + s_ga: GraphArray = GraphArray.from_ba(s, cluster_state, copy_on_op=copy_on_op) + p_ga: GraphArray = GraphArray.from_ba(p, cluster_state, copy_on_op=copy_on_op) + q_ga: GraphArray = GraphArray.from_ba(q, cluster_state, copy_on_op=copy_on_op) + op_ga: GraphArray = func(app, s_ga, p_ga, q_ga) + start_time = time.time() + fused_ga: GraphArray = op_ga.compile(max_args) + end_time = time.time() + print(fused_ga.graphs[0, 0]) + result_ga: GraphArray = RandomTS( + seed=conftest.rs, + max_samples_per_step=1, + max_reduction_pairs=1, + force_final_action=True, + ).solve(fused_ga) + + return SparseBlockArray(result_ga.grid, s.km, result_ga.to_blocks()) + + +def sparse_fusion(app, s, p, q): + return (s + s) * p * q + + +def test_sparse_fusion(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (10, 4), (2, 2) + p_shape, p_block_shape = (10, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) real_p = np.random.random(np.product(p_shape)).reshape(p_shape) - real_s = np.random.random(np.product(s_shape)).reshape(s_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) + p: BlockArray = app.array(real_p, p_block_shape) q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sparse_fusion(app, s, p, q) + start_time = time.time() + opt_z: SparseBlockArray = ga_op_sparse_3(app, sparse_fusion, s, p, q) + end_time = time.time() + print(end_time - start_time) + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sparse_fusion(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() + + +def sddmm(app, s, p, q): + return s * (p @ q.T) + + +def test_sddmm(app_inst_mock_none): + app = app_inst_mock_none + s_shape, s_block_shape = (20, 10), (2, 2) + p_shape, p_block_shape = (20, 4), (2, 2) + q_shape, q_block_shape = (10, 4), (2, 2) + real_p = np.random.random(np.product(p_shape)).reshape(p_shape) + real_q = np.random.random(np.product(q_shape)).reshape(q_shape) + s: SparseBlockArray = app.random.sparse_normal( + shape=s_shape, block_shape=s_block_shape, p=0.1 + ) p: BlockArray = 
app.array(real_p, p_block_shape) - s: BlockArray = app.array(real_s, s_block_shape) - z: BlockArray = fusion3(app, s, q, p) + q: BlockArray = app.array(real_q, q_block_shape) + real_s = s.to_ba().get() + z: SparseBlockArray = sddmm(app, s, p, q) start_time = time.time() - opt_z: BlockArray = ga_op_sampled_dense_dense(app, fusion3, s, q, p) + opt_z: SparseBlockArray = ga_op_sparse_3(app, sddmm, s, p, q) end_time = time.time() print(end_time - start_time) - assert np.allclose(z.get(), fusion3(np, real_s, real_q, real_p)) - assert app.allclose(z, opt_z).get() + assert z.nnz == opt_z.nnz + assert np.allclose(z.to_ba().get(), sddmm(np, real_s, real_p, real_q)) + assert app.allclose(z.to_ba(), opt_z.to_ba()).get() if __name__ == "__main__": @@ -165,8 +250,8 @@ def test_sparse_array(app_inst_mock_none): app = conftest.mock_ray_cluster((1, 1)) # test_sparse_array(app) - # test_fusion(app) - test_tensordot_variant2(app) + test_fusion(app) + # test_tensordot_variant2(app) conftest.destroy_mock_cluster(app) # app = conftest.mock_cluster((10, 1)) diff --git a/tests/experimental/optimizer/test_size.py b/tests/experimental/optimizer/test_meta.py similarity index 82% rename from tests/experimental/optimizer/test_size.py rename to tests/experimental/optimizer/test_meta.py index 340ae163..2b06ba78 100644 --- a/tests/experimental/optimizer/test_size.py +++ b/tests/experimental/optimizer/test_meta.py @@ -1,13 +1,13 @@ import numpy as np import sparse -from nums.experimental.optimizer.size import TreeNodeSize +from nums.experimental.optimizer.node_meta import LeafMeta def test_nbytes(): x1 = np.eye(10) x1_sp = sparse.COO.from_numpy(x1, fill_value=0) - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, @@ -17,7 +17,7 @@ def test_nbytes(): def test_uop(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, @@ -29,13 +29,13 @@ def test_uop(): def test_add(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, @@ -47,13 +47,13 @@ def test_add(): def test_mul(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, @@ -62,7 +62,7 @@ def test_mul(): y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 0.2 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, @@ -71,7 +71,7 @@ def test_mul(): y_ts = x1_ts * x2_ts assert np.allclose(y_ts.nnz, int(0.1 * 100)) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 1), 10, np.int64, @@ -83,13 +83,13 @@ def test_mul(): def test_tensordot(): - x1_ts = TreeNodeSize( + x1_ts = LeafMeta( (10, 10), 10, np.int64, False, ) - x2_ts = TreeNodeSize( + x2_ts = LeafMeta( (10, 10), 20, np.int64, diff --git a/tests/experimental/optimizer/test_ops.py b/tests/experimental/optimizer/test_ops.py index 2098a692..156c9651 100644 --- a/tests/experimental/optimizer/test_ops.py +++ b/tests/experimental/optimizer/test_ops.py @@ -200,21 +200,24 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: app = app_inst_mock_small cluster_state = ClusterState(app.km.devices()) - X = app.random.sparse_normal(shape=(10, 3), block_shape=(5, 3)) + X = app.random.sparse_normal(shape=(10, 6), block_shape=(5, 3), p=0.1) Xc = X - theta: SparseBlockArray = SparseBlockArray.from_ba( - app.zeros((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) - ) + print(Xc.nnz) + theta: BlockArray = app.ones((Xc.shape[1],), (Xc.block_shape[1],), dtype=Xc.dtype) X_ga: 
GraphArray = GraphArray.from_ba(Xc, cluster_state) theta_ga: GraphArray = GraphArray.from_ba(theta, cluster_state) Z_ga: GraphArray = X_ga @ theta_ga Z_ga: GraphArray = collapse_graph_array(app, Z_ga) + # one_ga: GraphArray = GraphArray.from_ba( + # BlockArray.from_scalar(1, app.km), cluster_state + # ) one_ga: GraphArray = GraphArray.from_ba( - SparseBlockArray.from_scalar(1, app.km), cluster_state + app.ones((Xc.shape[0],), (Xc.block_shape[0],), dtype=Xc.dtype), cluster_state ) mu_ga: GraphArray = collapse_graph_array(app, one_ga / (one_ga + app.exp(-Z_ga))) - mu_ba: SparseBlockArray = compute_graph_array(app, mu_ga) - print(mu_ba.todense().get()) + mu_ba: BlockArray = compute_graph_array(app, mu_ga) + print(mu_ba.get()) + assert np.allclose(1 / (1 + np.exp(-(X.to_ba().get() @ theta.get()))), mu_ba.get()) def test_sparse_bop_2(app_inst_mock_small): @@ -256,7 +259,9 @@ def compute_graph_array(app: ArrayApplication, ga) -> BlockArray: P = P_sba.to_ba().get() Q = Q_sba.to_ba().get() S = S_sba.to_ba().get() - assert np.allclose(S - P @ Q.T, X_sba.to_ba().get()) + X = X_sba.to_ba().get() + print(X) + assert np.allclose(S - P @ Q.T, X) if __name__ == "__main__": From 27b29b8799147868e0c1bdb498ed861d78ed5c7c Mon Sep 17 00:00:00 2001 From: Daniel Zou Date: Sat, 17 Sep 2022 11:52:18 -0700 Subject: [PATCH 5/5] Merge from_sparse --- nums/core/array/application.py | 6 +++++- nums/core/array/sparse.py | 4 ++-- nums/numpy/api/creation.py | 2 +- tests/core/array/test_sparse.py | 4 +--- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/nums/core/array/application.py b/nums/core/array/application.py index ce148994..9bc47aae 100644 --- a/nums/core/array/application.py +++ b/nums/core/array/application.py @@ -230,7 +230,11 @@ def loadtxt( def scalar(self, value): return BlockArray.from_scalar(value, self.km) - def array(self, array: Union[np.ndarray, sparse.COO, List[float]], block_shape: tuple = None): + def array( + self, + array: Union[np.ndarray, sparse.COO, List[float]], + block_shape: tuple = None, + ): if not isinstance(array, (np.ndarray, sparse.COO)): if array_utils.is_array_like(array): array = np.array(array) diff --git a/nums/core/array/sparse.py b/nums/core/array/sparse.py index 37939b68..118edcdc 100644 --- a/nums/core/array/sparse.py +++ b/nums/core/array/sparse.py @@ -307,10 +307,10 @@ def from_np(cls, arr, block_shape, copy, km): return rarr @classmethod - def from_sparse(cls, arr, block_shape, copy, km, fill_value=0): + def from_sparse(cls, arr, block_shape, copy, km): dtype_str = str(arr.dtype) grid = ArrayGrid(arr.shape, block_shape, dtype_str) - rarr = SparseBlockArray(grid, km, fill_value) + rarr = SparseBlockArray(grid, km) grid_entry_iterator = grid.get_entry_iterator() for grid_entry in grid_entry_iterator: grid_slice = grid.get_slice(grid_entry) diff --git a/nums/numpy/api/creation.py b/nums/numpy/api/creation.py index a76e815c..5257e38b 100644 --- a/nums/numpy/api/creation.py +++ b/nums/numpy/api/creation.py @@ -227,7 +227,7 @@ def array(object, dtype=None, copy=True, order="K", ndmin=0, subok=False) -> Blo def from_coo(a: sparse.COO): - assert(isinstance(a, sparse.COO)) + assert isinstance(a, sparse.COO) dtype = np.__getattribute__(str(a.dtype)) shape = a.shape app = _instance() diff --git a/tests/core/array/test_sparse.py b/tests/core/array/test_sparse.py index 5a554799..b03c8472 100644 --- a/tests/core/array/test_sparse.py +++ b/tests/core/array/test_sparse.py @@ -11,7 +11,6 @@ def test_sparse_init(app_inst: ArrayApplication): x_ba = app_inst.array(x1, 
block_shape=(2, 2)) x_sba = SparseBlockArray.from_ba(x_ba) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x1, y_ba.get()) @@ -19,12 +18,11 @@ def test_sparse_init(app_inst: ArrayApplication): def test_from_coo(app_inst: ArrayApplication): row_coords = [0, 1, 0, 1, 2, 2, 3, 3] col_coords = [3, 2, 2, 3, 0, 1, 0, 1] - values = [1, 1, 1, 1, 2, 2, 2, 2] + values = [1, 1, 1, 1, 2, 2, 2, 2] x_sp = sparse.COO([row_coords, col_coords], values) x_de = x_sp.todense() x_sba = app_inst.array(x_sp, block_shape=(2, 2)) assert x_sba.nnz == 8 - assert x_sba.nbytes == 8 * 4 + 2 * 8 * 8 y_ba = x_sba.to_ba() assert np.array_equal(x_de, y_ba.get())